diff --git a/OpenCL/m01500_a3.cl b/OpenCL/m01500_a3.cl index 5db332bf5..b26dc4c57 100644 --- a/OpenCL/m01500_a3.cl +++ b/OpenCL/m01500_a3.cl @@ -890,11 +890,11 @@ void s8 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, c #if defined IS_AMD || defined IS_GENERIC /* - * Bitslice DES S-boxes for x86 with MMX/SSE2/AVX and for typical RISC - * architectures. These use AND, OR, XOR, NOT, and AND-NOT gates. + * Bitslice DES S-boxes making use of a vector conditional select operation + * (e.g., vsel on PowerPC with AltiVec). * - * Gate counts: 49 44 46 33 48 46 46 41 - * Average: 44.125 + * Gate counts: 36 33 33 26 35 34 34 32 + * Average: 32.875 * * Several same-gate-count expressions for each S-box are included (for use on * different CPUs/GPUs). @@ -913,556 +913,469 @@ void s8 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, c * The effort has been sponsored by Rapid7: http://www.rapid7.com */ +#define vnot(d,a) (d) = ~(a) +#define vor(d,a,b) (d) = (a) | (b) +#define vxor(d,a,b) (d) = (a) ^ (b) +#define vsel(d,a,b,c) (d) = bitselect ((a), (b), (c)) + void s1 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x55005500, x5A0F5A0F, x3333FFFF, x66666666, x22226666, x2D2D6969, - x25202160; - u32 x00FFFF00, x33CCCC33, x4803120C, x2222FFFF, x6A21EDF3, x4A01CC93; - u32 x5555FFFF, x7F75FFFF, x00D20096, x7FA7FF69; - u32 x0A0A0000, x0AD80096, x00999900, x0AD99996; - u32 x22332233, x257AA5F0, x054885C0, xFAB77A3F, x2221EDF3, xD89697CC; - u32 x05B77AC0, x05F77AD6, x36C48529, x6391D07C, xBB0747B0; - u32 x4C460000, x4EDF9996, x2D4E49EA, xBBFFFFB0, x96B1B65A; - u32 x5AFF5AFF, x52B11215, x4201C010, x10B0D205; - u32 x00, x01, x10, x11, x20, x21, x30, x31; + u32 x0F0F3333, x3C3C3C3C, x55FF55FF, x69C369C3, x0903B73F, x09FCB7C0, x5CA9E295; + u32 x55AFD1B7, x3C3C69C3, x6993B874; + u32 x5CEDE59F, x09FCE295, x5D91A51E, x529E962D; + u32 x29EEADC0, x4B8771A3, x428679F3, x6B68D433; + u32 x5BA7E193, x026F12F3, x6B27C493, x94D83B6C; + u32 x965E0B0F, x3327A113, x847F0A1F, xD6E19C32; + u32 x0DBCE883, x3A25A215, x37994A96; + u32 xC9C93B62, x89490F02, xB96C2D16; + u32 x0, x1, x2, x3; - x55005500 = a1 & ~a5; - x5A0F5A0F = a4 ^ x55005500; - x3333FFFF = a3 | a6; - x66666666 = a1 ^ a3; - x22226666 = x3333FFFF & x66666666; - x2D2D6969 = a4 ^ x22226666; - x25202160 = x2D2D6969 & ~x5A0F5A0F; + vsel(x0F0F3333, a3, a2, a5); + vxor(x3C3C3C3C, a2, a3); + vor(x55FF55FF, a1, a4); + vxor(x69C369C3, x3C3C3C3C, x55FF55FF); + vsel(x0903B73F, a5, x0F0F3333, x69C369C3); + vxor(x09FCB7C0, a4, x0903B73F); + vxor(x5CA9E295, a1, x09FCB7C0); - x00FFFF00 = a5 ^ a6; - x33CCCC33 = a3 ^ x00FFFF00; - x4803120C = x5A0F5A0F & ~x33CCCC33; - x2222FFFF = a6 | x22226666; - x6A21EDF3 = x4803120C ^ x2222FFFF; - x4A01CC93 = x6A21EDF3 & ~x25202160; + vsel(x55AFD1B7, x5CA9E295, x55FF55FF, x0F0F3333); + vsel(x3C3C69C3, x3C3C3C3C, x69C369C3, a5); + vxor(x6993B874, x55AFD1B7, x3C3C69C3); - x5555FFFF = a1 | a6; - x7F75FFFF = x6A21EDF3 | x5555FFFF; - x00D20096 = a5 & ~x2D2D6969; - x7FA7FF69 = x7F75FFFF ^ x00D20096; + vsel(x5CEDE59F, x55FF55FF, x5CA9E295, x6993B874); + vsel(x09FCE295, x09FCB7C0, x5CA9E295, a5); + vsel(x5D91A51E, x5CEDE59F, x6993B874, x09FCE295); + vxor(x529E962D, x0F0F3333, x5D91A51E); - x0A0A0000 = a4 & ~x5555FFFF; - x0AD80096 = x00D20096 ^ x0A0A0000; - x00999900 = x00FFFF00 & ~x66666666; - x0AD99996 = x0AD80096 | x00999900; + vsel(x29EEADC0, x69C369C3, x09FCB7C0, x5CEDE59F); + vsel(x4B8771A3, x0F0F3333, x69C369C3, x5CA9E295); + vsel(x428679F3, a5, x4B8771A3, x529E962D); + vxor(x6B68D433, x29EEADC0, x428679F3); - x22332233 = a3 & ~x55005500; - x257AA5F0 = x5A0F5A0F ^ x7F75FFFF; - x054885C0 = x257AA5F0 & ~x22332233; - xFAB77A3F = ~x054885C0; - x2221EDF3 = x3333FFFF & x6A21EDF3; - xD89697CC = xFAB77A3F ^ x2221EDF3; - x20 = x7FA7FF69 & ~a2; - x21 = x20 ^ xD89697CC; - *out3 ^= x21; + vsel(x5BA7E193, x5CA9E295, x4B8771A3, a3); + vsel(x026F12F3, a4, x0F0F3333, x529E962D); + vsel(x6B27C493, x6B68D433, x5BA7E193, x026F12F3); + vnot(x94D83B6C, x6B27C493); + vsel(x0, x94D83B6C, x6B68D433, a6); + vxor(*out1, *out1, x0); - x05B77AC0 = x00FFFF00 ^ x054885C0; - x05F77AD6 = x00D20096 | x05B77AC0; - x36C48529 = x3333FFFF ^ x05F77AD6; - x6391D07C = a1 ^ x36C48529; - xBB0747B0 = xD89697CC ^ x6391D07C; - x00 = x25202160 | a2; - x01 = x00 ^ xBB0747B0; - *out1 ^= x01; + vsel(x965E0B0F, x94D83B6C, a3, x428679F3); + vsel(x3327A113, x5BA7E193, a2, x69C369C3); + vsel(x847F0A1F, x965E0B0F, a4, x3327A113); + vxor(xD6E19C32, x529E962D, x847F0A1F); + vsel(x1, xD6E19C32, x5CA9E295, a6); + vxor(*out2, *out2, x1); - x4C460000 = x3333FFFF ^ x7F75FFFF; - x4EDF9996 = x0AD99996 | x4C460000; - x2D4E49EA = x6391D07C ^ x4EDF9996; - xBBFFFFB0 = x00FFFF00 | xBB0747B0; - x96B1B65A = x2D4E49EA ^ xBBFFFFB0; - x10 = x4A01CC93 | a2; - x11 = x10 ^ x96B1B65A; - *out2 ^= x11; + vsel(x0DBCE883, x09FCE295, x3C3C69C3, x847F0A1F); + vsel(x3A25A215, x3327A113, x5CA9E295, x0903B73F); + vxor(x37994A96, x0DBCE883, x3A25A215); + vsel(x3, x37994A96, x529E962D, a6); + vxor(*out4, *out4, x3); - x5AFF5AFF = a5 | x5A0F5A0F; - x52B11215 = x5AFF5AFF & ~x2D4E49EA; - x4201C010 = x4A01CC93 & x6391D07C; - x10B0D205 = x52B11215 ^ x4201C010; - x30 = x10B0D205 | a2; - x31 = x30 ^ x0AD99996; - *out4 ^= x31; + vsel(xC9C93B62, x94D83B6C, x69C369C3, x5D91A51E); + vsel(x89490F02, a3, xC9C93B62, x965E0B0F); + vsel(xB96C2D16, x89490F02, x3C3C3C3C, x3A25A215); + vsel(x2, xB96C2D16, x6993B874, a6); + vxor(*out3, *out3, x2); } void s2 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x33CC33CC; - u32 x55550000, x00AA00FF, x33BB33FF; - u32 x33CC0000, x11441144, x11BB11BB, x003311BB; - u32 x00000F0F, x336600FF, x332200FF, x332200F0; - u32 x0302000F, xAAAAAAAA, xA9A8AAA5, x33CCCC33, x33CCC030, x9A646A95; - u32 x00333303, x118822B8, xA8208805, x3CC3C33C, x94E34B39; - u32 x0331330C, x3FF3F33C, xA9DF596A, xA9DF5F6F, x962CAC53; - u32 xA9466A6A, x3DA52153, x29850143, x33C0330C, x1A45324F; - u32 x0A451047, xBBDFDD7B, xB19ACD3C; - u32 x00, x01, x10, x11, x20, x21, x30, x31; + u32 x55553333, x0055FF33, x33270F03, x66725A56, x00FFFF00, x668DA556; + u32 x0F0F5A56, xF0F0A5A9, xA5A5969A, xA55A699A; + u32 x0F5AF03C, x6600FF56, x87A5F09C; + u32 xA55A963C, x3C69C30F, xB44BC32D; + u32 x66D7CC56, x0F4B0F2D, x699CC37B, x996C66D2; + u32 xB46C662D, x278DB412, xB66CB43B; + u32 xD2DC4E52, x27993333, xD2994E33; + u32 x278D0F2D, x2E0E547B, x09976748; + u32 x0, x1, x2, x3; - x33CC33CC = a2 ^ a5; + vsel(x55553333, a1, a3, a6); + vsel(x0055FF33, a6, x55553333, a5); + vsel(x33270F03, a3, a4, x0055FF33); + vxor(x66725A56, a1, x33270F03); + vxor(x00FFFF00, a5, a6); + vxor(x668DA556, x66725A56, x00FFFF00); - x55550000 = a1 & ~a6; - x00AA00FF = a5 & ~x55550000; - x33BB33FF = a2 | x00AA00FF; + vsel(x0F0F5A56, a4, x66725A56, a6); + vnot(xF0F0A5A9, x0F0F5A56); + vxor(xA5A5969A, x55553333, xF0F0A5A9); + vxor(xA55A699A, x00FFFF00, xA5A5969A); + vsel(x1, xA55A699A, x668DA556, a2); + vxor(*out2, *out2, x1); - x33CC0000 = x33CC33CC & ~a6; - x11441144 = a1 & x33CC33CC; - x11BB11BB = a5 ^ x11441144; - x003311BB = x11BB11BB & ~x33CC0000; + vxor(x0F5AF03C, a4, x0055FF33); + vsel(x6600FF56, x66725A56, a6, x00FFFF00); + vsel(x87A5F09C, xA5A5969A, x0F5AF03C, x6600FF56); - x00000F0F = a3 & a6; - x336600FF = x00AA00FF ^ x33CC0000; - x332200FF = x33BB33FF & x336600FF; - x332200F0 = x332200FF & ~x00000F0F; + vsel(xA55A963C, xA5A5969A, x0F5AF03C, a5); + vxor(x3C69C30F, a3, x0F5AF03C); + vsel(xB44BC32D, xA55A963C, x3C69C30F, a1); - x0302000F = a3 & x332200FF; - xAAAAAAAA = ~a1; - xA9A8AAA5 = x0302000F ^ xAAAAAAAA; - x33CCCC33 = a6 ^ x33CC33CC; - x33CCC030 = x33CCCC33 & ~x00000F0F; - x9A646A95 = xA9A8AAA5 ^ x33CCC030; - x10 = a4 & ~x332200F0; - x11 = x10 ^ x9A646A95; - *out2 ^= x11; + vsel(x66D7CC56, x66725A56, x668DA556, xA5A5969A); + vsel(x0F4B0F2D, a4, xB44BC32D, a5); + vxor(x699CC37B, x66D7CC56, x0F4B0F2D); + vxor(x996C66D2, xF0F0A5A9, x699CC37B); + vsel(x0, x996C66D2, xB44BC32D, a2); + vxor(*out1, *out1, x0); - x00333303 = a2 & ~x33CCC030; - x118822B8 = x11BB11BB ^ x00333303; - xA8208805 = xA9A8AAA5 & ~x118822B8; - x3CC3C33C = a3 ^ x33CCCC33; - x94E34B39 = xA8208805 ^ x3CC3C33C; - x00 = x33BB33FF & ~a4; - x01 = x00 ^ x94E34B39; - *out1 ^= x01; + vsel(xB46C662D, xB44BC32D, x996C66D2, x00FFFF00); + vsel(x278DB412, x668DA556, xA5A5969A, a1); + vsel(xB66CB43B, xB46C662D, x278DB412, x6600FF56); - x0331330C = x0302000F ^ x00333303; - x3FF3F33C = x3CC3C33C | x0331330C; - xA9DF596A = x33BB33FF ^ x9A646A95; - xA9DF5F6F = x00000F0F | xA9DF596A; - x962CAC53 = x3FF3F33C ^ xA9DF5F6F; + vsel(xD2DC4E52, x66D7CC56, x996C66D2, xB44BC32D); + vsel(x27993333, x278DB412, a3, x0055FF33); + vsel(xD2994E33, xD2DC4E52, x27993333, a5); + vsel(x3, x87A5F09C, xD2994E33, a2); + vxor(*out4, *out4, x3); - xA9466A6A = x332200FF ^ x9A646A95; - x3DA52153 = x94E34B39 ^ xA9466A6A; - x29850143 = xA9DF5F6F & x3DA52153; - x33C0330C = x33CC33CC & x3FF3F33C; - x1A45324F = x29850143 ^ x33C0330C; - x20 = x1A45324F | a4; - x21 = x20 ^ x962CAC53; - *out3 ^= x21; - - x0A451047 = x1A45324F & ~x118822B8; - xBBDFDD7B = x33CCCC33 | xA9DF596A; - xB19ACD3C = x0A451047 ^ xBBDFDD7B; - x30 = x003311BB | a4; - x31 = x30 ^ xB19ACD3C; - *out4 ^= x31; + vsel(x278D0F2D, x278DB412, x0F4B0F2D, a6); + vsel(x2E0E547B, x0F0F5A56, xB66CB43B, x278D0F2D); + vxor(x09976748, x27993333, x2E0E547B); + vsel(x2, xB66CB43B, x09976748, a2); + vxor(*out3, *out3, x2); } void s3 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x44444444, x0F0FF0F0, x4F4FF4F4, x00FFFF00, x00AAAA00, x4FE55EF4; - u32 x3C3CC3C3, x3C3C0000, x7373F4F4, x0C840A00; - u32 x00005EF4, x00FF5EFF, x00555455, x3C699796; - u32 x000FF000, x55AA55AA, x26D9A15E, x2FDFAF5F, x2FD00F5F; - u32 x55AAFFAA, x28410014, x000000FF, x000000CC, x284100D8; - u32 x204100D0, x3C3CC3FF, x1C3CC32F, x4969967A; - u32 x4CC44CC4, x40C040C0, xC3C33C3C, x9669C396, xD6A98356; - u32 xD6E9C3D6, x4CEEEEC4, x9A072D12, x001A000B, x9A1F2D1B; - u32 x00, x01, x10, x11, x20, x21, x30, x31; + u32 x0F330F33, x0F33F0CC, x5A66A599; + u32 x2111B7BB, x03FF3033, x05BB50EE, x074F201F, x265E97A4; + u32 x556BA09E, x665A93AC, x99A56C53; + u32 x25A1A797, x5713754C, x66559355, x47B135C6; + u32 x9A5A5C60, xD07AF8F8, x87698DB4, xE13C1EE1; + u32 x000CFFCF, x9A485CCE, x0521DDF4, x9E49915E; + u32 xD069F8B4, x030FF0C3, xD2699876; + u32 xD579DDF4, xD579F0C3, xB32C6396; + u32 x0, x1, x2, x3; - x44444444 = a1 & ~a2; - x0F0FF0F0 = a3 ^ a6; - x4F4FF4F4 = x44444444 | x0F0FF0F0; - x00FFFF00 = a4 ^ a6; - x00AAAA00 = x00FFFF00 & ~a1; - x4FE55EF4 = x4F4FF4F4 ^ x00AAAA00; + vsel(x0F330F33, a4, a3, a5); + vxor(x0F33F0CC, a6, x0F330F33); + vxor(x5A66A599, a2, x0F33F0CC); - x3C3CC3C3 = a2 ^ x0F0FF0F0; - x3C3C0000 = x3C3CC3C3 & ~a6; - x7373F4F4 = x4F4FF4F4 ^ x3C3C0000; - x0C840A00 = x4FE55EF4 & ~x7373F4F4; + vsel(x2111B7BB, a3, a6, x5A66A599); + vsel(x03FF3033, a5, a3, x0F33F0CC); + vsel(x05BB50EE, a5, x0F33F0CC, a2); + vsel(x074F201F, x03FF3033, a4, x05BB50EE); + vxor(x265E97A4, x2111B7BB, x074F201F); - x00005EF4 = a6 & x4FE55EF4; - x00FF5EFF = a4 | x00005EF4; - x00555455 = a1 & x00FF5EFF; - x3C699796 = x3C3CC3C3 ^ x00555455; - x30 = x4FE55EF4 & ~a5; - x31 = x30 ^ x3C699796; - *out4 ^= x31; + vsel(x556BA09E, x5A66A599, x05BB50EE, a4); + vsel(x665A93AC, x556BA09E, x265E97A4, a3); + vnot(x99A56C53, x665A93AC); + vsel(x1, x265E97A4, x99A56C53, a1); + vxor(*out2, *out2, x1); - x000FF000 = x0F0FF0F0 & x00FFFF00; - x55AA55AA = a1 ^ a4; - x26D9A15E = x7373F4F4 ^ x55AA55AA; - x2FDFAF5F = a3 | x26D9A15E; - x2FD00F5F = x2FDFAF5F & ~x000FF000; + vxor(x25A1A797, x03FF3033, x265E97A4); + vsel(x5713754C, a2, x0F33F0CC, x074F201F); + vsel(x66559355, x665A93AC, a2, a5); + vsel(x47B135C6, x25A1A797, x5713754C, x66559355); - x55AAFFAA = x00AAAA00 | x55AA55AA; - x28410014 = x3C699796 & ~x55AAFFAA; - x000000FF = a4 & a6; - x000000CC = x000000FF & ~a2; - x284100D8 = x28410014 ^ x000000CC; + vxor(x9A5A5C60, x03FF3033, x99A56C53); + vsel(xD07AF8F8, x9A5A5C60, x556BA09E, x5A66A599); + vxor(x87698DB4, x5713754C, xD07AF8F8); + vxor(xE13C1EE1, x66559355, x87698DB4); - x204100D0 = x7373F4F4 & x284100D8; - x3C3CC3FF = x3C3CC3C3 | x000000FF; - x1C3CC32F = x3C3CC3FF & ~x204100D0; - x4969967A = a1 ^ x1C3CC32F; - x10 = x2FD00F5F & a5; - x11 = x10 ^ x4969967A; - *out2 ^= x11; + vsel(x000CFFCF, a4, a6, x0F33F0CC); + vsel(x9A485CCE, x9A5A5C60, x000CFFCF, x05BB50EE); + vsel(x0521DDF4, x87698DB4, a6, x9A5A5C60); + vsel(x9E49915E, x9A485CCE, x66559355, x0521DDF4); + vsel(x0, x9E49915E, xE13C1EE1, a1); + vxor(*out1, *out1, x0); - x4CC44CC4 = x4FE55EF4 & ~a2; - x40C040C0 = x4CC44CC4 & ~a3; - xC3C33C3C = ~x3C3CC3C3; - x9669C396 = x55AAFFAA ^ xC3C33C3C; - xD6A98356 = x40C040C0 ^ x9669C396; - x00 = a5 & ~x0C840A00; - x01 = x00 ^ xD6A98356; - *out1 ^= x01; + vsel(xD069F8B4, xD07AF8F8, x87698DB4, a5); + vsel(x030FF0C3, x000CFFCF, x03FF3033, a4); + vsel(xD2699876, xD069F8B4, x9E49915E, x030FF0C3); + vsel(x3, x5A66A599, xD2699876, a1); + vxor(*out4, *out4, x3); - xD6E9C3D6 = x40C040C0 | x9669C396; - x4CEEEEC4 = x00AAAA00 | x4CC44CC4; - x9A072D12 = xD6E9C3D6 ^ x4CEEEEC4; - x001A000B = a4 & ~x4FE55EF4; - x9A1F2D1B = x9A072D12 | x001A000B; - x20 = a5 & ~x284100D8; - x21 = x20 ^ x9A1F2D1B; - *out3 ^= x21; + vsel(xD579DDF4, xD07AF8F8, a2, x5713754C); + vsel(xD579F0C3, xD579DDF4, x030FF0C3, a6); + vxor(xB32C6396, x66559355, xD579F0C3); + vsel(x2, xB32C6396, x47B135C6, a1); + vxor(*out3, *out3, x2); } void s4 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x5A5A5A5A, x0F0FF0F0; - u32 x33FF33FF, x33FFCC00, x0C0030F0, x0C0CC0C0, x0CF3C03F, x5EFBDA7F, - x52FBCA0F, x61C8F93C; - u32 x00C0C03C, x0F0F30C0, x3B92A366, x30908326, x3C90B3D6; - u32 x33CC33CC, x0C0CFFFF, x379E5C99, x04124C11, x56E9861E, xA91679E1; - u32 x9586CA37, x8402C833, x84C2C83F, xB35C94A6; - u32 x00, x01, x10, x11, x20, x21, x30, x31; + u32 x0505AFAF, x0555AF55, x0A5AA05A, x46566456, x0A0A5F5F, x0AF55FA0, + x0AF50F0F, x4CA36B59; + u32 xB35C94A6; + u32 x01BB23BB, x5050FAFA, xA31C26BE, xA91679E1; + u32 x56E9861E; + u32 x50E9FA1E, x0AF55F00, x827D9784, xD2946D9A; + u32 x31F720B3, x11FB21B3, x4712A7AD, x9586CA37; + u32 x0, x1, x2, x3; - x5A5A5A5A = a1 ^ a3; - x0F0FF0F0 = a3 ^ a5; - x33FF33FF = a2 | a4; - x33FFCC00 = a5 ^ x33FF33FF; - x0C0030F0 = x0F0FF0F0 & ~x33FFCC00; - x0C0CC0C0 = x0F0FF0F0 & ~a2; - x0CF3C03F = a4 ^ x0C0CC0C0; - x5EFBDA7F = x5A5A5A5A | x0CF3C03F; - x52FBCA0F = x5EFBDA7F & ~x0C0030F0; - x61C8F93C = a2 ^ x52FBCA0F; + vsel(x0505AFAF, a5, a3, a1); + vsel(x0555AF55, x0505AFAF, a1, a4); + vxor(x0A5AA05A, a3, x0555AF55); + vsel(x46566456, a1, x0A5AA05A, a2); + vsel(x0A0A5F5F, a3, a5, a1); + vxor(x0AF55FA0, a4, x0A0A5F5F); + vsel(x0AF50F0F, x0AF55FA0, a3, a5); + vxor(x4CA36B59, x46566456, x0AF50F0F); - x00C0C03C = x0CF3C03F & x61C8F93C; - x0F0F30C0 = x0F0FF0F0 & ~x00C0C03C; - x3B92A366 = x5A5A5A5A ^ x61C8F93C; - x30908326 = x3B92A366 & ~x0F0F30C0; - x3C90B3D6 = x0C0030F0 ^ x30908326; + vnot(xB35C94A6, x4CA36B59); - x33CC33CC = a2 ^ a4; - x0C0CFFFF = a5 | x0C0CC0C0; - x379E5C99 = x3B92A366 ^ x0C0CFFFF; - x04124C11 = x379E5C99 & ~x33CC33CC; - x56E9861E = x52FBCA0F ^ x04124C11; - x00 = a6 & ~x3C90B3D6; - x01 = x00 ^ x56E9861E; - *out1 ^= x01; + vsel(x01BB23BB, a4, a2, x0555AF55); + vxor(x5050FAFA, a1, x0505AFAF); + vsel(xA31C26BE, xB35C94A6, x01BB23BB, x5050FAFA); + vxor(xA91679E1, x0A0A5F5F, xA31C26BE); - xA91679E1 = ~x56E9861E; - x10 = x3C90B3D6 & ~a6; - x11 = x10 ^ xA91679E1; - *out2 ^= x11; + vnot(x56E9861E, xA91679E1); - x9586CA37 = x3C90B3D6 ^ xA91679E1; - x8402C833 = x9586CA37 & ~x33CC33CC; - x84C2C83F = x00C0C03C | x8402C833; - xB35C94A6 = x379E5C99 ^ x84C2C83F; - x20 = x61C8F93C | a6; - x21 = x20 ^ xB35C94A6; - *out3 ^= x21; + vsel(x50E9FA1E, x5050FAFA, x56E9861E, a4); + vsel(x0AF55F00, x0AF50F0F, x0AF55FA0, x0A0A5F5F); + vsel(x827D9784, xB35C94A6, x0AF55F00, a2); + vxor(xD2946D9A, x50E9FA1E, x827D9784); + vsel(x2, xD2946D9A, x4CA36B59, a6); + vxor(*out3, *out3, x2); + vsel(x3, xB35C94A6, xD2946D9A, a6); + vxor(*out4, *out4, x3); - x30 = a6 & x61C8F93C; - x31 = x30 ^ xB35C94A6; - *out4 ^= x31; + vsel(x31F720B3, a2, a4, x0AF55FA0); + vsel(x11FB21B3, x01BB23BB, x31F720B3, x5050FAFA); + vxor(x4712A7AD, x56E9861E, x11FB21B3); + vxor(x9586CA37, xD2946D9A, x4712A7AD); + vsel(x0, x56E9861E, x9586CA37, a6); + vxor(*out1, *out1, x0); + vsel(x1, x9586CA37, xA91679E1, a6); + vxor(*out2, *out2, x1); } void s5 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x77777777, x77770000, x22225555, x11116666, x1F1F6F6F; - u32 x70700000, x43433333, x00430033, x55557777, x55167744, x5A19784B; - u32 x5A1987B4, x7A3BD7F5, x003B00F5, x221955A0, x05050707, x271C52A7; - u32 x2A2A82A0, x6969B193, x1FE06F90, x16804E00, xE97FB1FF; - u32 x43403302, x35CAED30, x37DEFFB7, x349ECCB5, x0B01234A; - u32 x101884B4, x0FF8EB24, x41413333, x4FF9FB37, x4FC2FBC2; - u32 x22222222, x16BCEE97, x0F080B04, x19B4E593; - u32 x5C5C5C5C, x4448184C, x2DDABE71, x6992A63D; - u32 x00, x01, x10, x11, x20, x21, x30, x31; + u32 x550F550F, xAAF0AAF0, xA5F5A5F5, x96C696C6, x00FFFF00, x963969C6; + u32 x2E3C2E3C, xB73121F7, x1501DF0F, x00558A5F, x2E69A463; + u32 x0679ED42, x045157FD, xB32077FF, x9D49D39C; + u32 xAC81CFB2, xF72577AF, x5BA4B81D; + u32 x5BA477AF, x4895469F, x3A35273A, x1A35669A; + u32 x12E6283D, x9E47D3D4, x1A676AB4; + u32 x891556DF, xE5E77F82, x6CF2295D; + u32 x2E3CA5F5, x9697C1C6, x369CC1D6; + u32 x0, x1, x2, x3; - x77777777 = a1 | a3; - x77770000 = x77777777 & ~a6; - x22225555 = a1 ^ x77770000; - x11116666 = a3 ^ x22225555; - x1F1F6F6F = a4 | x11116666; + vsel(x550F550F, a1, a3, a5); + vnot(xAAF0AAF0, x550F550F); + vsel(xA5F5A5F5, xAAF0AAF0, a1, a3); + vxor(x96C696C6, a2, xA5F5A5F5); + vxor(x00FFFF00, a5, a6); + vxor(x963969C6, x96C696C6, x00FFFF00); - x70700000 = x77770000 & ~a4; - x43433333 = a3 ^ x70700000; - x00430033 = a5 & x43433333; - x55557777 = a1 | x11116666; - x55167744 = x00430033 ^ x55557777; - x5A19784B = a4 ^ x55167744; + vsel(x2E3C2E3C, a3, xAAF0AAF0, a2); + vsel(xB73121F7, a2, x963969C6, x96C696C6); + vsel(x1501DF0F, a6, x550F550F, xB73121F7); + vsel(x00558A5F, x1501DF0F, a5, a1); + vxor(x2E69A463, x2E3C2E3C, x00558A5F); - x5A1987B4 = a6 ^ x5A19784B; - x7A3BD7F5 = x22225555 | x5A1987B4; - x003B00F5 = a5 & x7A3BD7F5; - x221955A0 = x22225555 ^ x003B00F5; - x05050707 = a4 & x55557777; - x271C52A7 = x221955A0 ^ x05050707; + vsel(x0679ED42, x00FFFF00, x2E69A463, x96C696C6); + vsel(x045157FD, a6, a1, x0679ED42); + vsel(xB32077FF, xB73121F7, a6, x045157FD); + vxor(x9D49D39C, x2E69A463, xB32077FF); + vsel(x2, x9D49D39C, x2E69A463, a4); + vxor(*out3, *out3, x2); - x2A2A82A0 = x7A3BD7F5 & ~a1; - x6969B193 = x43433333 ^ x2A2A82A0; - x1FE06F90 = a5 ^ x1F1F6F6F; - x16804E00 = x1FE06F90 & ~x6969B193; - xE97FB1FF = ~x16804E00; - x20 = xE97FB1FF & ~a2; - x21 = x20 ^ x5A19784B; - *out3 ^= x21; + vsel(xAC81CFB2, xAAF0AAF0, x1501DF0F, x0679ED42); + vsel(xF72577AF, xB32077FF, x550F550F, a1); + vxor(x5BA4B81D, xAC81CFB2, xF72577AF); + vsel(x1, x5BA4B81D, x963969C6, a4); + vxor(*out2, *out2, x1); - x43403302 = x43433333 & ~x003B00F5; - x35CAED30 = x2A2A82A0 ^ x1FE06F90; - x37DEFFB7 = x271C52A7 | x35CAED30; - x349ECCB5 = x37DEFFB7 & ~x43403302; - x0B01234A = x1F1F6F6F & ~x349ECCB5; + vsel(x5BA477AF, x5BA4B81D, xF72577AF, a6); + vsel(x4895469F, x5BA477AF, x00558A5F, a2); + vsel(x3A35273A, x2E3C2E3C, a2, x963969C6); + vsel(x1A35669A, x4895469F, x3A35273A, x5BA4B81D); - x101884B4 = x5A1987B4 & x349ECCB5; - x0FF8EB24 = x1FE06F90 ^ x101884B4; - x41413333 = x43433333 & x55557777; - x4FF9FB37 = x0FF8EB24 | x41413333; - x4FC2FBC2 = x003B00F5 ^ x4FF9FB37; - x30 = x4FC2FBC2 & a2; - x31 = x30 ^ x271C52A7; - *out4 ^= x31; + vsel(x12E6283D, a5, x5BA4B81D, x963969C6); + vsel(x9E47D3D4, x96C696C6, x9D49D39C, xAC81CFB2); + vsel(x1A676AB4, x12E6283D, x9E47D3D4, x4895469F); - x22222222 = a1 ^ x77777777; - x16BCEE97 = x349ECCB5 ^ x22222222; - x0F080B04 = a4 & x0FF8EB24; - x19B4E593 = x16BCEE97 ^ x0F080B04; - x00 = x0B01234A | a2; - x01 = x00 ^ x19B4E593; - *out1 ^= x01; + vsel(x891556DF, xB32077FF, x4895469F, x3A35273A); + vsel(xE5E77F82, xF72577AF, x00FFFF00, x12E6283D); + vxor(x6CF2295D, x891556DF, xE5E77F82); + vsel(x3, x1A35669A, x6CF2295D, a4); + vxor(*out4, *out4, x3); - x5C5C5C5C = x1F1F6F6F ^ x43433333; - x4448184C = x5C5C5C5C & ~x19B4E593; - x2DDABE71 = x22225555 ^ x0FF8EB24; - x6992A63D = x4448184C ^ x2DDABE71; - x10 = x1F1F6F6F & a2; - x11 = x10 ^ x6992A63D; - *out2 ^= x11; + vsel(x2E3CA5F5, x2E3C2E3C, xA5F5A5F5, a6); + vsel(x9697C1C6, x96C696C6, x963969C6, x045157FD); + vsel(x369CC1D6, x2E3CA5F5, x9697C1C6, x5BA477AF); + vsel(x0, x369CC1D6, x1A676AB4, a4); + vxor(*out1, *out1, x0); } void s6 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x33CC33CC; - u32 x3333FFFF, x11115555, x22DD6699, x22DD9966, x00220099; - u32 x00551144, x33662277, x5A5A5A5A, x7B7E7A7F, x59A31CE6; - u32 x09030C06, x09030000, x336622FF, x3A6522FF; - u32 x484D494C, x0000B6B3, x0F0FB9BC, x00FC00F9, x0FFFB9FD; - u32 x5DF75DF7, x116600F7, x1E69B94B, x1668B94B; - u32 x7B7B7B7B, x411E5984, x1FFFFDFD, x5EE1A479; - u32 x3CB4DFD2, x004B002D, xB7B2B6B3, xCCC9CDC8, xCC82CDE5; - u32 x0055EEBB, x5A5AECE9, x0050ECA9, xC5CAC1CE, xC59A2D67; - u32 x00, x01, x10, x11, x20, x21, x30, x31; + u32 x555500FF, x666633CC, x606F30CF, x353A659A, x353A9A65, xCAC5659A; + u32 x353A6565, x0A3F0A6F, x6C5939A3, x5963A3C6; + u32 x35FF659A, x3AF06A95, x05CF0A9F, x16E94A97; + u32 x86CD4C9B, x12E0FFFD, x942D9A67; + u32 x142956AB, x455D45DF, x1C3EE619; + u32 x2AEA70D5, x20CF7A9F, x3CF19C86, x69A49C79; + u32 x840DBB67, x6DA19C1E, x925E63E1; + u32 x9C3CA761, x257A75D5, xB946D2B4; + u32 x0, x1, x2, x3; - x33CC33CC = a2 ^ a5; + vsel(x555500FF, a1, a4, a5); + vxor(x666633CC, a2, x555500FF); + vsel(x606F30CF, x666633CC, a4, a3); + vxor(x353A659A, a1, x606F30CF); + vxor(x353A9A65, a5, x353A659A); + vnot(xCAC5659A, x353A9A65); - x3333FFFF = a2 | a6; - x11115555 = a1 & x3333FFFF; - x22DD6699 = x33CC33CC ^ x11115555; - x22DD9966 = a6 ^ x22DD6699; - x00220099 = a5 & ~x22DD9966; + vsel(x353A6565, x353A659A, x353A9A65, a4); + vsel(x0A3F0A6F, a3, a4, x353A6565); + vxor(x6C5939A3, x666633CC, x0A3F0A6F); + vxor(x5963A3C6, x353A9A65, x6C5939A3); - x00551144 = a1 & x22DD9966; - x33662277 = a2 ^ x00551144; - x5A5A5A5A = a1 ^ a3; - x7B7E7A7F = x33662277 | x5A5A5A5A; - x59A31CE6 = x22DD6699 ^ x7B7E7A7F; + vsel(x35FF659A, a4, x353A659A, x353A6565); + vxor(x3AF06A95, a3, x35FF659A); + vsel(x05CF0A9F, a4, a3, x353A9A65); + vsel(x16E94A97, x3AF06A95, x05CF0A9F, x6C5939A3); - x09030C06 = a3 & x59A31CE6; - x09030000 = x09030C06 & ~a6; - x336622FF = x00220099 | x33662277; - x3A6522FF = x09030000 ^ x336622FF; - x30 = x3A6522FF & a4; - x31 = x30 ^ x59A31CE6; - *out4 ^= x31; + vsel(x86CD4C9B, xCAC5659A, x05CF0A9F, x6C5939A3); + vsel(x12E0FFFD, a5, x3AF06A95, x16E94A97); + vsel(x942D9A67, x86CD4C9B, x353A9A65, x12E0FFFD); + vsel(x0, xCAC5659A, x942D9A67, a6); + vxor(*out1, *out1, x0); - x484D494C = a2 ^ x7B7E7A7F; - x0000B6B3 = a6 & ~x484D494C; - x0F0FB9BC = a3 ^ x0000B6B3; - x00FC00F9 = a5 & ~x09030C06; - x0FFFB9FD = x0F0FB9BC | x00FC00F9; + vsel(x142956AB, x353A659A, x942D9A67, a2); + vsel(x455D45DF, a1, x86CD4C9B, x142956AB); + vxor(x1C3EE619, x5963A3C6, x455D45DF); + vsel(x3, x5963A3C6, x1C3EE619, a6); + vxor(*out4, *out4, x3); - x5DF75DF7 = a1 | x59A31CE6; - x116600F7 = x336622FF & x5DF75DF7; - x1E69B94B = x0F0FB9BC ^ x116600F7; - x1668B94B = x1E69B94B & ~x09030000; - x20 = x00220099 | a4; - x21 = x20 ^ x1668B94B; - *out3 ^= x21; + vsel(x2AEA70D5, x3AF06A95, x606F30CF, x353A9A65); + vsel(x20CF7A9F, x2AEA70D5, x05CF0A9F, x0A3F0A6F); + vxor(x3CF19C86, x1C3EE619, x20CF7A9F); + vxor(x69A49C79, x555500FF, x3CF19C86); - x7B7B7B7B = a2 | x5A5A5A5A; - x411E5984 = x3A6522FF ^ x7B7B7B7B; - x1FFFFDFD = x11115555 | x0FFFB9FD; - x5EE1A479 = x411E5984 ^ x1FFFFDFD; + vsel(x840DBB67, a5, x942D9A67, x86CD4C9B); + vsel(x6DA19C1E, x69A49C79, x3CF19C86, x840DBB67); + vnot(x925E63E1, x6DA19C1E); + vsel(x1, x925E63E1, x69A49C79, a6); + vxor(*out2, *out2, x1); - x3CB4DFD2 = x22DD6699 ^ x1E69B94B; - x004B002D = a5 & ~x3CB4DFD2; - xB7B2B6B3 = ~x484D494C; - xCCC9CDC8 = x7B7B7B7B ^ xB7B2B6B3; - xCC82CDE5 = x004B002D ^ xCCC9CDC8; - x10 = xCC82CDE5 & ~a4; - x11 = x10 ^ x5EE1A479; - *out2 ^= x11; - - x0055EEBB = a6 ^ x00551144; - x5A5AECE9 = a1 ^ x0F0FB9BC; - x0050ECA9 = x0055EEBB & x5A5AECE9; - xC5CAC1CE = x09030C06 ^ xCCC9CDC8; - xC59A2D67 = x0050ECA9 ^ xC5CAC1CE; - x00 = x0FFFB9FD & ~a4; - x01 = x00 ^ xC59A2D67; - *out1 ^= x01; + vsel(x9C3CA761, x840DBB67, x1C3EE619, x3CF19C86); + vsel(x257A75D5, x455D45DF, x2AEA70D5, x606F30CF); + vxor(xB946D2B4, x9C3CA761, x257A75D5); + vsel(x2, x16E94A97, xB946D2B4, a6); + vxor(*out3, *out3, x2); } void s7 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x0FF00FF0, x3CC33CC3, x00003CC3, x0F000F00, x5A555A55, x00001841; - u32 x00000F00, x33333C33, x7B777E77, x0FF0F00F, x74878E78; - u32 x003C003C, x5A7D5A7D, x333300F0, x694E5A8D; - u32 x0FF0CCCC, x000F0303, x5A505854, x33CC000F, x699C585B; - u32 x7F878F78, x21101013, x7F979F7B, x30030CC0, x4F9493BB; - u32 x6F9CDBFB, x0000DBFB, x00005151, x26DAC936, x26DA9867; - u32 x27DA9877, x27DA438C, x2625C9C9, x27FFCBCD; - u32 x27FF1036, x27FF103E, xB06B6C44, x97947C7A; - u32 x00, x01, x10, x11, x20, x21, x30, x31; + u32 x44447777, x4B4B7878, x22772277, x0505F5F5, x220522F5, x694E5A8D; + u32 x00FFFF00, x66666666, x32353235, x26253636, x26DAC936; + u32 x738F9C63, x11EF9867, x26DA9867; + u32 x4B4B9C63, x4B666663, x4E639396; + u32 x4E4B393C, xFF00FF00, xFF05DD21, xB14EE41D; + u32 xD728827B, x6698807B, x699C585B; + u32 x778A8877, xA4A71E18, x74878E78; + u32 x204A5845, x74879639, x8B7869C6; + u32 x0, x1, x2, x3; - x0FF00FF0 = a4 ^ a5; - x3CC33CC3 = a3 ^ x0FF00FF0; - x00003CC3 = a6 & x3CC33CC3; - x0F000F00 = a4 & x0FF00FF0; - x5A555A55 = a2 ^ x0F000F00; - x00001841 = x00003CC3 & x5A555A55; + vsel(x44447777, a2, a6, a3); + vxor(x4B4B7878, a4, x44447777); + vsel(x22772277, a3, a5, a2); + vsel(x0505F5F5, a6, a2, a4); + vsel(x220522F5, x22772277, x0505F5F5, a5); + vxor(x694E5A8D, x4B4B7878, x220522F5); - x00000F00 = a6 & x0F000F00; - x33333C33 = a3 ^ x00000F00; - x7B777E77 = x5A555A55 | x33333C33; - x0FF0F00F = a6 ^ x0FF00FF0; - x74878E78 = x7B777E77 ^ x0FF0F00F; - x30 = a1 & ~x00001841; - x31 = x30 ^ x74878E78; - *out4 ^= x31; + vxor(x00FFFF00, a5, a6); + vxor(x66666666, a2, a3); + vsel(x32353235, a3, x220522F5, a4); + vsel(x26253636, x66666666, x32353235, x4B4B7878); + vxor(x26DAC936, x00FFFF00, x26253636); + vsel(x0, x26DAC936, x694E5A8D, a1); + vxor(*out1, *out1, x0); - x003C003C = a5 & ~x3CC33CC3; - x5A7D5A7D = x5A555A55 | x003C003C; - x333300F0 = x00003CC3 ^ x33333C33; - x694E5A8D = x5A7D5A7D ^ x333300F0; + vxor(x738F9C63, a2, x26DAC936); + vsel(x11EF9867, x738F9C63, a5, x66666666); + vsel(x26DA9867, x26DAC936, x11EF9867, a6); - x0FF0CCCC = x00003CC3 ^ x0FF0F00F; - x000F0303 = a4 & ~x0FF0CCCC; - x5A505854 = x5A555A55 & ~x000F0303; - x33CC000F = a5 ^ x333300F0; - x699C585B = x5A505854 ^ x33CC000F; + vsel(x4B4B9C63, x4B4B7878, x738F9C63, a6); + vsel(x4B666663, x4B4B9C63, x66666666, x00FFFF00); + vxor(x4E639396, x0505F5F5, x4B666663); - x7F878F78 = x0F000F00 | x74878E78; - x21101013 = a3 & x699C585B; - x7F979F7B = x7F878F78 | x21101013; - x30030CC0 = x3CC33CC3 & ~x0FF0F00F; - x4F9493BB = x7F979F7B ^ x30030CC0; - x00 = x4F9493BB & ~a1; - x01 = x00 ^ x694E5A8D; - *out1 ^= x01; + vsel(x4E4B393C, x4B4B7878, x4E639396, a2); + vnot(xFF00FF00, a5); + vsel(xFF05DD21, xFF00FF00, x738F9C63, x32353235); + vxor(xB14EE41D, x4E4B393C, xFF05DD21); + vsel(x1, xB14EE41D, x26DA9867, a1); + vxor(*out2, *out2, x1); - x6F9CDBFB = x699C585B | x4F9493BB; - x0000DBFB = a6 & x6F9CDBFB; - x00005151 = a2 & x0000DBFB; - x26DAC936 = x694E5A8D ^ x4F9493BB; - x26DA9867 = x00005151 ^ x26DAC936; + vxor(xD728827B, x66666666, xB14EE41D); + vsel(x6698807B, x26DA9867, xD728827B, x4E4B393C); + vsel(x699C585B, x6698807B, x694E5A8D, xFF05DD21); + vsel(x2, x699C585B, x4E639396, a1); + vxor(*out3, *out3, x2); - x27DA9877 = x21101013 | x26DA9867; - x27DA438C = x0000DBFB ^ x27DA9877; - x2625C9C9 = a5 ^ x26DAC936; - x27FFCBCD = x27DA438C | x2625C9C9; - x20 = x27FFCBCD & a1; - x21 = x20 ^ x699C585B; - *out3 ^= x21; + vsel(x778A8877, x738F9C63, x26DAC936, x26253636); + vxor(xA4A71E18, x738F9C63, xD728827B); + vsel(x74878E78, x778A8877, xA4A71E18, a4); - x27FF1036 = x0000DBFB ^ x27FFCBCD; - x27FF103E = x003C003C | x27FF1036; - xB06B6C44 = ~x4F9493BB; - x97947C7A = x27FF103E ^ xB06B6C44; - x10 = x97947C7A & ~a1; - x11 = x10 ^ x26DA9867; - *out2 ^= x11; + vsel(x204A5845, x26DA9867, x694E5A8D, x26DAC936); + vsel(x74879639, x74878E78, a3, x204A5845); + vnot(x8B7869C6, x74879639); + vsel(x3, x74878E78, x8B7869C6, a1); + vxor(*out4, *out4, x3); } void s8 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x0C0C0C0C, x0000F0F0, x00FFF00F, x00555005, x00515001; - u32 x33000330, x77555775, x30303030, x3030CFCF, x30104745, x30555745; - u32 xFF000FF0, xCF1048B5, x080A080A, xC71A40BF, xCB164CB3; - u32 x9E4319E6, x000019E6, xF429738C, xF4296A6A, xC729695A; - u32 xC47C3D2F, xF77F3F3F, x9E43E619, x693CD926; - u32 xF719A695, xF4FF73FF, x03E6D56A, x56B3803F; - u32 xF700A600, x61008000, x03B7856B, x62B7056B; - u32 x00, x01, x10, x11, x20, x21, x30, x31; + u32 x0505F5F5, x05FAF50A, x0F0F00FF, x22227777, x07DA807F, x34E9B34C; + u32 x00FFF00F, x0033FCCF, x5565B15C, x0C0C3F3F, x59698E63; + u32 x3001F74E, x30555745, x693CD926; + u32 x0C0CD926, x0C3F25E9, x38D696A5; + u32 xC729695A; + u32 x03D2117B, xC778395B, xCB471CB2; + u32 x5425B13F, x56B3803F, x919AE965; + u32 x17B3023F, x75555755, x62E6556A, xA59E6C31; + u32 x0, x1, x2, x3; - x0C0C0C0C = a3 & ~a2; - x0000F0F0 = a5 & ~a3; - x00FFF00F = a4 ^ x0000F0F0; - x00555005 = a1 & x00FFF00F; - x00515001 = x00555005 & ~x0C0C0C0C; + vsel(x0505F5F5, a5, a1, a3); + vxor(x05FAF50A, a4, x0505F5F5); + vsel(x0F0F00FF, a3, a4, a5); + vsel(x22227777, a2, a5, a1); + vsel(x07DA807F, x05FAF50A, x0F0F00FF, x22227777); + vxor(x34E9B34C, a2, x07DA807F); - x33000330 = a2 & ~x00FFF00F; - x77555775 = a1 | x33000330; - x30303030 = a2 & ~a3; - x3030CFCF = a5 ^ x30303030; - x30104745 = x77555775 & x3030CFCF; - x30555745 = x00555005 | x30104745; + vsel(x00FFF00F, x05FAF50A, a4, a3); + vsel(x0033FCCF, a5, x00FFF00F, a2); + vsel(x5565B15C, a1, x34E9B34C, x0033FCCF); + vsel(x0C0C3F3F, a3, a5, a2); + vxor(x59698E63, x5565B15C, x0C0C3F3F); - xFF000FF0 = ~x00FFF00F; - xCF1048B5 = x30104745 ^ xFF000FF0; - x080A080A = a3 & ~x77555775; - xC71A40BF = xCF1048B5 ^ x080A080A; - xCB164CB3 = x0C0C0C0C ^ xC71A40BF; - x10 = x00515001 | a6; - x11 = x10 ^ xCB164CB3; - *out2 ^= x11; + vsel(x3001F74E, x34E9B34C, a5, x05FAF50A); + vsel(x30555745, x3001F74E, a1, x00FFF00F); + vxor(x693CD926, x59698E63, x30555745); + vsel(x2, x693CD926, x59698E63, a6); + vxor(*out3, *out3, x2); - x9E4319E6 = a1 ^ xCB164CB3; - x000019E6 = a5 & x9E4319E6; - xF429738C = a2 ^ xC71A40BF; - xF4296A6A = x000019E6 ^ xF429738C; - xC729695A = x33000330 ^ xF4296A6A; + vsel(x0C0CD926, x0C0C3F3F, x693CD926, a5); + vxor(x0C3F25E9, x0033FCCF, x0C0CD926); + vxor(x38D696A5, x34E9B34C, x0C3F25E9); - xC47C3D2F = x30555745 ^ xF4296A6A; - xF77F3F3F = a2 | xC47C3D2F; - x9E43E619 = a5 ^ x9E4319E6; - x693CD926 = xF77F3F3F ^ x9E43E619; - x20 = x30555745 & a6; - x21 = x20 ^ x693CD926; - *out3 ^= x21; + vnot(xC729695A, x38D696A5); - xF719A695 = x3030CFCF ^ xC729695A; - xF4FF73FF = a4 | xF429738C; - x03E6D56A = xF719A695 ^ xF4FF73FF; - x56B3803F = a1 ^ x03E6D56A; - x30 = x56B3803F & a6; - x31 = x30 ^ xC729695A; - *out4 ^= x31; + vsel(x03D2117B, x07DA807F, a2, x0C0CD926); + vsel(xC778395B, xC729695A, x03D2117B, x30555745); + vxor(xCB471CB2, x0C3F25E9, xC778395B); + vsel(x1, xCB471CB2, x34E9B34C, a6); + vxor(*out2, *out2, x1); - xF700A600 = xF719A695 & ~a4; - x61008000 = x693CD926 & xF700A600; - x03B7856B = x00515001 ^ x03E6D56A; - x62B7056B = x61008000 ^ x03B7856B; - x00 = x62B7056B | a6; - x01 = x00 ^ xC729695A; - *out1 ^= x01; + vsel(x5425B13F, x5565B15C, x0C0C3F3F, x03D2117B); + vsel(x56B3803F, x07DA807F, x5425B13F, x59698E63); + vxor(x919AE965, xC729695A, x56B3803F); + vsel(x3, xC729695A, x919AE965, a6); + vxor(*out4, *out4, x3); + + vsel(x17B3023F, x07DA807F, a2, x59698E63); + vor(x75555755, a1, x30555745); + vxor(x62E6556A, x17B3023F, x75555755); + vxor(xA59E6C31, xC778395B, x62E6556A); + vsel(x0, xA59E6C31, x38D696A5, a6); + vxor(*out1, *out1, x0); } #endif diff --git a/OpenCL/m03000_a3.cl b/OpenCL/m03000_a3.cl index 920a23c00..3e01a09ba 100644 --- a/OpenCL/m03000_a3.cl +++ b/OpenCL/m03000_a3.cl @@ -21,7 +21,7 @@ #endif #ifdef IS_AMD -#define KXX_DECL +#define KXX_DECL volatile #endif #ifdef IS_GENERIC @@ -900,11 +900,11 @@ void s8 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, c #if defined IS_AMD || defined IS_GENERIC /* - * Bitslice DES S-boxes for x86 with MMX/SSE2/AVX and for typical RISC - * architectures. These use AND, OR, XOR, NOT, and AND-NOT gates. + * Bitslice DES S-boxes making use of a vector conditional select operation + * (e.g., vsel on PowerPC with AltiVec). * - * Gate counts: 49 44 46 33 48 46 46 41 - * Average: 44.125 + * Gate counts: 36 33 33 26 35 34 34 32 + * Average: 32.875 * * Several same-gate-count expressions for each S-box are included (for use on * different CPUs/GPUs). @@ -923,556 +923,469 @@ void s8 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, c * The effort has been sponsored by Rapid7: http://www.rapid7.com */ +#define vnot(d,a) (d) = ~(a) +#define vor(d,a,b) (d) = (a) | (b) +#define vxor(d,a,b) (d) = (a) ^ (b) +#define vsel(d,a,b,c) (d) = bitselect ((a), (b), (c)) + void s1 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x55005500, x5A0F5A0F, x3333FFFF, x66666666, x22226666, x2D2D6969, - x25202160; - u32 x00FFFF00, x33CCCC33, x4803120C, x2222FFFF, x6A21EDF3, x4A01CC93; - u32 x5555FFFF, x7F75FFFF, x00D20096, x7FA7FF69; - u32 x0A0A0000, x0AD80096, x00999900, x0AD99996; - u32 x22332233, x257AA5F0, x054885C0, xFAB77A3F, x2221EDF3, xD89697CC; - u32 x05B77AC0, x05F77AD6, x36C48529, x6391D07C, xBB0747B0; - u32 x4C460000, x4EDF9996, x2D4E49EA, xBBFFFFB0, x96B1B65A; - u32 x5AFF5AFF, x52B11215, x4201C010, x10B0D205; - u32 x00, x01, x10, x11, x20, x21, x30, x31; + u32 x0F0F3333, x3C3C3C3C, x55FF55FF, x69C369C3, x0903B73F, x09FCB7C0, x5CA9E295; + u32 x55AFD1B7, x3C3C69C3, x6993B874; + u32 x5CEDE59F, x09FCE295, x5D91A51E, x529E962D; + u32 x29EEADC0, x4B8771A3, x428679F3, x6B68D433; + u32 x5BA7E193, x026F12F3, x6B27C493, x94D83B6C; + u32 x965E0B0F, x3327A113, x847F0A1F, xD6E19C32; + u32 x0DBCE883, x3A25A215, x37994A96; + u32 xC9C93B62, x89490F02, xB96C2D16; + u32 x0, x1, x2, x3; - x55005500 = a1 & ~a5; - x5A0F5A0F = a4 ^ x55005500; - x3333FFFF = a3 | a6; - x66666666 = a1 ^ a3; - x22226666 = x3333FFFF & x66666666; - x2D2D6969 = a4 ^ x22226666; - x25202160 = x2D2D6969 & ~x5A0F5A0F; + vsel(x0F0F3333, a3, a2, a5); + vxor(x3C3C3C3C, a2, a3); + vor(x55FF55FF, a1, a4); + vxor(x69C369C3, x3C3C3C3C, x55FF55FF); + vsel(x0903B73F, a5, x0F0F3333, x69C369C3); + vxor(x09FCB7C0, a4, x0903B73F); + vxor(x5CA9E295, a1, x09FCB7C0); - x00FFFF00 = a5 ^ a6; - x33CCCC33 = a3 ^ x00FFFF00; - x4803120C = x5A0F5A0F & ~x33CCCC33; - x2222FFFF = a6 | x22226666; - x6A21EDF3 = x4803120C ^ x2222FFFF; - x4A01CC93 = x6A21EDF3 & ~x25202160; + vsel(x55AFD1B7, x5CA9E295, x55FF55FF, x0F0F3333); + vsel(x3C3C69C3, x3C3C3C3C, x69C369C3, a5); + vxor(x6993B874, x55AFD1B7, x3C3C69C3); - x5555FFFF = a1 | a6; - x7F75FFFF = x6A21EDF3 | x5555FFFF; - x00D20096 = a5 & ~x2D2D6969; - x7FA7FF69 = x7F75FFFF ^ x00D20096; + vsel(x5CEDE59F, x55FF55FF, x5CA9E295, x6993B874); + vsel(x09FCE295, x09FCB7C0, x5CA9E295, a5); + vsel(x5D91A51E, x5CEDE59F, x6993B874, x09FCE295); + vxor(x529E962D, x0F0F3333, x5D91A51E); - x0A0A0000 = a4 & ~x5555FFFF; - x0AD80096 = x00D20096 ^ x0A0A0000; - x00999900 = x00FFFF00 & ~x66666666; - x0AD99996 = x0AD80096 | x00999900; + vsel(x29EEADC0, x69C369C3, x09FCB7C0, x5CEDE59F); + vsel(x4B8771A3, x0F0F3333, x69C369C3, x5CA9E295); + vsel(x428679F3, a5, x4B8771A3, x529E962D); + vxor(x6B68D433, x29EEADC0, x428679F3); - x22332233 = a3 & ~x55005500; - x257AA5F0 = x5A0F5A0F ^ x7F75FFFF; - x054885C0 = x257AA5F0 & ~x22332233; - xFAB77A3F = ~x054885C0; - x2221EDF3 = x3333FFFF & x6A21EDF3; - xD89697CC = xFAB77A3F ^ x2221EDF3; - x20 = x7FA7FF69 & ~a2; - x21 = x20 ^ xD89697CC; - *out3 ^= x21; + vsel(x5BA7E193, x5CA9E295, x4B8771A3, a3); + vsel(x026F12F3, a4, x0F0F3333, x529E962D); + vsel(x6B27C493, x6B68D433, x5BA7E193, x026F12F3); + vnot(x94D83B6C, x6B27C493); + vsel(x0, x94D83B6C, x6B68D433, a6); + vxor(*out1, *out1, x0); - x05B77AC0 = x00FFFF00 ^ x054885C0; - x05F77AD6 = x00D20096 | x05B77AC0; - x36C48529 = x3333FFFF ^ x05F77AD6; - x6391D07C = a1 ^ x36C48529; - xBB0747B0 = xD89697CC ^ x6391D07C; - x00 = x25202160 | a2; - x01 = x00 ^ xBB0747B0; - *out1 ^= x01; + vsel(x965E0B0F, x94D83B6C, a3, x428679F3); + vsel(x3327A113, x5BA7E193, a2, x69C369C3); + vsel(x847F0A1F, x965E0B0F, a4, x3327A113); + vxor(xD6E19C32, x529E962D, x847F0A1F); + vsel(x1, xD6E19C32, x5CA9E295, a6); + vxor(*out2, *out2, x1); - x4C460000 = x3333FFFF ^ x7F75FFFF; - x4EDF9996 = x0AD99996 | x4C460000; - x2D4E49EA = x6391D07C ^ x4EDF9996; - xBBFFFFB0 = x00FFFF00 | xBB0747B0; - x96B1B65A = x2D4E49EA ^ xBBFFFFB0; - x10 = x4A01CC93 | a2; - x11 = x10 ^ x96B1B65A; - *out2 ^= x11; + vsel(x0DBCE883, x09FCE295, x3C3C69C3, x847F0A1F); + vsel(x3A25A215, x3327A113, x5CA9E295, x0903B73F); + vxor(x37994A96, x0DBCE883, x3A25A215); + vsel(x3, x37994A96, x529E962D, a6); + vxor(*out4, *out4, x3); - x5AFF5AFF = a5 | x5A0F5A0F; - x52B11215 = x5AFF5AFF & ~x2D4E49EA; - x4201C010 = x4A01CC93 & x6391D07C; - x10B0D205 = x52B11215 ^ x4201C010; - x30 = x10B0D205 | a2; - x31 = x30 ^ x0AD99996; - *out4 ^= x31; + vsel(xC9C93B62, x94D83B6C, x69C369C3, x5D91A51E); + vsel(x89490F02, a3, xC9C93B62, x965E0B0F); + vsel(xB96C2D16, x89490F02, x3C3C3C3C, x3A25A215); + vsel(x2, xB96C2D16, x6993B874, a6); + vxor(*out3, *out3, x2); } void s2 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x33CC33CC; - u32 x55550000, x00AA00FF, x33BB33FF; - u32 x33CC0000, x11441144, x11BB11BB, x003311BB; - u32 x00000F0F, x336600FF, x332200FF, x332200F0; - u32 x0302000F, xAAAAAAAA, xA9A8AAA5, x33CCCC33, x33CCC030, x9A646A95; - u32 x00333303, x118822B8, xA8208805, x3CC3C33C, x94E34B39; - u32 x0331330C, x3FF3F33C, xA9DF596A, xA9DF5F6F, x962CAC53; - u32 xA9466A6A, x3DA52153, x29850143, x33C0330C, x1A45324F; - u32 x0A451047, xBBDFDD7B, xB19ACD3C; - u32 x00, x01, x10, x11, x20, x21, x30, x31; + u32 x55553333, x0055FF33, x33270F03, x66725A56, x00FFFF00, x668DA556; + u32 x0F0F5A56, xF0F0A5A9, xA5A5969A, xA55A699A; + u32 x0F5AF03C, x6600FF56, x87A5F09C; + u32 xA55A963C, x3C69C30F, xB44BC32D; + u32 x66D7CC56, x0F4B0F2D, x699CC37B, x996C66D2; + u32 xB46C662D, x278DB412, xB66CB43B; + u32 xD2DC4E52, x27993333, xD2994E33; + u32 x278D0F2D, x2E0E547B, x09976748; + u32 x0, x1, x2, x3; - x33CC33CC = a2 ^ a5; + vsel(x55553333, a1, a3, a6); + vsel(x0055FF33, a6, x55553333, a5); + vsel(x33270F03, a3, a4, x0055FF33); + vxor(x66725A56, a1, x33270F03); + vxor(x00FFFF00, a5, a6); + vxor(x668DA556, x66725A56, x00FFFF00); - x55550000 = a1 & ~a6; - x00AA00FF = a5 & ~x55550000; - x33BB33FF = a2 | x00AA00FF; + vsel(x0F0F5A56, a4, x66725A56, a6); + vnot(xF0F0A5A9, x0F0F5A56); + vxor(xA5A5969A, x55553333, xF0F0A5A9); + vxor(xA55A699A, x00FFFF00, xA5A5969A); + vsel(x1, xA55A699A, x668DA556, a2); + vxor(*out2, *out2, x1); - x33CC0000 = x33CC33CC & ~a6; - x11441144 = a1 & x33CC33CC; - x11BB11BB = a5 ^ x11441144; - x003311BB = x11BB11BB & ~x33CC0000; + vxor(x0F5AF03C, a4, x0055FF33); + vsel(x6600FF56, x66725A56, a6, x00FFFF00); + vsel(x87A5F09C, xA5A5969A, x0F5AF03C, x6600FF56); - x00000F0F = a3 & a6; - x336600FF = x00AA00FF ^ x33CC0000; - x332200FF = x33BB33FF & x336600FF; - x332200F0 = x332200FF & ~x00000F0F; + vsel(xA55A963C, xA5A5969A, x0F5AF03C, a5); + vxor(x3C69C30F, a3, x0F5AF03C); + vsel(xB44BC32D, xA55A963C, x3C69C30F, a1); - x0302000F = a3 & x332200FF; - xAAAAAAAA = ~a1; - xA9A8AAA5 = x0302000F ^ xAAAAAAAA; - x33CCCC33 = a6 ^ x33CC33CC; - x33CCC030 = x33CCCC33 & ~x00000F0F; - x9A646A95 = xA9A8AAA5 ^ x33CCC030; - x10 = a4 & ~x332200F0; - x11 = x10 ^ x9A646A95; - *out2 ^= x11; + vsel(x66D7CC56, x66725A56, x668DA556, xA5A5969A); + vsel(x0F4B0F2D, a4, xB44BC32D, a5); + vxor(x699CC37B, x66D7CC56, x0F4B0F2D); + vxor(x996C66D2, xF0F0A5A9, x699CC37B); + vsel(x0, x996C66D2, xB44BC32D, a2); + vxor(*out1, *out1, x0); - x00333303 = a2 & ~x33CCC030; - x118822B8 = x11BB11BB ^ x00333303; - xA8208805 = xA9A8AAA5 & ~x118822B8; - x3CC3C33C = a3 ^ x33CCCC33; - x94E34B39 = xA8208805 ^ x3CC3C33C; - x00 = x33BB33FF & ~a4; - x01 = x00 ^ x94E34B39; - *out1 ^= x01; + vsel(xB46C662D, xB44BC32D, x996C66D2, x00FFFF00); + vsel(x278DB412, x668DA556, xA5A5969A, a1); + vsel(xB66CB43B, xB46C662D, x278DB412, x6600FF56); - x0331330C = x0302000F ^ x00333303; - x3FF3F33C = x3CC3C33C | x0331330C; - xA9DF596A = x33BB33FF ^ x9A646A95; - xA9DF5F6F = x00000F0F | xA9DF596A; - x962CAC53 = x3FF3F33C ^ xA9DF5F6F; + vsel(xD2DC4E52, x66D7CC56, x996C66D2, xB44BC32D); + vsel(x27993333, x278DB412, a3, x0055FF33); + vsel(xD2994E33, xD2DC4E52, x27993333, a5); + vsel(x3, x87A5F09C, xD2994E33, a2); + vxor(*out4, *out4, x3); - xA9466A6A = x332200FF ^ x9A646A95; - x3DA52153 = x94E34B39 ^ xA9466A6A; - x29850143 = xA9DF5F6F & x3DA52153; - x33C0330C = x33CC33CC & x3FF3F33C; - x1A45324F = x29850143 ^ x33C0330C; - x20 = x1A45324F | a4; - x21 = x20 ^ x962CAC53; - *out3 ^= x21; - - x0A451047 = x1A45324F & ~x118822B8; - xBBDFDD7B = x33CCCC33 | xA9DF596A; - xB19ACD3C = x0A451047 ^ xBBDFDD7B; - x30 = x003311BB | a4; - x31 = x30 ^ xB19ACD3C; - *out4 ^= x31; + vsel(x278D0F2D, x278DB412, x0F4B0F2D, a6); + vsel(x2E0E547B, x0F0F5A56, xB66CB43B, x278D0F2D); + vxor(x09976748, x27993333, x2E0E547B); + vsel(x2, xB66CB43B, x09976748, a2); + vxor(*out3, *out3, x2); } void s3 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x44444444, x0F0FF0F0, x4F4FF4F4, x00FFFF00, x00AAAA00, x4FE55EF4; - u32 x3C3CC3C3, x3C3C0000, x7373F4F4, x0C840A00; - u32 x00005EF4, x00FF5EFF, x00555455, x3C699796; - u32 x000FF000, x55AA55AA, x26D9A15E, x2FDFAF5F, x2FD00F5F; - u32 x55AAFFAA, x28410014, x000000FF, x000000CC, x284100D8; - u32 x204100D0, x3C3CC3FF, x1C3CC32F, x4969967A; - u32 x4CC44CC4, x40C040C0, xC3C33C3C, x9669C396, xD6A98356; - u32 xD6E9C3D6, x4CEEEEC4, x9A072D12, x001A000B, x9A1F2D1B; - u32 x00, x01, x10, x11, x20, x21, x30, x31; + u32 x0F330F33, x0F33F0CC, x5A66A599; + u32 x2111B7BB, x03FF3033, x05BB50EE, x074F201F, x265E97A4; + u32 x556BA09E, x665A93AC, x99A56C53; + u32 x25A1A797, x5713754C, x66559355, x47B135C6; + u32 x9A5A5C60, xD07AF8F8, x87698DB4, xE13C1EE1; + u32 x000CFFCF, x9A485CCE, x0521DDF4, x9E49915E; + u32 xD069F8B4, x030FF0C3, xD2699876; + u32 xD579DDF4, xD579F0C3, xB32C6396; + u32 x0, x1, x2, x3; - x44444444 = a1 & ~a2; - x0F0FF0F0 = a3 ^ a6; - x4F4FF4F4 = x44444444 | x0F0FF0F0; - x00FFFF00 = a4 ^ a6; - x00AAAA00 = x00FFFF00 & ~a1; - x4FE55EF4 = x4F4FF4F4 ^ x00AAAA00; + vsel(x0F330F33, a4, a3, a5); + vxor(x0F33F0CC, a6, x0F330F33); + vxor(x5A66A599, a2, x0F33F0CC); - x3C3CC3C3 = a2 ^ x0F0FF0F0; - x3C3C0000 = x3C3CC3C3 & ~a6; - x7373F4F4 = x4F4FF4F4 ^ x3C3C0000; - x0C840A00 = x4FE55EF4 & ~x7373F4F4; + vsel(x2111B7BB, a3, a6, x5A66A599); + vsel(x03FF3033, a5, a3, x0F33F0CC); + vsel(x05BB50EE, a5, x0F33F0CC, a2); + vsel(x074F201F, x03FF3033, a4, x05BB50EE); + vxor(x265E97A4, x2111B7BB, x074F201F); - x00005EF4 = a6 & x4FE55EF4; - x00FF5EFF = a4 | x00005EF4; - x00555455 = a1 & x00FF5EFF; - x3C699796 = x3C3CC3C3 ^ x00555455; - x30 = x4FE55EF4 & ~a5; - x31 = x30 ^ x3C699796; - *out4 ^= x31; + vsel(x556BA09E, x5A66A599, x05BB50EE, a4); + vsel(x665A93AC, x556BA09E, x265E97A4, a3); + vnot(x99A56C53, x665A93AC); + vsel(x1, x265E97A4, x99A56C53, a1); + vxor(*out2, *out2, x1); - x000FF000 = x0F0FF0F0 & x00FFFF00; - x55AA55AA = a1 ^ a4; - x26D9A15E = x7373F4F4 ^ x55AA55AA; - x2FDFAF5F = a3 | x26D9A15E; - x2FD00F5F = x2FDFAF5F & ~x000FF000; + vxor(x25A1A797, x03FF3033, x265E97A4); + vsel(x5713754C, a2, x0F33F0CC, x074F201F); + vsel(x66559355, x665A93AC, a2, a5); + vsel(x47B135C6, x25A1A797, x5713754C, x66559355); - x55AAFFAA = x00AAAA00 | x55AA55AA; - x28410014 = x3C699796 & ~x55AAFFAA; - x000000FF = a4 & a6; - x000000CC = x000000FF & ~a2; - x284100D8 = x28410014 ^ x000000CC; + vxor(x9A5A5C60, x03FF3033, x99A56C53); + vsel(xD07AF8F8, x9A5A5C60, x556BA09E, x5A66A599); + vxor(x87698DB4, x5713754C, xD07AF8F8); + vxor(xE13C1EE1, x66559355, x87698DB4); - x204100D0 = x7373F4F4 & x284100D8; - x3C3CC3FF = x3C3CC3C3 | x000000FF; - x1C3CC32F = x3C3CC3FF & ~x204100D0; - x4969967A = a1 ^ x1C3CC32F; - x10 = x2FD00F5F & a5; - x11 = x10 ^ x4969967A; - *out2 ^= x11; + vsel(x000CFFCF, a4, a6, x0F33F0CC); + vsel(x9A485CCE, x9A5A5C60, x000CFFCF, x05BB50EE); + vsel(x0521DDF4, x87698DB4, a6, x9A5A5C60); + vsel(x9E49915E, x9A485CCE, x66559355, x0521DDF4); + vsel(x0, x9E49915E, xE13C1EE1, a1); + vxor(*out1, *out1, x0); - x4CC44CC4 = x4FE55EF4 & ~a2; - x40C040C0 = x4CC44CC4 & ~a3; - xC3C33C3C = ~x3C3CC3C3; - x9669C396 = x55AAFFAA ^ xC3C33C3C; - xD6A98356 = x40C040C0 ^ x9669C396; - x00 = a5 & ~x0C840A00; - x01 = x00 ^ xD6A98356; - *out1 ^= x01; + vsel(xD069F8B4, xD07AF8F8, x87698DB4, a5); + vsel(x030FF0C3, x000CFFCF, x03FF3033, a4); + vsel(xD2699876, xD069F8B4, x9E49915E, x030FF0C3); + vsel(x3, x5A66A599, xD2699876, a1); + vxor(*out4, *out4, x3); - xD6E9C3D6 = x40C040C0 | x9669C396; - x4CEEEEC4 = x00AAAA00 | x4CC44CC4; - x9A072D12 = xD6E9C3D6 ^ x4CEEEEC4; - x001A000B = a4 & ~x4FE55EF4; - x9A1F2D1B = x9A072D12 | x001A000B; - x20 = a5 & ~x284100D8; - x21 = x20 ^ x9A1F2D1B; - *out3 ^= x21; + vsel(xD579DDF4, xD07AF8F8, a2, x5713754C); + vsel(xD579F0C3, xD579DDF4, x030FF0C3, a6); + vxor(xB32C6396, x66559355, xD579F0C3); + vsel(x2, xB32C6396, x47B135C6, a1); + vxor(*out3, *out3, x2); } void s4 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x5A5A5A5A, x0F0FF0F0; - u32 x33FF33FF, x33FFCC00, x0C0030F0, x0C0CC0C0, x0CF3C03F, x5EFBDA7F, - x52FBCA0F, x61C8F93C; - u32 x00C0C03C, x0F0F30C0, x3B92A366, x30908326, x3C90B3D6; - u32 x33CC33CC, x0C0CFFFF, x379E5C99, x04124C11, x56E9861E, xA91679E1; - u32 x9586CA37, x8402C833, x84C2C83F, xB35C94A6; - u32 x00, x01, x10, x11, x20, x21, x30, x31; + u32 x0505AFAF, x0555AF55, x0A5AA05A, x46566456, x0A0A5F5F, x0AF55FA0, + x0AF50F0F, x4CA36B59; + u32 xB35C94A6; + u32 x01BB23BB, x5050FAFA, xA31C26BE, xA91679E1; + u32 x56E9861E; + u32 x50E9FA1E, x0AF55F00, x827D9784, xD2946D9A; + u32 x31F720B3, x11FB21B3, x4712A7AD, x9586CA37; + u32 x0, x1, x2, x3; - x5A5A5A5A = a1 ^ a3; - x0F0FF0F0 = a3 ^ a5; - x33FF33FF = a2 | a4; - x33FFCC00 = a5 ^ x33FF33FF; - x0C0030F0 = x0F0FF0F0 & ~x33FFCC00; - x0C0CC0C0 = x0F0FF0F0 & ~a2; - x0CF3C03F = a4 ^ x0C0CC0C0; - x5EFBDA7F = x5A5A5A5A | x0CF3C03F; - x52FBCA0F = x5EFBDA7F & ~x0C0030F0; - x61C8F93C = a2 ^ x52FBCA0F; + vsel(x0505AFAF, a5, a3, a1); + vsel(x0555AF55, x0505AFAF, a1, a4); + vxor(x0A5AA05A, a3, x0555AF55); + vsel(x46566456, a1, x0A5AA05A, a2); + vsel(x0A0A5F5F, a3, a5, a1); + vxor(x0AF55FA0, a4, x0A0A5F5F); + vsel(x0AF50F0F, x0AF55FA0, a3, a5); + vxor(x4CA36B59, x46566456, x0AF50F0F); - x00C0C03C = x0CF3C03F & x61C8F93C; - x0F0F30C0 = x0F0FF0F0 & ~x00C0C03C; - x3B92A366 = x5A5A5A5A ^ x61C8F93C; - x30908326 = x3B92A366 & ~x0F0F30C0; - x3C90B3D6 = x0C0030F0 ^ x30908326; + vnot(xB35C94A6, x4CA36B59); - x33CC33CC = a2 ^ a4; - x0C0CFFFF = a5 | x0C0CC0C0; - x379E5C99 = x3B92A366 ^ x0C0CFFFF; - x04124C11 = x379E5C99 & ~x33CC33CC; - x56E9861E = x52FBCA0F ^ x04124C11; - x00 = a6 & ~x3C90B3D6; - x01 = x00 ^ x56E9861E; - *out1 ^= x01; + vsel(x01BB23BB, a4, a2, x0555AF55); + vxor(x5050FAFA, a1, x0505AFAF); + vsel(xA31C26BE, xB35C94A6, x01BB23BB, x5050FAFA); + vxor(xA91679E1, x0A0A5F5F, xA31C26BE); - xA91679E1 = ~x56E9861E; - x10 = x3C90B3D6 & ~a6; - x11 = x10 ^ xA91679E1; - *out2 ^= x11; + vnot(x56E9861E, xA91679E1); - x9586CA37 = x3C90B3D6 ^ xA91679E1; - x8402C833 = x9586CA37 & ~x33CC33CC; - x84C2C83F = x00C0C03C | x8402C833; - xB35C94A6 = x379E5C99 ^ x84C2C83F; - x20 = x61C8F93C | a6; - x21 = x20 ^ xB35C94A6; - *out3 ^= x21; + vsel(x50E9FA1E, x5050FAFA, x56E9861E, a4); + vsel(x0AF55F00, x0AF50F0F, x0AF55FA0, x0A0A5F5F); + vsel(x827D9784, xB35C94A6, x0AF55F00, a2); + vxor(xD2946D9A, x50E9FA1E, x827D9784); + vsel(x2, xD2946D9A, x4CA36B59, a6); + vxor(*out3, *out3, x2); + vsel(x3, xB35C94A6, xD2946D9A, a6); + vxor(*out4, *out4, x3); - x30 = a6 & x61C8F93C; - x31 = x30 ^ xB35C94A6; - *out4 ^= x31; + vsel(x31F720B3, a2, a4, x0AF55FA0); + vsel(x11FB21B3, x01BB23BB, x31F720B3, x5050FAFA); + vxor(x4712A7AD, x56E9861E, x11FB21B3); + vxor(x9586CA37, xD2946D9A, x4712A7AD); + vsel(x0, x56E9861E, x9586CA37, a6); + vxor(*out1, *out1, x0); + vsel(x1, x9586CA37, xA91679E1, a6); + vxor(*out2, *out2, x1); } void s5 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x77777777, x77770000, x22225555, x11116666, x1F1F6F6F; - u32 x70700000, x43433333, x00430033, x55557777, x55167744, x5A19784B; - u32 x5A1987B4, x7A3BD7F5, x003B00F5, x221955A0, x05050707, x271C52A7; - u32 x2A2A82A0, x6969B193, x1FE06F90, x16804E00, xE97FB1FF; - u32 x43403302, x35CAED30, x37DEFFB7, x349ECCB5, x0B01234A; - u32 x101884B4, x0FF8EB24, x41413333, x4FF9FB37, x4FC2FBC2; - u32 x22222222, x16BCEE97, x0F080B04, x19B4E593; - u32 x5C5C5C5C, x4448184C, x2DDABE71, x6992A63D; - u32 x00, x01, x10, x11, x20, x21, x30, x31; + u32 x550F550F, xAAF0AAF0, xA5F5A5F5, x96C696C6, x00FFFF00, x963969C6; + u32 x2E3C2E3C, xB73121F7, x1501DF0F, x00558A5F, x2E69A463; + u32 x0679ED42, x045157FD, xB32077FF, x9D49D39C; + u32 xAC81CFB2, xF72577AF, x5BA4B81D; + u32 x5BA477AF, x4895469F, x3A35273A, x1A35669A; + u32 x12E6283D, x9E47D3D4, x1A676AB4; + u32 x891556DF, xE5E77F82, x6CF2295D; + u32 x2E3CA5F5, x9697C1C6, x369CC1D6; + u32 x0, x1, x2, x3; - x77777777 = a1 | a3; - x77770000 = x77777777 & ~a6; - x22225555 = a1 ^ x77770000; - x11116666 = a3 ^ x22225555; - x1F1F6F6F = a4 | x11116666; + vsel(x550F550F, a1, a3, a5); + vnot(xAAF0AAF0, x550F550F); + vsel(xA5F5A5F5, xAAF0AAF0, a1, a3); + vxor(x96C696C6, a2, xA5F5A5F5); + vxor(x00FFFF00, a5, a6); + vxor(x963969C6, x96C696C6, x00FFFF00); - x70700000 = x77770000 & ~a4; - x43433333 = a3 ^ x70700000; - x00430033 = a5 & x43433333; - x55557777 = a1 | x11116666; - x55167744 = x00430033 ^ x55557777; - x5A19784B = a4 ^ x55167744; + vsel(x2E3C2E3C, a3, xAAF0AAF0, a2); + vsel(xB73121F7, a2, x963969C6, x96C696C6); + vsel(x1501DF0F, a6, x550F550F, xB73121F7); + vsel(x00558A5F, x1501DF0F, a5, a1); + vxor(x2E69A463, x2E3C2E3C, x00558A5F); - x5A1987B4 = a6 ^ x5A19784B; - x7A3BD7F5 = x22225555 | x5A1987B4; - x003B00F5 = a5 & x7A3BD7F5; - x221955A0 = x22225555 ^ x003B00F5; - x05050707 = a4 & x55557777; - x271C52A7 = x221955A0 ^ x05050707; + vsel(x0679ED42, x00FFFF00, x2E69A463, x96C696C6); + vsel(x045157FD, a6, a1, x0679ED42); + vsel(xB32077FF, xB73121F7, a6, x045157FD); + vxor(x9D49D39C, x2E69A463, xB32077FF); + vsel(x2, x9D49D39C, x2E69A463, a4); + vxor(*out3, *out3, x2); - x2A2A82A0 = x7A3BD7F5 & ~a1; - x6969B193 = x43433333 ^ x2A2A82A0; - x1FE06F90 = a5 ^ x1F1F6F6F; - x16804E00 = x1FE06F90 & ~x6969B193; - xE97FB1FF = ~x16804E00; - x20 = xE97FB1FF & ~a2; - x21 = x20 ^ x5A19784B; - *out3 ^= x21; + vsel(xAC81CFB2, xAAF0AAF0, x1501DF0F, x0679ED42); + vsel(xF72577AF, xB32077FF, x550F550F, a1); + vxor(x5BA4B81D, xAC81CFB2, xF72577AF); + vsel(x1, x5BA4B81D, x963969C6, a4); + vxor(*out2, *out2, x1); - x43403302 = x43433333 & ~x003B00F5; - x35CAED30 = x2A2A82A0 ^ x1FE06F90; - x37DEFFB7 = x271C52A7 | x35CAED30; - x349ECCB5 = x37DEFFB7 & ~x43403302; - x0B01234A = x1F1F6F6F & ~x349ECCB5; + vsel(x5BA477AF, x5BA4B81D, xF72577AF, a6); + vsel(x4895469F, x5BA477AF, x00558A5F, a2); + vsel(x3A35273A, x2E3C2E3C, a2, x963969C6); + vsel(x1A35669A, x4895469F, x3A35273A, x5BA4B81D); - x101884B4 = x5A1987B4 & x349ECCB5; - x0FF8EB24 = x1FE06F90 ^ x101884B4; - x41413333 = x43433333 & x55557777; - x4FF9FB37 = x0FF8EB24 | x41413333; - x4FC2FBC2 = x003B00F5 ^ x4FF9FB37; - x30 = x4FC2FBC2 & a2; - x31 = x30 ^ x271C52A7; - *out4 ^= x31; + vsel(x12E6283D, a5, x5BA4B81D, x963969C6); + vsel(x9E47D3D4, x96C696C6, x9D49D39C, xAC81CFB2); + vsel(x1A676AB4, x12E6283D, x9E47D3D4, x4895469F); - x22222222 = a1 ^ x77777777; - x16BCEE97 = x349ECCB5 ^ x22222222; - x0F080B04 = a4 & x0FF8EB24; - x19B4E593 = x16BCEE97 ^ x0F080B04; - x00 = x0B01234A | a2; - x01 = x00 ^ x19B4E593; - *out1 ^= x01; + vsel(x891556DF, xB32077FF, x4895469F, x3A35273A); + vsel(xE5E77F82, xF72577AF, x00FFFF00, x12E6283D); + vxor(x6CF2295D, x891556DF, xE5E77F82); + vsel(x3, x1A35669A, x6CF2295D, a4); + vxor(*out4, *out4, x3); - x5C5C5C5C = x1F1F6F6F ^ x43433333; - x4448184C = x5C5C5C5C & ~x19B4E593; - x2DDABE71 = x22225555 ^ x0FF8EB24; - x6992A63D = x4448184C ^ x2DDABE71; - x10 = x1F1F6F6F & a2; - x11 = x10 ^ x6992A63D; - *out2 ^= x11; + vsel(x2E3CA5F5, x2E3C2E3C, xA5F5A5F5, a6); + vsel(x9697C1C6, x96C696C6, x963969C6, x045157FD); + vsel(x369CC1D6, x2E3CA5F5, x9697C1C6, x5BA477AF); + vsel(x0, x369CC1D6, x1A676AB4, a4); + vxor(*out1, *out1, x0); } void s6 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x33CC33CC; - u32 x3333FFFF, x11115555, x22DD6699, x22DD9966, x00220099; - u32 x00551144, x33662277, x5A5A5A5A, x7B7E7A7F, x59A31CE6; - u32 x09030C06, x09030000, x336622FF, x3A6522FF; - u32 x484D494C, x0000B6B3, x0F0FB9BC, x00FC00F9, x0FFFB9FD; - u32 x5DF75DF7, x116600F7, x1E69B94B, x1668B94B; - u32 x7B7B7B7B, x411E5984, x1FFFFDFD, x5EE1A479; - u32 x3CB4DFD2, x004B002D, xB7B2B6B3, xCCC9CDC8, xCC82CDE5; - u32 x0055EEBB, x5A5AECE9, x0050ECA9, xC5CAC1CE, xC59A2D67; - u32 x00, x01, x10, x11, x20, x21, x30, x31; + u32 x555500FF, x666633CC, x606F30CF, x353A659A, x353A9A65, xCAC5659A; + u32 x353A6565, x0A3F0A6F, x6C5939A3, x5963A3C6; + u32 x35FF659A, x3AF06A95, x05CF0A9F, x16E94A97; + u32 x86CD4C9B, x12E0FFFD, x942D9A67; + u32 x142956AB, x455D45DF, x1C3EE619; + u32 x2AEA70D5, x20CF7A9F, x3CF19C86, x69A49C79; + u32 x840DBB67, x6DA19C1E, x925E63E1; + u32 x9C3CA761, x257A75D5, xB946D2B4; + u32 x0, x1, x2, x3; - x33CC33CC = a2 ^ a5; + vsel(x555500FF, a1, a4, a5); + vxor(x666633CC, a2, x555500FF); + vsel(x606F30CF, x666633CC, a4, a3); + vxor(x353A659A, a1, x606F30CF); + vxor(x353A9A65, a5, x353A659A); + vnot(xCAC5659A, x353A9A65); - x3333FFFF = a2 | a6; - x11115555 = a1 & x3333FFFF; - x22DD6699 = x33CC33CC ^ x11115555; - x22DD9966 = a6 ^ x22DD6699; - x00220099 = a5 & ~x22DD9966; + vsel(x353A6565, x353A659A, x353A9A65, a4); + vsel(x0A3F0A6F, a3, a4, x353A6565); + vxor(x6C5939A3, x666633CC, x0A3F0A6F); + vxor(x5963A3C6, x353A9A65, x6C5939A3); - x00551144 = a1 & x22DD9966; - x33662277 = a2 ^ x00551144; - x5A5A5A5A = a1 ^ a3; - x7B7E7A7F = x33662277 | x5A5A5A5A; - x59A31CE6 = x22DD6699 ^ x7B7E7A7F; + vsel(x35FF659A, a4, x353A659A, x353A6565); + vxor(x3AF06A95, a3, x35FF659A); + vsel(x05CF0A9F, a4, a3, x353A9A65); + vsel(x16E94A97, x3AF06A95, x05CF0A9F, x6C5939A3); - x09030C06 = a3 & x59A31CE6; - x09030000 = x09030C06 & ~a6; - x336622FF = x00220099 | x33662277; - x3A6522FF = x09030000 ^ x336622FF; - x30 = x3A6522FF & a4; - x31 = x30 ^ x59A31CE6; - *out4 ^= x31; + vsel(x86CD4C9B, xCAC5659A, x05CF0A9F, x6C5939A3); + vsel(x12E0FFFD, a5, x3AF06A95, x16E94A97); + vsel(x942D9A67, x86CD4C9B, x353A9A65, x12E0FFFD); + vsel(x0, xCAC5659A, x942D9A67, a6); + vxor(*out1, *out1, x0); - x484D494C = a2 ^ x7B7E7A7F; - x0000B6B3 = a6 & ~x484D494C; - x0F0FB9BC = a3 ^ x0000B6B3; - x00FC00F9 = a5 & ~x09030C06; - x0FFFB9FD = x0F0FB9BC | x00FC00F9; + vsel(x142956AB, x353A659A, x942D9A67, a2); + vsel(x455D45DF, a1, x86CD4C9B, x142956AB); + vxor(x1C3EE619, x5963A3C6, x455D45DF); + vsel(x3, x5963A3C6, x1C3EE619, a6); + vxor(*out4, *out4, x3); - x5DF75DF7 = a1 | x59A31CE6; - x116600F7 = x336622FF & x5DF75DF7; - x1E69B94B = x0F0FB9BC ^ x116600F7; - x1668B94B = x1E69B94B & ~x09030000; - x20 = x00220099 | a4; - x21 = x20 ^ x1668B94B; - *out3 ^= x21; + vsel(x2AEA70D5, x3AF06A95, x606F30CF, x353A9A65); + vsel(x20CF7A9F, x2AEA70D5, x05CF0A9F, x0A3F0A6F); + vxor(x3CF19C86, x1C3EE619, x20CF7A9F); + vxor(x69A49C79, x555500FF, x3CF19C86); - x7B7B7B7B = a2 | x5A5A5A5A; - x411E5984 = x3A6522FF ^ x7B7B7B7B; - x1FFFFDFD = x11115555 | x0FFFB9FD; - x5EE1A479 = x411E5984 ^ x1FFFFDFD; + vsel(x840DBB67, a5, x942D9A67, x86CD4C9B); + vsel(x6DA19C1E, x69A49C79, x3CF19C86, x840DBB67); + vnot(x925E63E1, x6DA19C1E); + vsel(x1, x925E63E1, x69A49C79, a6); + vxor(*out2, *out2, x1); - x3CB4DFD2 = x22DD6699 ^ x1E69B94B; - x004B002D = a5 & ~x3CB4DFD2; - xB7B2B6B3 = ~x484D494C; - xCCC9CDC8 = x7B7B7B7B ^ xB7B2B6B3; - xCC82CDE5 = x004B002D ^ xCCC9CDC8; - x10 = xCC82CDE5 & ~a4; - x11 = x10 ^ x5EE1A479; - *out2 ^= x11; - - x0055EEBB = a6 ^ x00551144; - x5A5AECE9 = a1 ^ x0F0FB9BC; - x0050ECA9 = x0055EEBB & x5A5AECE9; - xC5CAC1CE = x09030C06 ^ xCCC9CDC8; - xC59A2D67 = x0050ECA9 ^ xC5CAC1CE; - x00 = x0FFFB9FD & ~a4; - x01 = x00 ^ xC59A2D67; - *out1 ^= x01; + vsel(x9C3CA761, x840DBB67, x1C3EE619, x3CF19C86); + vsel(x257A75D5, x455D45DF, x2AEA70D5, x606F30CF); + vxor(xB946D2B4, x9C3CA761, x257A75D5); + vsel(x2, x16E94A97, xB946D2B4, a6); + vxor(*out3, *out3, x2); } void s7 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x0FF00FF0, x3CC33CC3, x00003CC3, x0F000F00, x5A555A55, x00001841; - u32 x00000F00, x33333C33, x7B777E77, x0FF0F00F, x74878E78; - u32 x003C003C, x5A7D5A7D, x333300F0, x694E5A8D; - u32 x0FF0CCCC, x000F0303, x5A505854, x33CC000F, x699C585B; - u32 x7F878F78, x21101013, x7F979F7B, x30030CC0, x4F9493BB; - u32 x6F9CDBFB, x0000DBFB, x00005151, x26DAC936, x26DA9867; - u32 x27DA9877, x27DA438C, x2625C9C9, x27FFCBCD; - u32 x27FF1036, x27FF103E, xB06B6C44, x97947C7A; - u32 x00, x01, x10, x11, x20, x21, x30, x31; + u32 x44447777, x4B4B7878, x22772277, x0505F5F5, x220522F5, x694E5A8D; + u32 x00FFFF00, x66666666, x32353235, x26253636, x26DAC936; + u32 x738F9C63, x11EF9867, x26DA9867; + u32 x4B4B9C63, x4B666663, x4E639396; + u32 x4E4B393C, xFF00FF00, xFF05DD21, xB14EE41D; + u32 xD728827B, x6698807B, x699C585B; + u32 x778A8877, xA4A71E18, x74878E78; + u32 x204A5845, x74879639, x8B7869C6; + u32 x0, x1, x2, x3; - x0FF00FF0 = a4 ^ a5; - x3CC33CC3 = a3 ^ x0FF00FF0; - x00003CC3 = a6 & x3CC33CC3; - x0F000F00 = a4 & x0FF00FF0; - x5A555A55 = a2 ^ x0F000F00; - x00001841 = x00003CC3 & x5A555A55; + vsel(x44447777, a2, a6, a3); + vxor(x4B4B7878, a4, x44447777); + vsel(x22772277, a3, a5, a2); + vsel(x0505F5F5, a6, a2, a4); + vsel(x220522F5, x22772277, x0505F5F5, a5); + vxor(x694E5A8D, x4B4B7878, x220522F5); - x00000F00 = a6 & x0F000F00; - x33333C33 = a3 ^ x00000F00; - x7B777E77 = x5A555A55 | x33333C33; - x0FF0F00F = a6 ^ x0FF00FF0; - x74878E78 = x7B777E77 ^ x0FF0F00F; - x30 = a1 & ~x00001841; - x31 = x30 ^ x74878E78; - *out4 ^= x31; + vxor(x00FFFF00, a5, a6); + vxor(x66666666, a2, a3); + vsel(x32353235, a3, x220522F5, a4); + vsel(x26253636, x66666666, x32353235, x4B4B7878); + vxor(x26DAC936, x00FFFF00, x26253636); + vsel(x0, x26DAC936, x694E5A8D, a1); + vxor(*out1, *out1, x0); - x003C003C = a5 & ~x3CC33CC3; - x5A7D5A7D = x5A555A55 | x003C003C; - x333300F0 = x00003CC3 ^ x33333C33; - x694E5A8D = x5A7D5A7D ^ x333300F0; + vxor(x738F9C63, a2, x26DAC936); + vsel(x11EF9867, x738F9C63, a5, x66666666); + vsel(x26DA9867, x26DAC936, x11EF9867, a6); - x0FF0CCCC = x00003CC3 ^ x0FF0F00F; - x000F0303 = a4 & ~x0FF0CCCC; - x5A505854 = x5A555A55 & ~x000F0303; - x33CC000F = a5 ^ x333300F0; - x699C585B = x5A505854 ^ x33CC000F; + vsel(x4B4B9C63, x4B4B7878, x738F9C63, a6); + vsel(x4B666663, x4B4B9C63, x66666666, x00FFFF00); + vxor(x4E639396, x0505F5F5, x4B666663); - x7F878F78 = x0F000F00 | x74878E78; - x21101013 = a3 & x699C585B; - x7F979F7B = x7F878F78 | x21101013; - x30030CC0 = x3CC33CC3 & ~x0FF0F00F; - x4F9493BB = x7F979F7B ^ x30030CC0; - x00 = x4F9493BB & ~a1; - x01 = x00 ^ x694E5A8D; - *out1 ^= x01; + vsel(x4E4B393C, x4B4B7878, x4E639396, a2); + vnot(xFF00FF00, a5); + vsel(xFF05DD21, xFF00FF00, x738F9C63, x32353235); + vxor(xB14EE41D, x4E4B393C, xFF05DD21); + vsel(x1, xB14EE41D, x26DA9867, a1); + vxor(*out2, *out2, x1); - x6F9CDBFB = x699C585B | x4F9493BB; - x0000DBFB = a6 & x6F9CDBFB; - x00005151 = a2 & x0000DBFB; - x26DAC936 = x694E5A8D ^ x4F9493BB; - x26DA9867 = x00005151 ^ x26DAC936; + vxor(xD728827B, x66666666, xB14EE41D); + vsel(x6698807B, x26DA9867, xD728827B, x4E4B393C); + vsel(x699C585B, x6698807B, x694E5A8D, xFF05DD21); + vsel(x2, x699C585B, x4E639396, a1); + vxor(*out3, *out3, x2); - x27DA9877 = x21101013 | x26DA9867; - x27DA438C = x0000DBFB ^ x27DA9877; - x2625C9C9 = a5 ^ x26DAC936; - x27FFCBCD = x27DA438C | x2625C9C9; - x20 = x27FFCBCD & a1; - x21 = x20 ^ x699C585B; - *out3 ^= x21; + vsel(x778A8877, x738F9C63, x26DAC936, x26253636); + vxor(xA4A71E18, x738F9C63, xD728827B); + vsel(x74878E78, x778A8877, xA4A71E18, a4); - x27FF1036 = x0000DBFB ^ x27FFCBCD; - x27FF103E = x003C003C | x27FF1036; - xB06B6C44 = ~x4F9493BB; - x97947C7A = x27FF103E ^ xB06B6C44; - x10 = x97947C7A & ~a1; - x11 = x10 ^ x26DA9867; - *out2 ^= x11; + vsel(x204A5845, x26DA9867, x694E5A8D, x26DAC936); + vsel(x74879639, x74878E78, a3, x204A5845); + vnot(x8B7869C6, x74879639); + vsel(x3, x74878E78, x8B7869C6, a1); + vxor(*out4, *out4, x3); } void s8 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x0C0C0C0C, x0000F0F0, x00FFF00F, x00555005, x00515001; - u32 x33000330, x77555775, x30303030, x3030CFCF, x30104745, x30555745; - u32 xFF000FF0, xCF1048B5, x080A080A, xC71A40BF, xCB164CB3; - u32 x9E4319E6, x000019E6, xF429738C, xF4296A6A, xC729695A; - u32 xC47C3D2F, xF77F3F3F, x9E43E619, x693CD926; - u32 xF719A695, xF4FF73FF, x03E6D56A, x56B3803F; - u32 xF700A600, x61008000, x03B7856B, x62B7056B; - u32 x00, x01, x10, x11, x20, x21, x30, x31; + u32 x0505F5F5, x05FAF50A, x0F0F00FF, x22227777, x07DA807F, x34E9B34C; + u32 x00FFF00F, x0033FCCF, x5565B15C, x0C0C3F3F, x59698E63; + u32 x3001F74E, x30555745, x693CD926; + u32 x0C0CD926, x0C3F25E9, x38D696A5; + u32 xC729695A; + u32 x03D2117B, xC778395B, xCB471CB2; + u32 x5425B13F, x56B3803F, x919AE965; + u32 x17B3023F, x75555755, x62E6556A, xA59E6C31; + u32 x0, x1, x2, x3; - x0C0C0C0C = a3 & ~a2; - x0000F0F0 = a5 & ~a3; - x00FFF00F = a4 ^ x0000F0F0; - x00555005 = a1 & x00FFF00F; - x00515001 = x00555005 & ~x0C0C0C0C; + vsel(x0505F5F5, a5, a1, a3); + vxor(x05FAF50A, a4, x0505F5F5); + vsel(x0F0F00FF, a3, a4, a5); + vsel(x22227777, a2, a5, a1); + vsel(x07DA807F, x05FAF50A, x0F0F00FF, x22227777); + vxor(x34E9B34C, a2, x07DA807F); - x33000330 = a2 & ~x00FFF00F; - x77555775 = a1 | x33000330; - x30303030 = a2 & ~a3; - x3030CFCF = a5 ^ x30303030; - x30104745 = x77555775 & x3030CFCF; - x30555745 = x00555005 | x30104745; + vsel(x00FFF00F, x05FAF50A, a4, a3); + vsel(x0033FCCF, a5, x00FFF00F, a2); + vsel(x5565B15C, a1, x34E9B34C, x0033FCCF); + vsel(x0C0C3F3F, a3, a5, a2); + vxor(x59698E63, x5565B15C, x0C0C3F3F); - xFF000FF0 = ~x00FFF00F; - xCF1048B5 = x30104745 ^ xFF000FF0; - x080A080A = a3 & ~x77555775; - xC71A40BF = xCF1048B5 ^ x080A080A; - xCB164CB3 = x0C0C0C0C ^ xC71A40BF; - x10 = x00515001 | a6; - x11 = x10 ^ xCB164CB3; - *out2 ^= x11; + vsel(x3001F74E, x34E9B34C, a5, x05FAF50A); + vsel(x30555745, x3001F74E, a1, x00FFF00F); + vxor(x693CD926, x59698E63, x30555745); + vsel(x2, x693CD926, x59698E63, a6); + vxor(*out3, *out3, x2); - x9E4319E6 = a1 ^ xCB164CB3; - x000019E6 = a5 & x9E4319E6; - xF429738C = a2 ^ xC71A40BF; - xF4296A6A = x000019E6 ^ xF429738C; - xC729695A = x33000330 ^ xF4296A6A; + vsel(x0C0CD926, x0C0C3F3F, x693CD926, a5); + vxor(x0C3F25E9, x0033FCCF, x0C0CD926); + vxor(x38D696A5, x34E9B34C, x0C3F25E9); - xC47C3D2F = x30555745 ^ xF4296A6A; - xF77F3F3F = a2 | xC47C3D2F; - x9E43E619 = a5 ^ x9E4319E6; - x693CD926 = xF77F3F3F ^ x9E43E619; - x20 = x30555745 & a6; - x21 = x20 ^ x693CD926; - *out3 ^= x21; + vnot(xC729695A, x38D696A5); - xF719A695 = x3030CFCF ^ xC729695A; - xF4FF73FF = a4 | xF429738C; - x03E6D56A = xF719A695 ^ xF4FF73FF; - x56B3803F = a1 ^ x03E6D56A; - x30 = x56B3803F & a6; - x31 = x30 ^ xC729695A; - *out4 ^= x31; + vsel(x03D2117B, x07DA807F, a2, x0C0CD926); + vsel(xC778395B, xC729695A, x03D2117B, x30555745); + vxor(xCB471CB2, x0C3F25E9, xC778395B); + vsel(x1, xCB471CB2, x34E9B34C, a6); + vxor(*out2, *out2, x1); - xF700A600 = xF719A695 & ~a4; - x61008000 = x693CD926 & xF700A600; - x03B7856B = x00515001 ^ x03E6D56A; - x62B7056B = x61008000 ^ x03B7856B; - x00 = x62B7056B | a6; - x01 = x00 ^ xC729695A; - *out1 ^= x01; + vsel(x5425B13F, x5565B15C, x0C0C3F3F, x03D2117B); + vsel(x56B3803F, x07DA807F, x5425B13F, x59698E63); + vxor(x919AE965, xC729695A, x56B3803F); + vsel(x3, xC729695A, x919AE965, a6); + vxor(*out4, *out4, x3); + + vsel(x17B3023F, x07DA807F, a2, x59698E63); + vor(x75555755, a1, x30555745); + vxor(x62E6556A, x17B3023F, x75555755); + vxor(xA59E6C31, xC778395B, x62E6556A); + vsel(x0, xA59E6C31, x38D696A5, a6); + vxor(*out1, *out1, x0); } #endif diff --git a/OpenCL/m14000_a3.cl b/OpenCL/m14000_a3.cl index 28bc677db..e961a3453 100644 --- a/OpenCL/m14000_a3.cl +++ b/OpenCL/m14000_a3.cl @@ -21,7 +21,7 @@ #endif #ifdef IS_AMD -#define KXX_DECL +#define KXX_DECL volatile #endif #ifdef IS_GENERIC @@ -900,11 +900,11 @@ void s8 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, c #if defined IS_AMD || defined IS_GENERIC /* - * Bitslice DES S-boxes for x86 with MMX/SSE2/AVX and for typical RISC - * architectures. These use AND, OR, XOR, NOT, and AND-NOT gates. + * Bitslice DES S-boxes making use of a vector conditional select operation + * (e.g., vsel on PowerPC with AltiVec). * - * Gate counts: 49 44 46 33 48 46 46 41 - * Average: 44.125 + * Gate counts: 36 33 33 26 35 34 34 32 + * Average: 32.875 * * Several same-gate-count expressions for each S-box are included (for use on * different CPUs/GPUs). @@ -923,556 +923,469 @@ void s8 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, c * The effort has been sponsored by Rapid7: http://www.rapid7.com */ +#define vnot(d,a) (d) = ~(a) +#define vor(d,a,b) (d) = (a) | (b) +#define vxor(d,a,b) (d) = (a) ^ (b) +#define vsel(d,a,b,c) (d) = bitselect ((a), (b), (c)) + void s1 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x55005500, x5A0F5A0F, x3333FFFF, x66666666, x22226666, x2D2D6969, - x25202160; - u32 x00FFFF00, x33CCCC33, x4803120C, x2222FFFF, x6A21EDF3, x4A01CC93; - u32 x5555FFFF, x7F75FFFF, x00D20096, x7FA7FF69; - u32 x0A0A0000, x0AD80096, x00999900, x0AD99996; - u32 x22332233, x257AA5F0, x054885C0, xFAB77A3F, x2221EDF3, xD89697CC; - u32 x05B77AC0, x05F77AD6, x36C48529, x6391D07C, xBB0747B0; - u32 x4C460000, x4EDF9996, x2D4E49EA, xBBFFFFB0, x96B1B65A; - u32 x5AFF5AFF, x52B11215, x4201C010, x10B0D205; - u32 x00, x01, x10, x11, x20, x21, x30, x31; + u32 x0F0F3333, x3C3C3C3C, x55FF55FF, x69C369C3, x0903B73F, x09FCB7C0, x5CA9E295; + u32 x55AFD1B7, x3C3C69C3, x6993B874; + u32 x5CEDE59F, x09FCE295, x5D91A51E, x529E962D; + u32 x29EEADC0, x4B8771A3, x428679F3, x6B68D433; + u32 x5BA7E193, x026F12F3, x6B27C493, x94D83B6C; + u32 x965E0B0F, x3327A113, x847F0A1F, xD6E19C32; + u32 x0DBCE883, x3A25A215, x37994A96; + u32 xC9C93B62, x89490F02, xB96C2D16; + u32 x0, x1, x2, x3; - x55005500 = a1 & ~a5; - x5A0F5A0F = a4 ^ x55005500; - x3333FFFF = a3 | a6; - x66666666 = a1 ^ a3; - x22226666 = x3333FFFF & x66666666; - x2D2D6969 = a4 ^ x22226666; - x25202160 = x2D2D6969 & ~x5A0F5A0F; + vsel(x0F0F3333, a3, a2, a5); + vxor(x3C3C3C3C, a2, a3); + vor(x55FF55FF, a1, a4); + vxor(x69C369C3, x3C3C3C3C, x55FF55FF); + vsel(x0903B73F, a5, x0F0F3333, x69C369C3); + vxor(x09FCB7C0, a4, x0903B73F); + vxor(x5CA9E295, a1, x09FCB7C0); - x00FFFF00 = a5 ^ a6; - x33CCCC33 = a3 ^ x00FFFF00; - x4803120C = x5A0F5A0F & ~x33CCCC33; - x2222FFFF = a6 | x22226666; - x6A21EDF3 = x4803120C ^ x2222FFFF; - x4A01CC93 = x6A21EDF3 & ~x25202160; + vsel(x55AFD1B7, x5CA9E295, x55FF55FF, x0F0F3333); + vsel(x3C3C69C3, x3C3C3C3C, x69C369C3, a5); + vxor(x6993B874, x55AFD1B7, x3C3C69C3); - x5555FFFF = a1 | a6; - x7F75FFFF = x6A21EDF3 | x5555FFFF; - x00D20096 = a5 & ~x2D2D6969; - x7FA7FF69 = x7F75FFFF ^ x00D20096; + vsel(x5CEDE59F, x55FF55FF, x5CA9E295, x6993B874); + vsel(x09FCE295, x09FCB7C0, x5CA9E295, a5); + vsel(x5D91A51E, x5CEDE59F, x6993B874, x09FCE295); + vxor(x529E962D, x0F0F3333, x5D91A51E); - x0A0A0000 = a4 & ~x5555FFFF; - x0AD80096 = x00D20096 ^ x0A0A0000; - x00999900 = x00FFFF00 & ~x66666666; - x0AD99996 = x0AD80096 | x00999900; + vsel(x29EEADC0, x69C369C3, x09FCB7C0, x5CEDE59F); + vsel(x4B8771A3, x0F0F3333, x69C369C3, x5CA9E295); + vsel(x428679F3, a5, x4B8771A3, x529E962D); + vxor(x6B68D433, x29EEADC0, x428679F3); - x22332233 = a3 & ~x55005500; - x257AA5F0 = x5A0F5A0F ^ x7F75FFFF; - x054885C0 = x257AA5F0 & ~x22332233; - xFAB77A3F = ~x054885C0; - x2221EDF3 = x3333FFFF & x6A21EDF3; - xD89697CC = xFAB77A3F ^ x2221EDF3; - x20 = x7FA7FF69 & ~a2; - x21 = x20 ^ xD89697CC; - *out3 ^= x21; + vsel(x5BA7E193, x5CA9E295, x4B8771A3, a3); + vsel(x026F12F3, a4, x0F0F3333, x529E962D); + vsel(x6B27C493, x6B68D433, x5BA7E193, x026F12F3); + vnot(x94D83B6C, x6B27C493); + vsel(x0, x94D83B6C, x6B68D433, a6); + vxor(*out1, *out1, x0); - x05B77AC0 = x00FFFF00 ^ x054885C0; - x05F77AD6 = x00D20096 | x05B77AC0; - x36C48529 = x3333FFFF ^ x05F77AD6; - x6391D07C = a1 ^ x36C48529; - xBB0747B0 = xD89697CC ^ x6391D07C; - x00 = x25202160 | a2; - x01 = x00 ^ xBB0747B0; - *out1 ^= x01; + vsel(x965E0B0F, x94D83B6C, a3, x428679F3); + vsel(x3327A113, x5BA7E193, a2, x69C369C3); + vsel(x847F0A1F, x965E0B0F, a4, x3327A113); + vxor(xD6E19C32, x529E962D, x847F0A1F); + vsel(x1, xD6E19C32, x5CA9E295, a6); + vxor(*out2, *out2, x1); - x4C460000 = x3333FFFF ^ x7F75FFFF; - x4EDF9996 = x0AD99996 | x4C460000; - x2D4E49EA = x6391D07C ^ x4EDF9996; - xBBFFFFB0 = x00FFFF00 | xBB0747B0; - x96B1B65A = x2D4E49EA ^ xBBFFFFB0; - x10 = x4A01CC93 | a2; - x11 = x10 ^ x96B1B65A; - *out2 ^= x11; + vsel(x0DBCE883, x09FCE295, x3C3C69C3, x847F0A1F); + vsel(x3A25A215, x3327A113, x5CA9E295, x0903B73F); + vxor(x37994A96, x0DBCE883, x3A25A215); + vsel(x3, x37994A96, x529E962D, a6); + vxor(*out4, *out4, x3); - x5AFF5AFF = a5 | x5A0F5A0F; - x52B11215 = x5AFF5AFF & ~x2D4E49EA; - x4201C010 = x4A01CC93 & x6391D07C; - x10B0D205 = x52B11215 ^ x4201C010; - x30 = x10B0D205 | a2; - x31 = x30 ^ x0AD99996; - *out4 ^= x31; + vsel(xC9C93B62, x94D83B6C, x69C369C3, x5D91A51E); + vsel(x89490F02, a3, xC9C93B62, x965E0B0F); + vsel(xB96C2D16, x89490F02, x3C3C3C3C, x3A25A215); + vsel(x2, xB96C2D16, x6993B874, a6); + vxor(*out3, *out3, x2); } void s2 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x33CC33CC; - u32 x55550000, x00AA00FF, x33BB33FF; - u32 x33CC0000, x11441144, x11BB11BB, x003311BB; - u32 x00000F0F, x336600FF, x332200FF, x332200F0; - u32 x0302000F, xAAAAAAAA, xA9A8AAA5, x33CCCC33, x33CCC030, x9A646A95; - u32 x00333303, x118822B8, xA8208805, x3CC3C33C, x94E34B39; - u32 x0331330C, x3FF3F33C, xA9DF596A, xA9DF5F6F, x962CAC53; - u32 xA9466A6A, x3DA52153, x29850143, x33C0330C, x1A45324F; - u32 x0A451047, xBBDFDD7B, xB19ACD3C; - u32 x00, x01, x10, x11, x20, x21, x30, x31; + u32 x55553333, x0055FF33, x33270F03, x66725A56, x00FFFF00, x668DA556; + u32 x0F0F5A56, xF0F0A5A9, xA5A5969A, xA55A699A; + u32 x0F5AF03C, x6600FF56, x87A5F09C; + u32 xA55A963C, x3C69C30F, xB44BC32D; + u32 x66D7CC56, x0F4B0F2D, x699CC37B, x996C66D2; + u32 xB46C662D, x278DB412, xB66CB43B; + u32 xD2DC4E52, x27993333, xD2994E33; + u32 x278D0F2D, x2E0E547B, x09976748; + u32 x0, x1, x2, x3; - x33CC33CC = a2 ^ a5; + vsel(x55553333, a1, a3, a6); + vsel(x0055FF33, a6, x55553333, a5); + vsel(x33270F03, a3, a4, x0055FF33); + vxor(x66725A56, a1, x33270F03); + vxor(x00FFFF00, a5, a6); + vxor(x668DA556, x66725A56, x00FFFF00); - x55550000 = a1 & ~a6; - x00AA00FF = a5 & ~x55550000; - x33BB33FF = a2 | x00AA00FF; + vsel(x0F0F5A56, a4, x66725A56, a6); + vnot(xF0F0A5A9, x0F0F5A56); + vxor(xA5A5969A, x55553333, xF0F0A5A9); + vxor(xA55A699A, x00FFFF00, xA5A5969A); + vsel(x1, xA55A699A, x668DA556, a2); + vxor(*out2, *out2, x1); - x33CC0000 = x33CC33CC & ~a6; - x11441144 = a1 & x33CC33CC; - x11BB11BB = a5 ^ x11441144; - x003311BB = x11BB11BB & ~x33CC0000; + vxor(x0F5AF03C, a4, x0055FF33); + vsel(x6600FF56, x66725A56, a6, x00FFFF00); + vsel(x87A5F09C, xA5A5969A, x0F5AF03C, x6600FF56); - x00000F0F = a3 & a6; - x336600FF = x00AA00FF ^ x33CC0000; - x332200FF = x33BB33FF & x336600FF; - x332200F0 = x332200FF & ~x00000F0F; + vsel(xA55A963C, xA5A5969A, x0F5AF03C, a5); + vxor(x3C69C30F, a3, x0F5AF03C); + vsel(xB44BC32D, xA55A963C, x3C69C30F, a1); - x0302000F = a3 & x332200FF; - xAAAAAAAA = ~a1; - xA9A8AAA5 = x0302000F ^ xAAAAAAAA; - x33CCCC33 = a6 ^ x33CC33CC; - x33CCC030 = x33CCCC33 & ~x00000F0F; - x9A646A95 = xA9A8AAA5 ^ x33CCC030; - x10 = a4 & ~x332200F0; - x11 = x10 ^ x9A646A95; - *out2 ^= x11; + vsel(x66D7CC56, x66725A56, x668DA556, xA5A5969A); + vsel(x0F4B0F2D, a4, xB44BC32D, a5); + vxor(x699CC37B, x66D7CC56, x0F4B0F2D); + vxor(x996C66D2, xF0F0A5A9, x699CC37B); + vsel(x0, x996C66D2, xB44BC32D, a2); + vxor(*out1, *out1, x0); - x00333303 = a2 & ~x33CCC030; - x118822B8 = x11BB11BB ^ x00333303; - xA8208805 = xA9A8AAA5 & ~x118822B8; - x3CC3C33C = a3 ^ x33CCCC33; - x94E34B39 = xA8208805 ^ x3CC3C33C; - x00 = x33BB33FF & ~a4; - x01 = x00 ^ x94E34B39; - *out1 ^= x01; + vsel(xB46C662D, xB44BC32D, x996C66D2, x00FFFF00); + vsel(x278DB412, x668DA556, xA5A5969A, a1); + vsel(xB66CB43B, xB46C662D, x278DB412, x6600FF56); - x0331330C = x0302000F ^ x00333303; - x3FF3F33C = x3CC3C33C | x0331330C; - xA9DF596A = x33BB33FF ^ x9A646A95; - xA9DF5F6F = x00000F0F | xA9DF596A; - x962CAC53 = x3FF3F33C ^ xA9DF5F6F; + vsel(xD2DC4E52, x66D7CC56, x996C66D2, xB44BC32D); + vsel(x27993333, x278DB412, a3, x0055FF33); + vsel(xD2994E33, xD2DC4E52, x27993333, a5); + vsel(x3, x87A5F09C, xD2994E33, a2); + vxor(*out4, *out4, x3); - xA9466A6A = x332200FF ^ x9A646A95; - x3DA52153 = x94E34B39 ^ xA9466A6A; - x29850143 = xA9DF5F6F & x3DA52153; - x33C0330C = x33CC33CC & x3FF3F33C; - x1A45324F = x29850143 ^ x33C0330C; - x20 = x1A45324F | a4; - x21 = x20 ^ x962CAC53; - *out3 ^= x21; - - x0A451047 = x1A45324F & ~x118822B8; - xBBDFDD7B = x33CCCC33 | xA9DF596A; - xB19ACD3C = x0A451047 ^ xBBDFDD7B; - x30 = x003311BB | a4; - x31 = x30 ^ xB19ACD3C; - *out4 ^= x31; + vsel(x278D0F2D, x278DB412, x0F4B0F2D, a6); + vsel(x2E0E547B, x0F0F5A56, xB66CB43B, x278D0F2D); + vxor(x09976748, x27993333, x2E0E547B); + vsel(x2, xB66CB43B, x09976748, a2); + vxor(*out3, *out3, x2); } void s3 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x44444444, x0F0FF0F0, x4F4FF4F4, x00FFFF00, x00AAAA00, x4FE55EF4; - u32 x3C3CC3C3, x3C3C0000, x7373F4F4, x0C840A00; - u32 x00005EF4, x00FF5EFF, x00555455, x3C699796; - u32 x000FF000, x55AA55AA, x26D9A15E, x2FDFAF5F, x2FD00F5F; - u32 x55AAFFAA, x28410014, x000000FF, x000000CC, x284100D8; - u32 x204100D0, x3C3CC3FF, x1C3CC32F, x4969967A; - u32 x4CC44CC4, x40C040C0, xC3C33C3C, x9669C396, xD6A98356; - u32 xD6E9C3D6, x4CEEEEC4, x9A072D12, x001A000B, x9A1F2D1B; - u32 x00, x01, x10, x11, x20, x21, x30, x31; + u32 x0F330F33, x0F33F0CC, x5A66A599; + u32 x2111B7BB, x03FF3033, x05BB50EE, x074F201F, x265E97A4; + u32 x556BA09E, x665A93AC, x99A56C53; + u32 x25A1A797, x5713754C, x66559355, x47B135C6; + u32 x9A5A5C60, xD07AF8F8, x87698DB4, xE13C1EE1; + u32 x000CFFCF, x9A485CCE, x0521DDF4, x9E49915E; + u32 xD069F8B4, x030FF0C3, xD2699876; + u32 xD579DDF4, xD579F0C3, xB32C6396; + u32 x0, x1, x2, x3; - x44444444 = a1 & ~a2; - x0F0FF0F0 = a3 ^ a6; - x4F4FF4F4 = x44444444 | x0F0FF0F0; - x00FFFF00 = a4 ^ a6; - x00AAAA00 = x00FFFF00 & ~a1; - x4FE55EF4 = x4F4FF4F4 ^ x00AAAA00; + vsel(x0F330F33, a4, a3, a5); + vxor(x0F33F0CC, a6, x0F330F33); + vxor(x5A66A599, a2, x0F33F0CC); - x3C3CC3C3 = a2 ^ x0F0FF0F0; - x3C3C0000 = x3C3CC3C3 & ~a6; - x7373F4F4 = x4F4FF4F4 ^ x3C3C0000; - x0C840A00 = x4FE55EF4 & ~x7373F4F4; + vsel(x2111B7BB, a3, a6, x5A66A599); + vsel(x03FF3033, a5, a3, x0F33F0CC); + vsel(x05BB50EE, a5, x0F33F0CC, a2); + vsel(x074F201F, x03FF3033, a4, x05BB50EE); + vxor(x265E97A4, x2111B7BB, x074F201F); - x00005EF4 = a6 & x4FE55EF4; - x00FF5EFF = a4 | x00005EF4; - x00555455 = a1 & x00FF5EFF; - x3C699796 = x3C3CC3C3 ^ x00555455; - x30 = x4FE55EF4 & ~a5; - x31 = x30 ^ x3C699796; - *out4 ^= x31; + vsel(x556BA09E, x5A66A599, x05BB50EE, a4); + vsel(x665A93AC, x556BA09E, x265E97A4, a3); + vnot(x99A56C53, x665A93AC); + vsel(x1, x265E97A4, x99A56C53, a1); + vxor(*out2, *out2, x1); - x000FF000 = x0F0FF0F0 & x00FFFF00; - x55AA55AA = a1 ^ a4; - x26D9A15E = x7373F4F4 ^ x55AA55AA; - x2FDFAF5F = a3 | x26D9A15E; - x2FD00F5F = x2FDFAF5F & ~x000FF000; + vxor(x25A1A797, x03FF3033, x265E97A4); + vsel(x5713754C, a2, x0F33F0CC, x074F201F); + vsel(x66559355, x665A93AC, a2, a5); + vsel(x47B135C6, x25A1A797, x5713754C, x66559355); - x55AAFFAA = x00AAAA00 | x55AA55AA; - x28410014 = x3C699796 & ~x55AAFFAA; - x000000FF = a4 & a6; - x000000CC = x000000FF & ~a2; - x284100D8 = x28410014 ^ x000000CC; + vxor(x9A5A5C60, x03FF3033, x99A56C53); + vsel(xD07AF8F8, x9A5A5C60, x556BA09E, x5A66A599); + vxor(x87698DB4, x5713754C, xD07AF8F8); + vxor(xE13C1EE1, x66559355, x87698DB4); - x204100D0 = x7373F4F4 & x284100D8; - x3C3CC3FF = x3C3CC3C3 | x000000FF; - x1C3CC32F = x3C3CC3FF & ~x204100D0; - x4969967A = a1 ^ x1C3CC32F; - x10 = x2FD00F5F & a5; - x11 = x10 ^ x4969967A; - *out2 ^= x11; + vsel(x000CFFCF, a4, a6, x0F33F0CC); + vsel(x9A485CCE, x9A5A5C60, x000CFFCF, x05BB50EE); + vsel(x0521DDF4, x87698DB4, a6, x9A5A5C60); + vsel(x9E49915E, x9A485CCE, x66559355, x0521DDF4); + vsel(x0, x9E49915E, xE13C1EE1, a1); + vxor(*out1, *out1, x0); - x4CC44CC4 = x4FE55EF4 & ~a2; - x40C040C0 = x4CC44CC4 & ~a3; - xC3C33C3C = ~x3C3CC3C3; - x9669C396 = x55AAFFAA ^ xC3C33C3C; - xD6A98356 = x40C040C0 ^ x9669C396; - x00 = a5 & ~x0C840A00; - x01 = x00 ^ xD6A98356; - *out1 ^= x01; + vsel(xD069F8B4, xD07AF8F8, x87698DB4, a5); + vsel(x030FF0C3, x000CFFCF, x03FF3033, a4); + vsel(xD2699876, xD069F8B4, x9E49915E, x030FF0C3); + vsel(x3, x5A66A599, xD2699876, a1); + vxor(*out4, *out4, x3); - xD6E9C3D6 = x40C040C0 | x9669C396; - x4CEEEEC4 = x00AAAA00 | x4CC44CC4; - x9A072D12 = xD6E9C3D6 ^ x4CEEEEC4; - x001A000B = a4 & ~x4FE55EF4; - x9A1F2D1B = x9A072D12 | x001A000B; - x20 = a5 & ~x284100D8; - x21 = x20 ^ x9A1F2D1B; - *out3 ^= x21; + vsel(xD579DDF4, xD07AF8F8, a2, x5713754C); + vsel(xD579F0C3, xD579DDF4, x030FF0C3, a6); + vxor(xB32C6396, x66559355, xD579F0C3); + vsel(x2, xB32C6396, x47B135C6, a1); + vxor(*out3, *out3, x2); } void s4 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x5A5A5A5A, x0F0FF0F0; - u32 x33FF33FF, x33FFCC00, x0C0030F0, x0C0CC0C0, x0CF3C03F, x5EFBDA7F, - x52FBCA0F, x61C8F93C; - u32 x00C0C03C, x0F0F30C0, x3B92A366, x30908326, x3C90B3D6; - u32 x33CC33CC, x0C0CFFFF, x379E5C99, x04124C11, x56E9861E, xA91679E1; - u32 x9586CA37, x8402C833, x84C2C83F, xB35C94A6; - u32 x00, x01, x10, x11, x20, x21, x30, x31; + u32 x0505AFAF, x0555AF55, x0A5AA05A, x46566456, x0A0A5F5F, x0AF55FA0, + x0AF50F0F, x4CA36B59; + u32 xB35C94A6; + u32 x01BB23BB, x5050FAFA, xA31C26BE, xA91679E1; + u32 x56E9861E; + u32 x50E9FA1E, x0AF55F00, x827D9784, xD2946D9A; + u32 x31F720B3, x11FB21B3, x4712A7AD, x9586CA37; + u32 x0, x1, x2, x3; - x5A5A5A5A = a1 ^ a3; - x0F0FF0F0 = a3 ^ a5; - x33FF33FF = a2 | a4; - x33FFCC00 = a5 ^ x33FF33FF; - x0C0030F0 = x0F0FF0F0 & ~x33FFCC00; - x0C0CC0C0 = x0F0FF0F0 & ~a2; - x0CF3C03F = a4 ^ x0C0CC0C0; - x5EFBDA7F = x5A5A5A5A | x0CF3C03F; - x52FBCA0F = x5EFBDA7F & ~x0C0030F0; - x61C8F93C = a2 ^ x52FBCA0F; + vsel(x0505AFAF, a5, a3, a1); + vsel(x0555AF55, x0505AFAF, a1, a4); + vxor(x0A5AA05A, a3, x0555AF55); + vsel(x46566456, a1, x0A5AA05A, a2); + vsel(x0A0A5F5F, a3, a5, a1); + vxor(x0AF55FA0, a4, x0A0A5F5F); + vsel(x0AF50F0F, x0AF55FA0, a3, a5); + vxor(x4CA36B59, x46566456, x0AF50F0F); - x00C0C03C = x0CF3C03F & x61C8F93C; - x0F0F30C0 = x0F0FF0F0 & ~x00C0C03C; - x3B92A366 = x5A5A5A5A ^ x61C8F93C; - x30908326 = x3B92A366 & ~x0F0F30C0; - x3C90B3D6 = x0C0030F0 ^ x30908326; + vnot(xB35C94A6, x4CA36B59); - x33CC33CC = a2 ^ a4; - x0C0CFFFF = a5 | x0C0CC0C0; - x379E5C99 = x3B92A366 ^ x0C0CFFFF; - x04124C11 = x379E5C99 & ~x33CC33CC; - x56E9861E = x52FBCA0F ^ x04124C11; - x00 = a6 & ~x3C90B3D6; - x01 = x00 ^ x56E9861E; - *out1 ^= x01; + vsel(x01BB23BB, a4, a2, x0555AF55); + vxor(x5050FAFA, a1, x0505AFAF); + vsel(xA31C26BE, xB35C94A6, x01BB23BB, x5050FAFA); + vxor(xA91679E1, x0A0A5F5F, xA31C26BE); - xA91679E1 = ~x56E9861E; - x10 = x3C90B3D6 & ~a6; - x11 = x10 ^ xA91679E1; - *out2 ^= x11; + vnot(x56E9861E, xA91679E1); - x9586CA37 = x3C90B3D6 ^ xA91679E1; - x8402C833 = x9586CA37 & ~x33CC33CC; - x84C2C83F = x00C0C03C | x8402C833; - xB35C94A6 = x379E5C99 ^ x84C2C83F; - x20 = x61C8F93C | a6; - x21 = x20 ^ xB35C94A6; - *out3 ^= x21; + vsel(x50E9FA1E, x5050FAFA, x56E9861E, a4); + vsel(x0AF55F00, x0AF50F0F, x0AF55FA0, x0A0A5F5F); + vsel(x827D9784, xB35C94A6, x0AF55F00, a2); + vxor(xD2946D9A, x50E9FA1E, x827D9784); + vsel(x2, xD2946D9A, x4CA36B59, a6); + vxor(*out3, *out3, x2); + vsel(x3, xB35C94A6, xD2946D9A, a6); + vxor(*out4, *out4, x3); - x30 = a6 & x61C8F93C; - x31 = x30 ^ xB35C94A6; - *out4 ^= x31; + vsel(x31F720B3, a2, a4, x0AF55FA0); + vsel(x11FB21B3, x01BB23BB, x31F720B3, x5050FAFA); + vxor(x4712A7AD, x56E9861E, x11FB21B3); + vxor(x9586CA37, xD2946D9A, x4712A7AD); + vsel(x0, x56E9861E, x9586CA37, a6); + vxor(*out1, *out1, x0); + vsel(x1, x9586CA37, xA91679E1, a6); + vxor(*out2, *out2, x1); } void s5 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x77777777, x77770000, x22225555, x11116666, x1F1F6F6F; - u32 x70700000, x43433333, x00430033, x55557777, x55167744, x5A19784B; - u32 x5A1987B4, x7A3BD7F5, x003B00F5, x221955A0, x05050707, x271C52A7; - u32 x2A2A82A0, x6969B193, x1FE06F90, x16804E00, xE97FB1FF; - u32 x43403302, x35CAED30, x37DEFFB7, x349ECCB5, x0B01234A; - u32 x101884B4, x0FF8EB24, x41413333, x4FF9FB37, x4FC2FBC2; - u32 x22222222, x16BCEE97, x0F080B04, x19B4E593; - u32 x5C5C5C5C, x4448184C, x2DDABE71, x6992A63D; - u32 x00, x01, x10, x11, x20, x21, x30, x31; + u32 x550F550F, xAAF0AAF0, xA5F5A5F5, x96C696C6, x00FFFF00, x963969C6; + u32 x2E3C2E3C, xB73121F7, x1501DF0F, x00558A5F, x2E69A463; + u32 x0679ED42, x045157FD, xB32077FF, x9D49D39C; + u32 xAC81CFB2, xF72577AF, x5BA4B81D; + u32 x5BA477AF, x4895469F, x3A35273A, x1A35669A; + u32 x12E6283D, x9E47D3D4, x1A676AB4; + u32 x891556DF, xE5E77F82, x6CF2295D; + u32 x2E3CA5F5, x9697C1C6, x369CC1D6; + u32 x0, x1, x2, x3; - x77777777 = a1 | a3; - x77770000 = x77777777 & ~a6; - x22225555 = a1 ^ x77770000; - x11116666 = a3 ^ x22225555; - x1F1F6F6F = a4 | x11116666; + vsel(x550F550F, a1, a3, a5); + vnot(xAAF0AAF0, x550F550F); + vsel(xA5F5A5F5, xAAF0AAF0, a1, a3); + vxor(x96C696C6, a2, xA5F5A5F5); + vxor(x00FFFF00, a5, a6); + vxor(x963969C6, x96C696C6, x00FFFF00); - x70700000 = x77770000 & ~a4; - x43433333 = a3 ^ x70700000; - x00430033 = a5 & x43433333; - x55557777 = a1 | x11116666; - x55167744 = x00430033 ^ x55557777; - x5A19784B = a4 ^ x55167744; + vsel(x2E3C2E3C, a3, xAAF0AAF0, a2); + vsel(xB73121F7, a2, x963969C6, x96C696C6); + vsel(x1501DF0F, a6, x550F550F, xB73121F7); + vsel(x00558A5F, x1501DF0F, a5, a1); + vxor(x2E69A463, x2E3C2E3C, x00558A5F); - x5A1987B4 = a6 ^ x5A19784B; - x7A3BD7F5 = x22225555 | x5A1987B4; - x003B00F5 = a5 & x7A3BD7F5; - x221955A0 = x22225555 ^ x003B00F5; - x05050707 = a4 & x55557777; - x271C52A7 = x221955A0 ^ x05050707; + vsel(x0679ED42, x00FFFF00, x2E69A463, x96C696C6); + vsel(x045157FD, a6, a1, x0679ED42); + vsel(xB32077FF, xB73121F7, a6, x045157FD); + vxor(x9D49D39C, x2E69A463, xB32077FF); + vsel(x2, x9D49D39C, x2E69A463, a4); + vxor(*out3, *out3, x2); - x2A2A82A0 = x7A3BD7F5 & ~a1; - x6969B193 = x43433333 ^ x2A2A82A0; - x1FE06F90 = a5 ^ x1F1F6F6F; - x16804E00 = x1FE06F90 & ~x6969B193; - xE97FB1FF = ~x16804E00; - x20 = xE97FB1FF & ~a2; - x21 = x20 ^ x5A19784B; - *out3 ^= x21; + vsel(xAC81CFB2, xAAF0AAF0, x1501DF0F, x0679ED42); + vsel(xF72577AF, xB32077FF, x550F550F, a1); + vxor(x5BA4B81D, xAC81CFB2, xF72577AF); + vsel(x1, x5BA4B81D, x963969C6, a4); + vxor(*out2, *out2, x1); - x43403302 = x43433333 & ~x003B00F5; - x35CAED30 = x2A2A82A0 ^ x1FE06F90; - x37DEFFB7 = x271C52A7 | x35CAED30; - x349ECCB5 = x37DEFFB7 & ~x43403302; - x0B01234A = x1F1F6F6F & ~x349ECCB5; + vsel(x5BA477AF, x5BA4B81D, xF72577AF, a6); + vsel(x4895469F, x5BA477AF, x00558A5F, a2); + vsel(x3A35273A, x2E3C2E3C, a2, x963969C6); + vsel(x1A35669A, x4895469F, x3A35273A, x5BA4B81D); - x101884B4 = x5A1987B4 & x349ECCB5; - x0FF8EB24 = x1FE06F90 ^ x101884B4; - x41413333 = x43433333 & x55557777; - x4FF9FB37 = x0FF8EB24 | x41413333; - x4FC2FBC2 = x003B00F5 ^ x4FF9FB37; - x30 = x4FC2FBC2 & a2; - x31 = x30 ^ x271C52A7; - *out4 ^= x31; + vsel(x12E6283D, a5, x5BA4B81D, x963969C6); + vsel(x9E47D3D4, x96C696C6, x9D49D39C, xAC81CFB2); + vsel(x1A676AB4, x12E6283D, x9E47D3D4, x4895469F); - x22222222 = a1 ^ x77777777; - x16BCEE97 = x349ECCB5 ^ x22222222; - x0F080B04 = a4 & x0FF8EB24; - x19B4E593 = x16BCEE97 ^ x0F080B04; - x00 = x0B01234A | a2; - x01 = x00 ^ x19B4E593; - *out1 ^= x01; + vsel(x891556DF, xB32077FF, x4895469F, x3A35273A); + vsel(xE5E77F82, xF72577AF, x00FFFF00, x12E6283D); + vxor(x6CF2295D, x891556DF, xE5E77F82); + vsel(x3, x1A35669A, x6CF2295D, a4); + vxor(*out4, *out4, x3); - x5C5C5C5C = x1F1F6F6F ^ x43433333; - x4448184C = x5C5C5C5C & ~x19B4E593; - x2DDABE71 = x22225555 ^ x0FF8EB24; - x6992A63D = x4448184C ^ x2DDABE71; - x10 = x1F1F6F6F & a2; - x11 = x10 ^ x6992A63D; - *out2 ^= x11; + vsel(x2E3CA5F5, x2E3C2E3C, xA5F5A5F5, a6); + vsel(x9697C1C6, x96C696C6, x963969C6, x045157FD); + vsel(x369CC1D6, x2E3CA5F5, x9697C1C6, x5BA477AF); + vsel(x0, x369CC1D6, x1A676AB4, a4); + vxor(*out1, *out1, x0); } void s6 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x33CC33CC; - u32 x3333FFFF, x11115555, x22DD6699, x22DD9966, x00220099; - u32 x00551144, x33662277, x5A5A5A5A, x7B7E7A7F, x59A31CE6; - u32 x09030C06, x09030000, x336622FF, x3A6522FF; - u32 x484D494C, x0000B6B3, x0F0FB9BC, x00FC00F9, x0FFFB9FD; - u32 x5DF75DF7, x116600F7, x1E69B94B, x1668B94B; - u32 x7B7B7B7B, x411E5984, x1FFFFDFD, x5EE1A479; - u32 x3CB4DFD2, x004B002D, xB7B2B6B3, xCCC9CDC8, xCC82CDE5; - u32 x0055EEBB, x5A5AECE9, x0050ECA9, xC5CAC1CE, xC59A2D67; - u32 x00, x01, x10, x11, x20, x21, x30, x31; + u32 x555500FF, x666633CC, x606F30CF, x353A659A, x353A9A65, xCAC5659A; + u32 x353A6565, x0A3F0A6F, x6C5939A3, x5963A3C6; + u32 x35FF659A, x3AF06A95, x05CF0A9F, x16E94A97; + u32 x86CD4C9B, x12E0FFFD, x942D9A67; + u32 x142956AB, x455D45DF, x1C3EE619; + u32 x2AEA70D5, x20CF7A9F, x3CF19C86, x69A49C79; + u32 x840DBB67, x6DA19C1E, x925E63E1; + u32 x9C3CA761, x257A75D5, xB946D2B4; + u32 x0, x1, x2, x3; - x33CC33CC = a2 ^ a5; + vsel(x555500FF, a1, a4, a5); + vxor(x666633CC, a2, x555500FF); + vsel(x606F30CF, x666633CC, a4, a3); + vxor(x353A659A, a1, x606F30CF); + vxor(x353A9A65, a5, x353A659A); + vnot(xCAC5659A, x353A9A65); - x3333FFFF = a2 | a6; - x11115555 = a1 & x3333FFFF; - x22DD6699 = x33CC33CC ^ x11115555; - x22DD9966 = a6 ^ x22DD6699; - x00220099 = a5 & ~x22DD9966; + vsel(x353A6565, x353A659A, x353A9A65, a4); + vsel(x0A3F0A6F, a3, a4, x353A6565); + vxor(x6C5939A3, x666633CC, x0A3F0A6F); + vxor(x5963A3C6, x353A9A65, x6C5939A3); - x00551144 = a1 & x22DD9966; - x33662277 = a2 ^ x00551144; - x5A5A5A5A = a1 ^ a3; - x7B7E7A7F = x33662277 | x5A5A5A5A; - x59A31CE6 = x22DD6699 ^ x7B7E7A7F; + vsel(x35FF659A, a4, x353A659A, x353A6565); + vxor(x3AF06A95, a3, x35FF659A); + vsel(x05CF0A9F, a4, a3, x353A9A65); + vsel(x16E94A97, x3AF06A95, x05CF0A9F, x6C5939A3); - x09030C06 = a3 & x59A31CE6; - x09030000 = x09030C06 & ~a6; - x336622FF = x00220099 | x33662277; - x3A6522FF = x09030000 ^ x336622FF; - x30 = x3A6522FF & a4; - x31 = x30 ^ x59A31CE6; - *out4 ^= x31; + vsel(x86CD4C9B, xCAC5659A, x05CF0A9F, x6C5939A3); + vsel(x12E0FFFD, a5, x3AF06A95, x16E94A97); + vsel(x942D9A67, x86CD4C9B, x353A9A65, x12E0FFFD); + vsel(x0, xCAC5659A, x942D9A67, a6); + vxor(*out1, *out1, x0); - x484D494C = a2 ^ x7B7E7A7F; - x0000B6B3 = a6 & ~x484D494C; - x0F0FB9BC = a3 ^ x0000B6B3; - x00FC00F9 = a5 & ~x09030C06; - x0FFFB9FD = x0F0FB9BC | x00FC00F9; + vsel(x142956AB, x353A659A, x942D9A67, a2); + vsel(x455D45DF, a1, x86CD4C9B, x142956AB); + vxor(x1C3EE619, x5963A3C6, x455D45DF); + vsel(x3, x5963A3C6, x1C3EE619, a6); + vxor(*out4, *out4, x3); - x5DF75DF7 = a1 | x59A31CE6; - x116600F7 = x336622FF & x5DF75DF7; - x1E69B94B = x0F0FB9BC ^ x116600F7; - x1668B94B = x1E69B94B & ~x09030000; - x20 = x00220099 | a4; - x21 = x20 ^ x1668B94B; - *out3 ^= x21; + vsel(x2AEA70D5, x3AF06A95, x606F30CF, x353A9A65); + vsel(x20CF7A9F, x2AEA70D5, x05CF0A9F, x0A3F0A6F); + vxor(x3CF19C86, x1C3EE619, x20CF7A9F); + vxor(x69A49C79, x555500FF, x3CF19C86); - x7B7B7B7B = a2 | x5A5A5A5A; - x411E5984 = x3A6522FF ^ x7B7B7B7B; - x1FFFFDFD = x11115555 | x0FFFB9FD; - x5EE1A479 = x411E5984 ^ x1FFFFDFD; + vsel(x840DBB67, a5, x942D9A67, x86CD4C9B); + vsel(x6DA19C1E, x69A49C79, x3CF19C86, x840DBB67); + vnot(x925E63E1, x6DA19C1E); + vsel(x1, x925E63E1, x69A49C79, a6); + vxor(*out2, *out2, x1); - x3CB4DFD2 = x22DD6699 ^ x1E69B94B; - x004B002D = a5 & ~x3CB4DFD2; - xB7B2B6B3 = ~x484D494C; - xCCC9CDC8 = x7B7B7B7B ^ xB7B2B6B3; - xCC82CDE5 = x004B002D ^ xCCC9CDC8; - x10 = xCC82CDE5 & ~a4; - x11 = x10 ^ x5EE1A479; - *out2 ^= x11; - - x0055EEBB = a6 ^ x00551144; - x5A5AECE9 = a1 ^ x0F0FB9BC; - x0050ECA9 = x0055EEBB & x5A5AECE9; - xC5CAC1CE = x09030C06 ^ xCCC9CDC8; - xC59A2D67 = x0050ECA9 ^ xC5CAC1CE; - x00 = x0FFFB9FD & ~a4; - x01 = x00 ^ xC59A2D67; - *out1 ^= x01; + vsel(x9C3CA761, x840DBB67, x1C3EE619, x3CF19C86); + vsel(x257A75D5, x455D45DF, x2AEA70D5, x606F30CF); + vxor(xB946D2B4, x9C3CA761, x257A75D5); + vsel(x2, x16E94A97, xB946D2B4, a6); + vxor(*out3, *out3, x2); } void s7 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x0FF00FF0, x3CC33CC3, x00003CC3, x0F000F00, x5A555A55, x00001841; - u32 x00000F00, x33333C33, x7B777E77, x0FF0F00F, x74878E78; - u32 x003C003C, x5A7D5A7D, x333300F0, x694E5A8D; - u32 x0FF0CCCC, x000F0303, x5A505854, x33CC000F, x699C585B; - u32 x7F878F78, x21101013, x7F979F7B, x30030CC0, x4F9493BB; - u32 x6F9CDBFB, x0000DBFB, x00005151, x26DAC936, x26DA9867; - u32 x27DA9877, x27DA438C, x2625C9C9, x27FFCBCD; - u32 x27FF1036, x27FF103E, xB06B6C44, x97947C7A; - u32 x00, x01, x10, x11, x20, x21, x30, x31; + u32 x44447777, x4B4B7878, x22772277, x0505F5F5, x220522F5, x694E5A8D; + u32 x00FFFF00, x66666666, x32353235, x26253636, x26DAC936; + u32 x738F9C63, x11EF9867, x26DA9867; + u32 x4B4B9C63, x4B666663, x4E639396; + u32 x4E4B393C, xFF00FF00, xFF05DD21, xB14EE41D; + u32 xD728827B, x6698807B, x699C585B; + u32 x778A8877, xA4A71E18, x74878E78; + u32 x204A5845, x74879639, x8B7869C6; + u32 x0, x1, x2, x3; - x0FF00FF0 = a4 ^ a5; - x3CC33CC3 = a3 ^ x0FF00FF0; - x00003CC3 = a6 & x3CC33CC3; - x0F000F00 = a4 & x0FF00FF0; - x5A555A55 = a2 ^ x0F000F00; - x00001841 = x00003CC3 & x5A555A55; + vsel(x44447777, a2, a6, a3); + vxor(x4B4B7878, a4, x44447777); + vsel(x22772277, a3, a5, a2); + vsel(x0505F5F5, a6, a2, a4); + vsel(x220522F5, x22772277, x0505F5F5, a5); + vxor(x694E5A8D, x4B4B7878, x220522F5); - x00000F00 = a6 & x0F000F00; - x33333C33 = a3 ^ x00000F00; - x7B777E77 = x5A555A55 | x33333C33; - x0FF0F00F = a6 ^ x0FF00FF0; - x74878E78 = x7B777E77 ^ x0FF0F00F; - x30 = a1 & ~x00001841; - x31 = x30 ^ x74878E78; - *out4 ^= x31; + vxor(x00FFFF00, a5, a6); + vxor(x66666666, a2, a3); + vsel(x32353235, a3, x220522F5, a4); + vsel(x26253636, x66666666, x32353235, x4B4B7878); + vxor(x26DAC936, x00FFFF00, x26253636); + vsel(x0, x26DAC936, x694E5A8D, a1); + vxor(*out1, *out1, x0); - x003C003C = a5 & ~x3CC33CC3; - x5A7D5A7D = x5A555A55 | x003C003C; - x333300F0 = x00003CC3 ^ x33333C33; - x694E5A8D = x5A7D5A7D ^ x333300F0; + vxor(x738F9C63, a2, x26DAC936); + vsel(x11EF9867, x738F9C63, a5, x66666666); + vsel(x26DA9867, x26DAC936, x11EF9867, a6); - x0FF0CCCC = x00003CC3 ^ x0FF0F00F; - x000F0303 = a4 & ~x0FF0CCCC; - x5A505854 = x5A555A55 & ~x000F0303; - x33CC000F = a5 ^ x333300F0; - x699C585B = x5A505854 ^ x33CC000F; + vsel(x4B4B9C63, x4B4B7878, x738F9C63, a6); + vsel(x4B666663, x4B4B9C63, x66666666, x00FFFF00); + vxor(x4E639396, x0505F5F5, x4B666663); - x7F878F78 = x0F000F00 | x74878E78; - x21101013 = a3 & x699C585B; - x7F979F7B = x7F878F78 | x21101013; - x30030CC0 = x3CC33CC3 & ~x0FF0F00F; - x4F9493BB = x7F979F7B ^ x30030CC0; - x00 = x4F9493BB & ~a1; - x01 = x00 ^ x694E5A8D; - *out1 ^= x01; + vsel(x4E4B393C, x4B4B7878, x4E639396, a2); + vnot(xFF00FF00, a5); + vsel(xFF05DD21, xFF00FF00, x738F9C63, x32353235); + vxor(xB14EE41D, x4E4B393C, xFF05DD21); + vsel(x1, xB14EE41D, x26DA9867, a1); + vxor(*out2, *out2, x1); - x6F9CDBFB = x699C585B | x4F9493BB; - x0000DBFB = a6 & x6F9CDBFB; - x00005151 = a2 & x0000DBFB; - x26DAC936 = x694E5A8D ^ x4F9493BB; - x26DA9867 = x00005151 ^ x26DAC936; + vxor(xD728827B, x66666666, xB14EE41D); + vsel(x6698807B, x26DA9867, xD728827B, x4E4B393C); + vsel(x699C585B, x6698807B, x694E5A8D, xFF05DD21); + vsel(x2, x699C585B, x4E639396, a1); + vxor(*out3, *out3, x2); - x27DA9877 = x21101013 | x26DA9867; - x27DA438C = x0000DBFB ^ x27DA9877; - x2625C9C9 = a5 ^ x26DAC936; - x27FFCBCD = x27DA438C | x2625C9C9; - x20 = x27FFCBCD & a1; - x21 = x20 ^ x699C585B; - *out3 ^= x21; + vsel(x778A8877, x738F9C63, x26DAC936, x26253636); + vxor(xA4A71E18, x738F9C63, xD728827B); + vsel(x74878E78, x778A8877, xA4A71E18, a4); - x27FF1036 = x0000DBFB ^ x27FFCBCD; - x27FF103E = x003C003C | x27FF1036; - xB06B6C44 = ~x4F9493BB; - x97947C7A = x27FF103E ^ xB06B6C44; - x10 = x97947C7A & ~a1; - x11 = x10 ^ x26DA9867; - *out2 ^= x11; + vsel(x204A5845, x26DA9867, x694E5A8D, x26DAC936); + vsel(x74879639, x74878E78, a3, x204A5845); + vnot(x8B7869C6, x74879639); + vsel(x3, x74878E78, x8B7869C6, a1); + vxor(*out4, *out4, x3); } void s8 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x0C0C0C0C, x0000F0F0, x00FFF00F, x00555005, x00515001; - u32 x33000330, x77555775, x30303030, x3030CFCF, x30104745, x30555745; - u32 xFF000FF0, xCF1048B5, x080A080A, xC71A40BF, xCB164CB3; - u32 x9E4319E6, x000019E6, xF429738C, xF4296A6A, xC729695A; - u32 xC47C3D2F, xF77F3F3F, x9E43E619, x693CD926; - u32 xF719A695, xF4FF73FF, x03E6D56A, x56B3803F; - u32 xF700A600, x61008000, x03B7856B, x62B7056B; - u32 x00, x01, x10, x11, x20, x21, x30, x31; + u32 x0505F5F5, x05FAF50A, x0F0F00FF, x22227777, x07DA807F, x34E9B34C; + u32 x00FFF00F, x0033FCCF, x5565B15C, x0C0C3F3F, x59698E63; + u32 x3001F74E, x30555745, x693CD926; + u32 x0C0CD926, x0C3F25E9, x38D696A5; + u32 xC729695A; + u32 x03D2117B, xC778395B, xCB471CB2; + u32 x5425B13F, x56B3803F, x919AE965; + u32 x17B3023F, x75555755, x62E6556A, xA59E6C31; + u32 x0, x1, x2, x3; - x0C0C0C0C = a3 & ~a2; - x0000F0F0 = a5 & ~a3; - x00FFF00F = a4 ^ x0000F0F0; - x00555005 = a1 & x00FFF00F; - x00515001 = x00555005 & ~x0C0C0C0C; + vsel(x0505F5F5, a5, a1, a3); + vxor(x05FAF50A, a4, x0505F5F5); + vsel(x0F0F00FF, a3, a4, a5); + vsel(x22227777, a2, a5, a1); + vsel(x07DA807F, x05FAF50A, x0F0F00FF, x22227777); + vxor(x34E9B34C, a2, x07DA807F); - x33000330 = a2 & ~x00FFF00F; - x77555775 = a1 | x33000330; - x30303030 = a2 & ~a3; - x3030CFCF = a5 ^ x30303030; - x30104745 = x77555775 & x3030CFCF; - x30555745 = x00555005 | x30104745; + vsel(x00FFF00F, x05FAF50A, a4, a3); + vsel(x0033FCCF, a5, x00FFF00F, a2); + vsel(x5565B15C, a1, x34E9B34C, x0033FCCF); + vsel(x0C0C3F3F, a3, a5, a2); + vxor(x59698E63, x5565B15C, x0C0C3F3F); - xFF000FF0 = ~x00FFF00F; - xCF1048B5 = x30104745 ^ xFF000FF0; - x080A080A = a3 & ~x77555775; - xC71A40BF = xCF1048B5 ^ x080A080A; - xCB164CB3 = x0C0C0C0C ^ xC71A40BF; - x10 = x00515001 | a6; - x11 = x10 ^ xCB164CB3; - *out2 ^= x11; + vsel(x3001F74E, x34E9B34C, a5, x05FAF50A); + vsel(x30555745, x3001F74E, a1, x00FFF00F); + vxor(x693CD926, x59698E63, x30555745); + vsel(x2, x693CD926, x59698E63, a6); + vxor(*out3, *out3, x2); - x9E4319E6 = a1 ^ xCB164CB3; - x000019E6 = a5 & x9E4319E6; - xF429738C = a2 ^ xC71A40BF; - xF4296A6A = x000019E6 ^ xF429738C; - xC729695A = x33000330 ^ xF4296A6A; + vsel(x0C0CD926, x0C0C3F3F, x693CD926, a5); + vxor(x0C3F25E9, x0033FCCF, x0C0CD926); + vxor(x38D696A5, x34E9B34C, x0C3F25E9); - xC47C3D2F = x30555745 ^ xF4296A6A; - xF77F3F3F = a2 | xC47C3D2F; - x9E43E619 = a5 ^ x9E4319E6; - x693CD926 = xF77F3F3F ^ x9E43E619; - x20 = x30555745 & a6; - x21 = x20 ^ x693CD926; - *out3 ^= x21; + vnot(xC729695A, x38D696A5); - xF719A695 = x3030CFCF ^ xC729695A; - xF4FF73FF = a4 | xF429738C; - x03E6D56A = xF719A695 ^ xF4FF73FF; - x56B3803F = a1 ^ x03E6D56A; - x30 = x56B3803F & a6; - x31 = x30 ^ xC729695A; - *out4 ^= x31; + vsel(x03D2117B, x07DA807F, a2, x0C0CD926); + vsel(xC778395B, xC729695A, x03D2117B, x30555745); + vxor(xCB471CB2, x0C3F25E9, xC778395B); + vsel(x1, xCB471CB2, x34E9B34C, a6); + vxor(*out2, *out2, x1); - xF700A600 = xF719A695 & ~a4; - x61008000 = x693CD926 & xF700A600; - x03B7856B = x00515001 ^ x03E6D56A; - x62B7056B = x61008000 ^ x03B7856B; - x00 = x62B7056B | a6; - x01 = x00 ^ xC729695A; - *out1 ^= x01; + vsel(x5425B13F, x5565B15C, x0C0C3F3F, x03D2117B); + vsel(x56B3803F, x07DA807F, x5425B13F, x59698E63); + vxor(x919AE965, xC729695A, x56B3803F); + vsel(x3, xC729695A, x919AE965, a6); + vxor(*out4, *out4, x3); + + vsel(x17B3023F, x07DA807F, a2, x59698E63); + vor(x75555755, a1, x30555745); + vxor(x62E6556A, x17B3023F, x75555755); + vxor(xA59E6C31, xC778395B, x62E6556A); + vsel(x0, xA59E6C31, x38D696A5, a6); + vxor(*out1, *out1, x0); } #endif