Use different code for mod_512() to help keep some NVIDIA GPUs from hanging

pull/2246/head
Jens Steube 5 years ago
parent 53254b45aa
commit 75b4164498

@ -369,59 +369,55 @@ DECLSPEC void mod_512 (u32 n[16])
while (a[0] >= b[0])
{
const u32 l1 = (a[ 0] < b[ 0]) << 0
| (a[ 1] < b[ 1]) << 1
| (a[ 2] < b[ 2]) << 2
| (a[ 3] < b[ 3]) << 3
| (a[ 4] < b[ 4]) << 4
| (a[ 5] < b[ 5]) << 5
| (a[ 6] < b[ 6]) << 6
| (a[ 7] < b[ 7]) << 7
| (a[ 8] < b[ 8]) << 8
| (a[ 9] < b[ 9]) << 9
| (a[10] < b[10]) << 10
| (a[11] < b[11]) << 11
| (a[12] < b[12]) << 12
| (a[13] < b[13]) << 13
| (a[14] < b[14]) << 14
| (a[15] < b[15]) << 15;
const u32 e1 = (a[ 0] == b[ 0]) << 0
| (a[ 1] == b[ 1]) << 1
| (a[ 2] == b[ 2]) << 2
| (a[ 3] == b[ 3]) << 3
| (a[ 4] == b[ 4]) << 4
| (a[ 5] == b[ 5]) << 5
| (a[ 6] == b[ 6]) << 6
| (a[ 7] == b[ 7]) << 7
| (a[ 8] == b[ 8]) << 8
| (a[ 9] == b[ 9]) << 9
| (a[10] == b[10]) << 10
| (a[11] == b[11]) << 11
| (a[12] == b[12]) << 12
| (a[13] == b[13]) << 13
| (a[14] == b[14]) << 14
| (a[15] == b[15]) << 15;
if (l1)
{
if (l1 & 0x0001) break;
if (l1 & 0x0002) if ((e1 & 0x0001) == 0x0001) break;
if (l1 & 0x0004) if ((e1 & 0x0003) == 0x0003) break;
if (l1 & 0x0008) if ((e1 & 0x0007) == 0x0007) break;
if (l1 & 0x0010) if ((e1 & 0x000f) == 0x000f) break;
if (l1 & 0x0020) if ((e1 & 0x001f) == 0x001f) break;
if (l1 & 0x0040) if ((e1 & 0x003f) == 0x003f) break;
if (l1 & 0x0080) if ((e1 & 0x007f) == 0x007f) break;
if (l1 & 0x0100) if ((e1 & 0x00ff) == 0x00ff) break;
if (l1 & 0x0200) if ((e1 & 0x01ff) == 0x01ff) break;
if (l1 & 0x0400) if ((e1 & 0x03ff) == 0x03ff) break;
if (l1 & 0x0800) if ((e1 & 0x07ff) == 0x07ff) break;
if (l1 & 0x1000) if ((e1 & 0x0fff) == 0x0fff) break;
if (l1 & 0x2000) if ((e1 & 0x1fff) == 0x1fff) break;
if (l1 & 0x4000) if ((e1 & 0x3fff) == 0x3fff) break;
if (l1 & 0x8000) if ((e1 & 0x7fff) == 0x7fff) break;
}
u32 l00 = a[ 0] < b[ 0];
u32 l01 = a[ 1] < b[ 1];
u32 l02 = a[ 2] < b[ 2];
u32 l03 = a[ 3] < b[ 3];
u32 l04 = a[ 4] < b[ 4];
u32 l05 = a[ 5] < b[ 5];
u32 l06 = a[ 6] < b[ 6];
u32 l07 = a[ 7] < b[ 7];
u32 l08 = a[ 8] < b[ 8];
u32 l09 = a[ 9] < b[ 9];
u32 l10 = a[10] < b[10];
u32 l11 = a[11] < b[11];
u32 l12 = a[12] < b[12];
u32 l13 = a[13] < b[13];
u32 l14 = a[14] < b[14];
u32 l15 = a[15] < b[15];
u32 e00 = a[ 0] == b[ 0];
u32 e01 = a[ 1] == b[ 1];
u32 e02 = a[ 2] == b[ 2];
u32 e03 = a[ 3] == b[ 3];
u32 e04 = a[ 4] == b[ 4];
u32 e05 = a[ 5] == b[ 5];
u32 e06 = a[ 6] == b[ 6];
u32 e07 = a[ 7] == b[ 7];
u32 e08 = a[ 8] == b[ 8];
u32 e09 = a[ 9] == b[ 9];
u32 e10 = a[10] == b[10];
u32 e11 = a[11] == b[11];
u32 e12 = a[12] == b[12];
u32 e13 = a[13] == b[13];
u32 e14 = a[14] == b[14];
if (l00) break;
if (l01 && e00) break;
if (l02 && e00 && e01) break;
if (l03 && e00 && e01 && e02) break;
if (l04 && e00 && e01 && e02 && e03) break;
if (l05 && e00 && e01 && e02 && e03 && e04) break;
if (l06 && e00 && e01 && e02 && e03 && e04 && e05) break;
if (l07 && e00 && e01 && e02 && e03 && e04 && e05 && e06) break;
if (l08 && e00 && e01 && e02 && e03 && e04 && e05 && e06 && e07) break;
if (l09 && e00 && e01 && e02 && e03 && e04 && e05 && e06 && e07 && e08) break;
if (l10 && e00 && e01 && e02 && e03 && e04 && e05 && e06 && e07 && e08 && e09) break;
if (l11 && e00 && e01 && e02 && e03 && e04 && e05 && e06 && e07 && e08 && e09 && e10) break;
if (l12 && e00 && e01 && e02 && e03 && e04 && e05 && e06 && e07 && e08 && e09 && e10 && e11) break;
if (l13 && e00 && e01 && e02 && e03 && e04 && e05 && e06 && e07 && e08 && e09 && e10 && e11 && e12) break;
if (l14 && e00 && e01 && e02 && e03 && e04 && e05 && e06 && e07 && e08 && e09 && e10 && e11 && e12 && e13) break;
if (l15 && e00 && e01 && e02 && e03 && e04 && e05 && e06 && e07 && e08 && e09 && e10 && e11 && e12 && e13 && e14) break;
// r = x (copy it to have the original values for the subtraction)
@ -444,80 +440,76 @@ DECLSPEC void mod_512 (u32 n[16])
r[14] = x[14];
r[15] = x[15];
// x >>= 1
x[15] = x[15] >> 1 | (x[14] & 1) << 31;
x[14] = x[14] >> 1 | (x[13] & 1) << 31;
x[13] = x[13] >> 1 | (x[12] & 1) << 31;
x[12] = x[12] >> 1 | (x[11] & 1) << 31;
x[11] = x[11] >> 1 | (x[10] & 1) << 31;
x[10] = x[10] >> 1 | (x[ 9] & 1) << 31;
x[ 9] = x[ 9] >> 1 | (x[ 8] & 1) << 31;
x[ 8] = x[ 8] >> 1 | (x[ 7] & 1) << 31;
x[ 7] = x[ 7] >> 1 | (x[ 6] & 1) << 31;
x[ 6] = x[ 6] >> 1 | (x[ 5] & 1) << 31;
x[ 5] = x[ 5] >> 1 | (x[ 4] & 1) << 31;
x[ 4] = x[ 4] >> 1 | (x[ 3] & 1) << 31;
x[ 3] = x[ 3] >> 1 | (x[ 2] & 1) << 31;
x[ 2] = x[ 2] >> 1 | (x[ 1] & 1) << 31;
x[ 1] = x[ 1] >> 1 | (x[ 0] & 1) << 31;
// x >>= 1
x[15] = x[15] >> 1 | x[14] << 31;
x[14] = x[14] >> 1 | x[13] << 31;
x[13] = x[13] >> 1 | x[12] << 31;
x[12] = x[12] >> 1 | x[11] << 31;
x[11] = x[11] >> 1 | x[10] << 31;
x[10] = x[10] >> 1 | x[ 9] << 31;
x[ 9] = x[ 9] >> 1 | x[ 8] << 31;
x[ 8] = x[ 8] >> 1 | x[ 7] << 31;
x[ 7] = x[ 7] >> 1 | x[ 6] << 31;
x[ 6] = x[ 6] >> 1 | x[ 5] << 31;
x[ 5] = x[ 5] >> 1 | x[ 4] << 31;
x[ 4] = x[ 4] >> 1 | x[ 3] << 31;
x[ 3] = x[ 3] >> 1 | x[ 2] << 31;
x[ 2] = x[ 2] >> 1 | x[ 1] << 31;
x[ 1] = x[ 1] >> 1 | x[ 0] << 31;
x[ 0] = x[ 0] >> 1;
// if (a >= r) a -= r;
const u32 l2 = (a[ 0] < r[ 0]) << 0
| (a[ 1] < r[ 1]) << 1
| (a[ 2] < r[ 2]) << 2
| (a[ 3] < r[ 3]) << 3
| (a[ 4] < r[ 4]) << 4
| (a[ 5] < r[ 5]) << 5
| (a[ 6] < r[ 6]) << 6
| (a[ 7] < r[ 7]) << 7
| (a[ 8] < r[ 8]) << 8
| (a[ 9] < r[ 9]) << 9
| (a[10] < r[10]) << 10
| (a[11] < r[11]) << 11
| (a[12] < r[12]) << 12
| (a[13] < r[13]) << 13
| (a[14] < r[14]) << 14
| (a[15] < r[15]) << 15;
const u32 e2 = (a[ 0] == r[ 0]) << 0
| (a[ 1] == r[ 1]) << 1
| (a[ 2] == r[ 2]) << 2
| (a[ 3] == r[ 3]) << 3
| (a[ 4] == r[ 4]) << 4
| (a[ 5] == r[ 5]) << 5
| (a[ 6] == r[ 6]) << 6
| (a[ 7] == r[ 7]) << 7
| (a[ 8] == r[ 8]) << 8
| (a[ 9] == r[ 9]) << 9
| (a[10] == r[10]) << 10
| (a[11] == r[11]) << 11
| (a[12] == r[12]) << 12
| (a[13] == r[13]) << 13
| (a[14] == r[14]) << 14
| (a[15] == r[15]) << 15;
if (l2)
{
if (l2 & 0x0001) continue;
if (l2 & 0x0002) if ((e2 & 0x0001) == 0x0001) continue;
if (l2 & 0x0004) if ((e2 & 0x0003) == 0x0003) continue;
if (l2 & 0x0008) if ((e2 & 0x0007) == 0x0007) continue;
if (l2 & 0x0010) if ((e2 & 0x000f) == 0x000f) continue;
if (l2 & 0x0020) if ((e2 & 0x001f) == 0x001f) continue;
if (l2 & 0x0040) if ((e2 & 0x003f) == 0x003f) continue;
if (l2 & 0x0080) if ((e2 & 0x007f) == 0x007f) continue;
if (l2 & 0x0100) if ((e2 & 0x00ff) == 0x00ff) continue;
if (l2 & 0x0200) if ((e2 & 0x01ff) == 0x01ff) continue;
if (l2 & 0x0400) if ((e2 & 0x03ff) == 0x03ff) continue;
if (l2 & 0x0800) if ((e2 & 0x07ff) == 0x07ff) continue;
if (l2 & 0x1000) if ((e2 & 0x0fff) == 0x0fff) continue;
if (l2 & 0x2000) if ((e2 & 0x1fff) == 0x1fff) continue;
if (l2 & 0x4000) if ((e2 & 0x3fff) == 0x3fff) continue;
if (l2 & 0x8000) if ((e2 & 0x7fff) == 0x7fff) continue;
}
l00 = a[ 0] < r[ 0];
l01 = a[ 1] < r[ 1];
l02 = a[ 2] < r[ 2];
l03 = a[ 3] < r[ 3];
l04 = a[ 4] < r[ 4];
l05 = a[ 5] < r[ 5];
l06 = a[ 6] < r[ 6];
l07 = a[ 7] < r[ 7];
l08 = a[ 8] < r[ 8];
l09 = a[ 9] < r[ 9];
l10 = a[10] < r[10];
l11 = a[11] < r[11];
l12 = a[12] < r[12];
l13 = a[13] < r[13];
l14 = a[14] < r[14];
l15 = a[15] < r[15];
e00 = a[ 0] == r[ 0];
e01 = a[ 1] == r[ 1];
e02 = a[ 2] == r[ 2];
e03 = a[ 3] == r[ 3];
e04 = a[ 4] == r[ 4];
e05 = a[ 5] == r[ 5];
e06 = a[ 6] == r[ 6];
e07 = a[ 7] == r[ 7];
e08 = a[ 8] == r[ 8];
e09 = a[ 9] == r[ 9];
e10 = a[10] == r[10];
e11 = a[11] == r[11];
e12 = a[12] == r[12];
e13 = a[13] == r[13];
e14 = a[14] == r[14];
if (l00) continue;
if (l01 && e00) continue;
if (l02 && e00 && e01) continue;
if (l03 && e00 && e01 && e02) continue;
if (l04 && e00 && e01 && e02 && e03) continue;
if (l05 && e00 && e01 && e02 && e03 && e04) continue;
if (l06 && e00 && e01 && e02 && e03 && e04 && e05) continue;
if (l07 && e00 && e01 && e02 && e03 && e04 && e05 && e06) continue;
if (l08 && e00 && e01 && e02 && e03 && e04 && e05 && e06 && e07) continue;
if (l09 && e00 && e01 && e02 && e03 && e04 && e05 && e06 && e07 && e08) continue;
if (l10 && e00 && e01 && e02 && e03 && e04 && e05 && e06 && e07 && e08 && e09) continue;
if (l11 && e00 && e01 && e02 && e03 && e04 && e05 && e06 && e07 && e08 && e09 && e10) continue;
if (l12 && e00 && e01 && e02 && e03 && e04 && e05 && e06 && e07 && e08 && e09 && e10 && e11) continue;
if (l13 && e00 && e01 && e02 && e03 && e04 && e05 && e06 && e07 && e08 && e09 && e10 && e11 && e12) continue;
if (l14 && e00 && e01 && e02 && e03 && e04 && e05 && e06 && e07 && e08 && e09 && e10 && e11 && e12 && e13) continue;
if (l15 && e00 && e01 && e02 && e03 && e04 && e05 && e06 && e07 && e08 && e09 && e10 && e11 && e12 && e13 && e14) continue;
// subtract (a -= r):

Loading…
Cancel
Save