From cb2423606707c25427d4923969da2c56b01dbf2b Mon Sep 17 00:00:00 2001 From: Jens Steube Date: Thu, 5 Dec 2019 14:49:51 +0100 Subject: [PATCH] Inline assembly optimization for 256 bit ADD and SUB in inc_ecc_secp256k1.cl --- OpenCL/inc_ecc_secp256k1.cl | 36 ++++++++++++++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/OpenCL/inc_ecc_secp256k1.cl b/OpenCL/inc_ecc_secp256k1.cl index 55a4469e5..33496bc60 100644 --- a/OpenCL/inc_ecc_secp256k1.cl +++ b/OpenCL/inc_ecc_secp256k1.cl @@ -108,6 +108,21 @@ DECLSPEC u32 sub (u32 r[8], const u32 a[8], const u32 b[8]) { u32 c = 0; // carry/borrow + #ifdef IS_NV + asm("sub.cc.u32 %0, %9, %17;" + "subc.cc.u32 %1, %10, %18;" + "subc.cc.u32 %2, %11, %19;" + "subc.cc.u32 %3, %12, %20;" + "subc.cc.u32 %4, %13, %21;" + "subc.cc.u32 %5, %14, %22;" + "subc.cc.u32 %6, %15, %23;" + "subc.cc.u32 %7, %16, %24;" + "subc.u32 %8, 0, 0;" + : "=r"(r[0]), "=r"(r[1]), "=r"(r[2]), "=r"(r[3]), "=r"(r[4]), "=r"(r[5]), "=r"(r[6]), "=r"(r[7]), + "=r"(c) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(a[4]), "r"(a[5]), "r"(a[6]), "r"(a[7]), + "r"(b[0]), "r"(b[1]), "r"(b[2]), "r"(b[3]), "r"(b[4]), "r"(b[5]), "r"(b[6]), "r"(b[7])); + #else for (u32 i = 0; i < 8; i++) { const u32 diff = a[i] - b[i] - c; @@ -115,15 +130,31 @@ DECLSPEC u32 sub (u32 r[8], const u32 a[8], const u32 b[8]) if (diff != a[i]) c = (diff > a[i]); r[i] = diff; - } + } + #endif - return c; + return c; } DECLSPEC u32 add (u32 r[8], const u32 a[8], const u32 b[8]) { u32 c = 0; // carry/borrow + #ifdef IS_NV + asm("add.cc.u32 %0, %9, %17;" + "addc.cc.u32 %1, %10, %18;" + "addc.cc.u32 %2, %11, %19;" + "addc.cc.u32 %3, %12, %20;" + "addc.cc.u32 %4, %13, %21;" + "addc.cc.u32 %5, %14, %22;" + "addc.cc.u32 %6, %15, %23;" + "addc.cc.u32 %7, %16, %24;" + "addc.u32 %8, 0, 0;" + : "=r"(r[0]), "=r"(r[1]), "=r"(r[2]), "=r"(r[3]), "=r"(r[4]), "=r"(r[5]), "=r"(r[6]), "=r"(r[7]), + "=r"(c) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(a[4]), "r"(a[5]), "r"(a[6]), "r"(a[7]), + "r"(b[0]), "r"(b[1]), "r"(b[2]), "r"(b[3]), "r"(b[4]), "r"(b[5]), "r"(b[6]), "r"(b[7])); + #else for (u32 i = 0; i < 8; i++) { const u32 t = a[i] + b[i] + c; @@ -132,6 +163,7 @@ DECLSPEC u32 add (u32 r[8], const u32 a[8], const u32 b[8]) r[i] = t; } + #endif return c; }