|
|
|
@ -108,20 +108,40 @@ DECLSPEC u32 sub (u32 r[8], const u32 a[8], const u32 b[8])
|
|
|
|
|
{
|
|
|
|
|
u32 c = 0; // carry/borrow
|
|
|
|
|
|
|
|
|
|
#ifdef IS_NV
|
|
|
|
|
asm("sub.cc.u32 %0, %9, %17;"
|
|
|
|
|
"subc.cc.u32 %1, %10, %18;"
|
|
|
|
|
"subc.cc.u32 %2, %11, %19;"
|
|
|
|
|
"subc.cc.u32 %3, %12, %20;"
|
|
|
|
|
"subc.cc.u32 %4, %13, %21;"
|
|
|
|
|
"subc.cc.u32 %5, %14, %22;"
|
|
|
|
|
"subc.cc.u32 %6, %15, %23;"
|
|
|
|
|
"subc.cc.u32 %7, %16, %24;"
|
|
|
|
|
"subc.u32 %8, 0, 0;"
|
|
|
|
|
: "=r"(r[0]), "=r"(r[1]), "=r"(r[2]), "=r"(r[3]), "=r"(r[4]), "=r"(r[5]), "=r"(r[6]), "=r"(r[7]),
|
|
|
|
|
"=r"(c)
|
|
|
|
|
: "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(a[4]), "r"(a[5]), "r"(a[6]), "r"(a[7]),
|
|
|
|
|
"r"(b[0]), "r"(b[1]), "r"(b[2]), "r"(b[3]), "r"(b[4]), "r"(b[5]), "r"(b[6]), "r"(b[7]));
|
|
|
|
|
#if defined IS_NV && HAS_SUB == 1 && HAS_SUBC == 1
|
|
|
|
|
asm volatile
|
|
|
|
|
(
|
|
|
|
|
"sub.cc.u32 %0, %9, %17;"
|
|
|
|
|
"subc.cc.u32 %1, %10, %18;"
|
|
|
|
|
"subc.cc.u32 %2, %11, %19;"
|
|
|
|
|
"subc.cc.u32 %3, %12, %20;"
|
|
|
|
|
"subc.cc.u32 %4, %13, %21;"
|
|
|
|
|
"subc.cc.u32 %5, %14, %22;"
|
|
|
|
|
"subc.cc.u32 %6, %15, %23;"
|
|
|
|
|
"subc.cc.u32 %7, %16, %24;"
|
|
|
|
|
"subc.u32 %8, 0, 0;"
|
|
|
|
|
: "=r"(r[0]), "=r"(r[1]), "=r"(r[2]), "=r"(r[3]), "=r"(r[4]), "=r"(r[5]), "=r"(r[6]), "=r"(r[7]),
|
|
|
|
|
"=r"(c)
|
|
|
|
|
: "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(a[4]), "r"(a[5]), "r"(a[6]), "r"(a[7]),
|
|
|
|
|
"r"(b[0]), "r"(b[1]), "r"(b[2]), "r"(b[3]), "r"(b[4]), "r"(b[5]), "r"(b[6]), "r"(b[7])
|
|
|
|
|
);
|
|
|
|
|
#elif defined IS_AMD && HAS_VSUB == 1 && HAS_VSUBB == 1
|
|
|
|
|
__asm__ __volatile__
|
|
|
|
|
(
|
|
|
|
|
"V_SUB_U32 %0, %9, %17;"
|
|
|
|
|
"V_SUBB_U32 %1, %10, %18;"
|
|
|
|
|
"V_SUBB_U32 %2, %11, %19;"
|
|
|
|
|
"V_SUBB_U32 %3, %12, %20;"
|
|
|
|
|
"V_SUBB_U32 %4, %13, %21;"
|
|
|
|
|
"V_SUBB_U32 %5, %14, %22;"
|
|
|
|
|
"V_SUBB_U32 %6, %15, %23;"
|
|
|
|
|
"V_SUBB_U32 %7, %16, %24;"
|
|
|
|
|
"V_SUBB_U32 %8, 0, 0;"
|
|
|
|
|
: "=v"(r[0]), "=v"(r[1]), "=v"(r[2]), "=v"(r[3]), "=v"(r[4]), "=v"(r[5]), "=v"(r[6]), "=v"(r[7]),
|
|
|
|
|
"=v"(c)
|
|
|
|
|
: "v"(a[0]), "v"(a[1]), "v"(a[2]), "v"(a[3]), "v"(a[4]), "v"(a[5]), "v"(a[6]), "v"(a[7]),
|
|
|
|
|
"v"(b[0]), "v"(b[1]), "v"(b[2]), "v"(b[3]), "v"(b[4]), "v"(b[5]), "v"(b[6]), "v"(b[7])
|
|
|
|
|
);
|
|
|
|
|
#else
|
|
|
|
|
for (u32 i = 0; i < 8; i++)
|
|
|
|
|
{
|
|
|
|
@ -140,20 +160,40 @@ DECLSPEC u32 add (u32 r[8], const u32 a[8], const u32 b[8])
|
|
|
|
|
{
|
|
|
|
|
u32 c = 0; // carry/borrow
|
|
|
|
|
|
|
|
|
|
#ifdef IS_NV
|
|
|
|
|
asm("add.cc.u32 %0, %9, %17;"
|
|
|
|
|
"addc.cc.u32 %1, %10, %18;"
|
|
|
|
|
"addc.cc.u32 %2, %11, %19;"
|
|
|
|
|
"addc.cc.u32 %3, %12, %20;"
|
|
|
|
|
"addc.cc.u32 %4, %13, %21;"
|
|
|
|
|
"addc.cc.u32 %5, %14, %22;"
|
|
|
|
|
"addc.cc.u32 %6, %15, %23;"
|
|
|
|
|
"addc.cc.u32 %7, %16, %24;"
|
|
|
|
|
"addc.u32 %8, 0, 0;"
|
|
|
|
|
: "=r"(r[0]), "=r"(r[1]), "=r"(r[2]), "=r"(r[3]), "=r"(r[4]), "=r"(r[5]), "=r"(r[6]), "=r"(r[7]),
|
|
|
|
|
"=r"(c)
|
|
|
|
|
: "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(a[4]), "r"(a[5]), "r"(a[6]), "r"(a[7]),
|
|
|
|
|
"r"(b[0]), "r"(b[1]), "r"(b[2]), "r"(b[3]), "r"(b[4]), "r"(b[5]), "r"(b[6]), "r"(b[7]));
|
|
|
|
|
#if defined IS_NV && HAS_ADD == 1 && HAS_ADDC == 1
|
|
|
|
|
asm volatile
|
|
|
|
|
(
|
|
|
|
|
"add.cc.u32 %0, %9, %17;"
|
|
|
|
|
"addc.cc.u32 %1, %10, %18;"
|
|
|
|
|
"addc.cc.u32 %2, %11, %19;"
|
|
|
|
|
"addc.cc.u32 %3, %12, %20;"
|
|
|
|
|
"addc.cc.u32 %4, %13, %21;"
|
|
|
|
|
"addc.cc.u32 %5, %14, %22;"
|
|
|
|
|
"addc.cc.u32 %6, %15, %23;"
|
|
|
|
|
"addc.cc.u32 %7, %16, %24;"
|
|
|
|
|
"addc.u32 %8, 0, 0;"
|
|
|
|
|
: "=r"(r[0]), "=r"(r[1]), "=r"(r[2]), "=r"(r[3]), "=r"(r[4]), "=r"(r[5]), "=r"(r[6]), "=r"(r[7]),
|
|
|
|
|
"=r"(c)
|
|
|
|
|
: "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(a[4]), "r"(a[5]), "r"(a[6]), "r"(a[7]),
|
|
|
|
|
"r"(b[0]), "r"(b[1]), "r"(b[2]), "r"(b[3]), "r"(b[4]), "r"(b[5]), "r"(b[6]), "r"(b[7])
|
|
|
|
|
);
|
|
|
|
|
#elif defined IS_AMD && HAS_VADD == 1 && HAS_VADDC == 1
|
|
|
|
|
__asm__ __volatile__
|
|
|
|
|
(
|
|
|
|
|
"V_ADD_U32 %0, %9, %17;"
|
|
|
|
|
"V_ADDC_U32 %1, %10, %18;"
|
|
|
|
|
"V_ADDC_U32 %2, %11, %19;"
|
|
|
|
|
"V_ADDC_U32 %3, %12, %20;"
|
|
|
|
|
"V_ADDC_U32 %4, %13, %21;"
|
|
|
|
|
"V_ADDC_U32 %5, %14, %22;"
|
|
|
|
|
"V_ADDC_U32 %6, %15, %23;"
|
|
|
|
|
"V_ADDC_U32 %7, %16, %24;"
|
|
|
|
|
"V_ADDC_U32 %8, 0, 0;"
|
|
|
|
|
: "=v"(r[0]), "=v"(r[1]), "=v"(r[2]), "=v"(r[3]), "=v"(r[4]), "=v"(r[5]), "=v"(r[6]), "=v"(r[7]),
|
|
|
|
|
"=v"(c)
|
|
|
|
|
: "v"(a[0]), "v"(a[1]), "v"(a[2]), "v"(a[3]), "v"(a[4]), "v"(a[5]), "v"(a[6]), "v"(a[7]),
|
|
|
|
|
"v"(b[0]), "v"(b[1]), "v"(b[2]), "v"(b[3]), "v"(b[4]), "v"(b[5]), "v"(b[6]), "v"(b[7])
|
|
|
|
|
);
|
|
|
|
|
#else
|
|
|
|
|
for (u32 i = 0; i < 8; i++)
|
|
|
|
|
{
|
|
|
|
|