Inline hc_byte_perm in hc_bytealign

pull/1971/head
jsteube 5 years ago
parent 98f130e715
commit 9574862a19

@ -1068,76 +1068,22 @@ DECLSPEC u32 hc_bfe_S (const u32 a, const u32 b, const u32 c)
DECLSPEC u32x hc_bytealign (const u32x a, const u32x b, const int c)
{
u32x r;
const int c_mod_4 = c & 3;
const int c_minus_4 = 4 - c_mod_4;
#if CUDA_ARCH >= 350
const int c38 = c_minus_4 * 8;
#if VECT_SIZE == 1
asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(c38));
#endif
#if VECT_SIZE >= 2
asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.s0) : "r"(a.s0), "r"(b.s0), "r"(c38));
asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.s1) : "r"(a.s1), "r"(b.s1), "r"(c38));
#endif
#if VECT_SIZE >= 4
asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.s2) : "r"(a.s2), "r"(b.s2), "r"(c38));
asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.s3) : "r"(a.s3), "r"(b.s3), "r"(c38));
#endif
#if VECT_SIZE >= 8
asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.s4) : "r"(a.s4), "r"(b.s4), "r"(c38));
asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.s5) : "r"(a.s5), "r"(b.s5), "r"(c38));
asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.s6) : "r"(a.s6), "r"(b.s6), "r"(c38));
asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.s7) : "r"(a.s7), "r"(b.s7), "r"(c38));
#endif
#if VECT_SIZE >= 16
asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.s8) : "r"(a.s8), "r"(b.s8), "r"(c38));
asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.s9) : "r"(a.s9), "r"(b.s9), "r"(c38));
asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.sa) : "r"(a.sa), "r"(b.sa), "r"(c38));
asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.sb) : "r"(a.sb), "r"(b.sb), "r"(c38));
asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.sc) : "r"(a.sc), "r"(b.sc), "r"(c38));
asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.sd) : "r"(a.sd), "r"(b.sd), "r"(c38));
asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.se) : "r"(a.se), "r"(b.se), "r"(c38));
asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.sf) : "r"(a.sf), "r"(b.sf), "r"(c38));
#endif
#else
r = hc_byte_perm (a, b, (0x76543210 >> (c_minus_4 * 4)) & 0xffff);
#endif
const u32x r = hc_byte_perm (a, b, (0x76543210 >> (c_minus_4 * 4)) & 0xffff);
return r;
}
DECLSPEC u32 hc_bytealign_S (const u32 a, const u32 b, const int c)
{
u32 r;
const int c_mod_4 = c & 3;
const int c_minus_4 = 4 - c_mod_4;
#if CUDA_ARCH >= 350
const int c38 = c_minus_4 * 8;
asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(c38));
#else
r = hc_byte_perm_S (a, b, (0x76543210 >> (c_minus_4 * 4)) & 0xffff);
#endif
const u32 r = hc_byte_perm_S (a, b, (0x76543210 >> (c_minus_4 * 4)) & 0xffff);
return r;
}

Loading…
Cancel
Save