@ -351,71 +351,6 @@ static inline u64 rotl64_S (const u64 a, const u32 n)
return rotr64_S ( a , 64 - n ) ;
}
# if CUDA_ARCH >= 500
/*
 * lut3_2d_S - scalar three-input bitwise function via one PTX lop3.b32.
 *
 * The immediate 0x2d is the lop3 look-up table; per the PTX lop3 convention
 * it encodes a ^ (b | ~c), evaluated independently on every bit.
 *
 * a, b, c: 32-bit operands.
 * Returns the 32-bit result of the table applied to (a, b, c).
 */
static inline u32 lut3_2d_S (const u32 a, const u32 b, const u32 c)
{
  u32 out;

  // Constraint strings must be exact: an output constraint has to start
  // with '=', so no blank padding is allowed inside the quotes.
  asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (out) : "r" (a), "r" (b), "r" (c));

  return out;
}
/*
 * lut3_39_S - scalar three-input bitwise function via one PTX lop3.b32.
 *
 * The immediate 0x39 is the lop3 look-up table; per the PTX lop3 convention
 * it encodes b ^ (a | ~c), evaluated independently on every bit.
 *
 * a, b, c: 32-bit operands.
 * Returns the 32-bit result of the table applied to (a, b, c).
 */
static inline u32 lut3_39_S (const u32 a, const u32 b, const u32 c)
{
  u32 out;

  // Constraint strings must be exact: an output constraint has to start
  // with '=', so no blank padding is allowed inside the quotes.
  asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (out) : "r" (a), "r" (b), "r" (c));

  return out;
}
/*
 * lut3_59_S - scalar three-input bitwise function via one PTX lop3.b32.
 *
 * The immediate 0x59 is the lop3 look-up table; per the PTX lop3 convention
 * it encodes c ^ (a | ~b), evaluated independently on every bit.
 *
 * a, b, c: 32-bit operands.
 * Returns the 32-bit result of the table applied to (a, b, c).
 */
static inline u32 lut3_59_S (const u32 a, const u32 b, const u32 c)
{
  u32 out;

  // Constraint strings must be exact: an output constraint has to start
  // with '=', so no blank padding is allowed inside the quotes.
  asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (out) : "r" (a), "r" (b), "r" (c));

  return out;
}
/*
 * lut3_96_S - scalar three-input bitwise function via one PTX lop3.b32.
 *
 * The immediate 0x96 is the lop3 look-up table; per the PTX lop3 convention
 * it encodes a ^ b ^ c (three-way XOR), evaluated independently on every bit.
 *
 * a, b, c: 32-bit operands.
 * Returns the 32-bit result of the table applied to (a, b, c).
 */
static inline u32 lut3_96_S (const u32 a, const u32 b, const u32 c)
{
  u32 out;

  // Constraint strings must be exact: an output constraint has to start
  // with '=', so no blank padding is allowed inside the quotes.
  asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (out) : "r" (a), "r" (b), "r" (c));

  return out;
}
/*
 * lut3_e4_S - scalar three-input bitwise function via one PTX lop3.b32.
 *
 * The immediate 0xe4 is the lop3 look-up table; per the PTX lop3 convention
 * it encodes (a & c) | (b & ~c), i.e. a bitwise select of a or b keyed on c.
 *
 * a, b, c: 32-bit operands.
 * Returns the 32-bit result of the table applied to (a, b, c).
 */
static inline u32 lut3_e4_S (const u32 a, const u32 b, const u32 c)
{
  u32 out;

  // Constraint strings must be exact: an output constraint has to start
  // with '=', so no blank padding is allowed inside the quotes.
  asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (out) : "r" (a), "r" (b), "r" (c));

  return out;
}
/*
 * lut3_e8_S - scalar three-input bitwise function via one PTX lop3.b32.
 *
 * The immediate 0xe8 is the lop3 look-up table; per the PTX lop3 convention
 * it encodes (a & b) | (a & c) | (b & c), the bitwise majority of the inputs.
 *
 * a, b, c: 32-bit operands.
 * Returns the 32-bit result of the table applied to (a, b, c).
 */
static inline u32 lut3_e8_S (const u32 a, const u32 b, const u32 c)
{
  u32 out;

  // Constraint strings must be exact: an output constraint has to start
  // with '=', so no blank padding is allowed inside the quotes.
  asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (out) : "r" (a), "r" (b), "r" (c));

  return out;
}
/*
 * lut3_ca_S - scalar three-input bitwise function via one PTX lop3.b32.
 *
 * The immediate 0xca is the lop3 look-up table; per the PTX lop3 convention
 * it encodes (a & b) | (~a & c), i.e. a bitwise select of b or c keyed on a.
 *
 * a, b, c: 32-bit operands.
 * Returns the 32-bit result of the table applied to (a, b, c).
 */
static inline u32 lut3_ca_S (const u32 a, const u32 b, const u32 c)
{
  u32 out;

  // Constraint strings must be exact: an output constraint has to start
  // with '=', so no blank padding is allowed inside the quotes.
  asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (out) : "r" (a), "r" (b), "r" (c));

  return out;
}
# endif
static inline u32 __byte_perm_S ( const u32 a , const u32 b , const u32 c )
{
u32 r ;
@ -850,281 +785,6 @@ static inline u32 amd_bytealign (const u32 a, const u32 b, const u32 c)
}
# endif
# if CUDA_ARCH >= 500
/*
 * lut3_2d - three-input bitwise function via PTX lop3.b32, applied per lane.
 *
 * The immediate 0x2d is the lop3 look-up table; per the PTX lop3 convention
 * it encodes a ^ (b | ~c), evaluated independently on every bit.
 *
 * VECT_SIZE selects the lanes that are processed: exactly 1 operates on the
 * whole value (u32x is presumably a plain 32-bit scalar in that build --
 * confirm against the typedef), while >= 2 / 4 / 8 / 16 cumulatively cover
 * components s0..s1, s2..s3, s4..s7 and s8..sf. If VECT_SIZE is below 1 no
 * instruction is emitted and r is returned unset.
 *
 * a, b, c: operands, combined lane by lane.
 * Returns the per-lane result of the table applied to (a, b, c).
 */
static inline u32x lut3_2d (const u32x a, const u32x b, const u32x c)
{
  u32x r;

  // One lop3.b32 on a single 32-bit value. The constraint strings must be
  // exact ("=r" / "r"): an output constraint has to start with '='.
  #define LUT3_2D_OP(dst, x, y, z) \
    asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (dst) : "r" (x), "r" (y), "r" (z))

  // Same operation on one named vector component of r, a, b and c.
  #define LUT3_2D_LANE(lane) LUT3_2D_OP (r.lane, a.lane, b.lane, c.lane)

  #if VECT_SIZE == 1
  LUT3_2D_OP (r, a, b, c);
  #endif

  #if VECT_SIZE >= 2
  LUT3_2D_LANE (s0);
  LUT3_2D_LANE (s1);
  #endif

  #if VECT_SIZE >= 4
  LUT3_2D_LANE (s2);
  LUT3_2D_LANE (s3);
  #endif

  #if VECT_SIZE >= 8
  LUT3_2D_LANE (s4);
  LUT3_2D_LANE (s5);
  LUT3_2D_LANE (s6);
  LUT3_2D_LANE (s7);
  #endif

  #if VECT_SIZE >= 16
  LUT3_2D_LANE (s8);
  LUT3_2D_LANE (s9);
  LUT3_2D_LANE (sa);
  LUT3_2D_LANE (sb);
  LUT3_2D_LANE (sc);
  LUT3_2D_LANE (sd);
  LUT3_2D_LANE (se);
  LUT3_2D_LANE (sf);
  #endif

  #undef LUT3_2D_LANE
  #undef LUT3_2D_OP

  return r;
}
/*
 * lut3_39 - three-input bitwise function via PTX lop3.b32, applied per lane.
 *
 * The immediate 0x39 is the lop3 look-up table; per the PTX lop3 convention
 * it encodes b ^ (a | ~c), evaluated independently on every bit.
 *
 * VECT_SIZE selects the lanes that are processed: exactly 1 operates on the
 * whole value (u32x is presumably a plain 32-bit scalar in that build --
 * confirm against the typedef), while >= 2 / 4 / 8 / 16 cumulatively cover
 * components s0..s1, s2..s3, s4..s7 and s8..sf. If VECT_SIZE is below 1 no
 * instruction is emitted and r is returned unset.
 *
 * a, b, c: operands, combined lane by lane.
 * Returns the per-lane result of the table applied to (a, b, c).
 */
static inline u32x lut3_39 (const u32x a, const u32x b, const u32x c)
{
  u32x r;

  // One lop3.b32 on a single 32-bit value. The constraint strings must be
  // exact ("=r" / "r"): an output constraint has to start with '='.
  #define LUT3_39_OP(dst, x, y, z) \
    asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (dst) : "r" (x), "r" (y), "r" (z))

  // Same operation on one named vector component of r, a, b and c.
  #define LUT3_39_LANE(lane) LUT3_39_OP (r.lane, a.lane, b.lane, c.lane)

  #if VECT_SIZE == 1
  LUT3_39_OP (r, a, b, c);
  #endif

  #if VECT_SIZE >= 2
  LUT3_39_LANE (s0);
  LUT3_39_LANE (s1);
  #endif

  #if VECT_SIZE >= 4
  LUT3_39_LANE (s2);
  LUT3_39_LANE (s3);
  #endif

  #if VECT_SIZE >= 8
  LUT3_39_LANE (s4);
  LUT3_39_LANE (s5);
  LUT3_39_LANE (s6);
  LUT3_39_LANE (s7);
  #endif

  #if VECT_SIZE >= 16
  LUT3_39_LANE (s8);
  LUT3_39_LANE (s9);
  LUT3_39_LANE (sa);
  LUT3_39_LANE (sb);
  LUT3_39_LANE (sc);
  LUT3_39_LANE (sd);
  LUT3_39_LANE (se);
  LUT3_39_LANE (sf);
  #endif

  #undef LUT3_39_LANE
  #undef LUT3_39_OP

  return r;
}
/*
 * lut3_59 - three-input bitwise function via PTX lop3.b32, applied per lane.
 *
 * The immediate 0x59 is the lop3 look-up table; per the PTX lop3 convention
 * it encodes c ^ (a | ~b), evaluated independently on every bit.
 *
 * VECT_SIZE selects the lanes that are processed: exactly 1 operates on the
 * whole value (u32x is presumably a plain 32-bit scalar in that build --
 * confirm against the typedef), while >= 2 / 4 / 8 / 16 cumulatively cover
 * components s0..s1, s2..s3, s4..s7 and s8..sf. If VECT_SIZE is below 1 no
 * instruction is emitted and r is returned unset.
 *
 * a, b, c: operands, combined lane by lane.
 * Returns the per-lane result of the table applied to (a, b, c).
 */
static inline u32x lut3_59 (const u32x a, const u32x b, const u32x c)
{
  u32x r;

  // One lop3.b32 on a single 32-bit value. The constraint strings must be
  // exact ("=r" / "r"): an output constraint has to start with '='.
  #define LUT3_59_OP(dst, x, y, z) \
    asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (dst) : "r" (x), "r" (y), "r" (z))

  // Same operation on one named vector component of r, a, b and c.
  #define LUT3_59_LANE(lane) LUT3_59_OP (r.lane, a.lane, b.lane, c.lane)

  #if VECT_SIZE == 1
  LUT3_59_OP (r, a, b, c);
  #endif

  #if VECT_SIZE >= 2
  LUT3_59_LANE (s0);
  LUT3_59_LANE (s1);
  #endif

  #if VECT_SIZE >= 4
  LUT3_59_LANE (s2);
  LUT3_59_LANE (s3);
  #endif

  #if VECT_SIZE >= 8
  LUT3_59_LANE (s4);
  LUT3_59_LANE (s5);
  LUT3_59_LANE (s6);
  LUT3_59_LANE (s7);
  #endif

  #if VECT_SIZE >= 16
  LUT3_59_LANE (s8);
  LUT3_59_LANE (s9);
  LUT3_59_LANE (sa);
  LUT3_59_LANE (sb);
  LUT3_59_LANE (sc);
  LUT3_59_LANE (sd);
  LUT3_59_LANE (se);
  LUT3_59_LANE (sf);
  #endif

  #undef LUT3_59_LANE
  #undef LUT3_59_OP

  return r;
}
/*
 * lut3_96 - three-input bitwise function via PTX lop3.b32, applied per lane.
 *
 * The immediate 0x96 is the lop3 look-up table; per the PTX lop3 convention
 * it encodes a ^ b ^ c (three-way XOR), evaluated independently on every bit.
 *
 * VECT_SIZE selects the lanes that are processed: exactly 1 operates on the
 * whole value (u32x is presumably a plain 32-bit scalar in that build --
 * confirm against the typedef), while >= 2 / 4 / 8 / 16 cumulatively cover
 * components s0..s1, s2..s3, s4..s7 and s8..sf. If VECT_SIZE is below 1 no
 * instruction is emitted and r is returned unset.
 *
 * a, b, c: operands, combined lane by lane.
 * Returns the per-lane result of the table applied to (a, b, c).
 */
static inline u32x lut3_96 (const u32x a, const u32x b, const u32x c)
{
  u32x r;

  // One lop3.b32 on a single 32-bit value. The constraint strings must be
  // exact ("=r" / "r"): an output constraint has to start with '='.
  #define LUT3_96_OP(dst, x, y, z) \
    asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (dst) : "r" (x), "r" (y), "r" (z))

  // Same operation on one named vector component of r, a, b and c.
  #define LUT3_96_LANE(lane) LUT3_96_OP (r.lane, a.lane, b.lane, c.lane)

  #if VECT_SIZE == 1
  LUT3_96_OP (r, a, b, c);
  #endif

  #if VECT_SIZE >= 2
  LUT3_96_LANE (s0);
  LUT3_96_LANE (s1);
  #endif

  #if VECT_SIZE >= 4
  LUT3_96_LANE (s2);
  LUT3_96_LANE (s3);
  #endif

  #if VECT_SIZE >= 8
  LUT3_96_LANE (s4);
  LUT3_96_LANE (s5);
  LUT3_96_LANE (s6);
  LUT3_96_LANE (s7);
  #endif

  #if VECT_SIZE >= 16
  LUT3_96_LANE (s8);
  LUT3_96_LANE (s9);
  LUT3_96_LANE (sa);
  LUT3_96_LANE (sb);
  LUT3_96_LANE (sc);
  LUT3_96_LANE (sd);
  LUT3_96_LANE (se);
  LUT3_96_LANE (sf);
  #endif

  #undef LUT3_96_LANE
  #undef LUT3_96_OP

  return r;
}
/*
 * lut3_e4 - three-input bitwise function via PTX lop3.b32, applied per lane.
 *
 * The immediate 0xe4 is the lop3 look-up table; per the PTX lop3 convention
 * it encodes (a & c) | (b & ~c), i.e. a bitwise select of a or b keyed on c.
 *
 * VECT_SIZE selects the lanes that are processed: exactly 1 operates on the
 * whole value (u32x is presumably a plain 32-bit scalar in that build --
 * confirm against the typedef), while >= 2 / 4 / 8 / 16 cumulatively cover
 * components s0..s1, s2..s3, s4..s7 and s8..sf. If VECT_SIZE is below 1 no
 * instruction is emitted and r is returned unset.
 *
 * a, b, c: operands, combined lane by lane.
 * Returns the per-lane result of the table applied to (a, b, c).
 */
static inline u32x lut3_e4 (const u32x a, const u32x b, const u32x c)
{
  u32x r;

  // One lop3.b32 on a single 32-bit value. The constraint strings must be
  // exact ("=r" / "r"): an output constraint has to start with '='.
  #define LUT3_E4_OP(dst, x, y, z) \
    asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (dst) : "r" (x), "r" (y), "r" (z))

  // Same operation on one named vector component of r, a, b and c.
  #define LUT3_E4_LANE(lane) LUT3_E4_OP (r.lane, a.lane, b.lane, c.lane)

  #if VECT_SIZE == 1
  LUT3_E4_OP (r, a, b, c);
  #endif

  #if VECT_SIZE >= 2
  LUT3_E4_LANE (s0);
  LUT3_E4_LANE (s1);
  #endif

  #if VECT_SIZE >= 4
  LUT3_E4_LANE (s2);
  LUT3_E4_LANE (s3);
  #endif

  #if VECT_SIZE >= 8
  LUT3_E4_LANE (s4);
  LUT3_E4_LANE (s5);
  LUT3_E4_LANE (s6);
  LUT3_E4_LANE (s7);
  #endif

  #if VECT_SIZE >= 16
  LUT3_E4_LANE (s8);
  LUT3_E4_LANE (s9);
  LUT3_E4_LANE (sa);
  LUT3_E4_LANE (sb);
  LUT3_E4_LANE (sc);
  LUT3_E4_LANE (sd);
  LUT3_E4_LANE (se);
  LUT3_E4_LANE (sf);
  #endif

  #undef LUT3_E4_LANE
  #undef LUT3_E4_OP

  return r;
}
/*
 * lut3_e8 - three-input bitwise function via PTX lop3.b32, applied per lane.
 *
 * The immediate 0xe8 is the lop3 look-up table; per the PTX lop3 convention
 * it encodes (a & b) | (a & c) | (b & c), the bitwise majority of the inputs.
 *
 * VECT_SIZE selects the lanes that are processed: exactly 1 operates on the
 * whole value (u32x is presumably a plain 32-bit scalar in that build --
 * confirm against the typedef), while >= 2 / 4 / 8 / 16 cumulatively cover
 * components s0..s1, s2..s3, s4..s7 and s8..sf. If VECT_SIZE is below 1 no
 * instruction is emitted and r is returned unset.
 *
 * a, b, c: operands, combined lane by lane.
 * Returns the per-lane result of the table applied to (a, b, c).
 */
static inline u32x lut3_e8 (const u32x a, const u32x b, const u32x c)
{
  u32x r;

  // One lop3.b32 on a single 32-bit value. The constraint strings must be
  // exact ("=r" / "r"): an output constraint has to start with '='.
  #define LUT3_E8_OP(dst, x, y, z) \
    asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (dst) : "r" (x), "r" (y), "r" (z))

  // Same operation on one named vector component of r, a, b and c.
  #define LUT3_E8_LANE(lane) LUT3_E8_OP (r.lane, a.lane, b.lane, c.lane)

  #if VECT_SIZE == 1
  LUT3_E8_OP (r, a, b, c);
  #endif

  #if VECT_SIZE >= 2
  LUT3_E8_LANE (s0);
  LUT3_E8_LANE (s1);
  #endif

  #if VECT_SIZE >= 4
  LUT3_E8_LANE (s2);
  LUT3_E8_LANE (s3);
  #endif

  #if VECT_SIZE >= 8
  LUT3_E8_LANE (s4);
  LUT3_E8_LANE (s5);
  LUT3_E8_LANE (s6);
  LUT3_E8_LANE (s7);
  #endif

  #if VECT_SIZE >= 16
  LUT3_E8_LANE (s8);
  LUT3_E8_LANE (s9);
  LUT3_E8_LANE (sa);
  LUT3_E8_LANE (sb);
  LUT3_E8_LANE (sc);
  LUT3_E8_LANE (sd);
  LUT3_E8_LANE (se);
  LUT3_E8_LANE (sf);
  #endif

  #undef LUT3_E8_LANE
  #undef LUT3_E8_OP

  return r;
}
/*
 * lut3_ca - three-input bitwise function via PTX lop3.b32, applied per lane.
 *
 * The immediate 0xca is the lop3 look-up table; per the PTX lop3 convention
 * it encodes (a & b) | (~a & c), i.e. a bitwise select of b or c keyed on a.
 *
 * VECT_SIZE selects the lanes that are processed: exactly 1 operates on the
 * whole value (u32x is presumably a plain 32-bit scalar in that build --
 * confirm against the typedef), while >= 2 / 4 / 8 / 16 cumulatively cover
 * components s0..s1, s2..s3, s4..s7 and s8..sf. If VECT_SIZE is below 1 no
 * instruction is emitted and r is returned unset.
 *
 * a, b, c: operands, combined lane by lane.
 * Returns the per-lane result of the table applied to (a, b, c).
 */
static inline u32x lut3_ca (const u32x a, const u32x b, const u32x c)
{
  u32x r;

  // One lop3.b32 on a single 32-bit value. The constraint strings must be
  // exact ("=r" / "r"): an output constraint has to start with '='.
  #define LUT3_CA_OP(dst, x, y, z) \
    asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (dst) : "r" (x), "r" (y), "r" (z))

  // Same operation on one named vector component of r, a, b and c.
  #define LUT3_CA_LANE(lane) LUT3_CA_OP (r.lane, a.lane, b.lane, c.lane)

  #if VECT_SIZE == 1
  LUT3_CA_OP (r, a, b, c);
  #endif

  #if VECT_SIZE >= 2
  LUT3_CA_LANE (s0);
  LUT3_CA_LANE (s1);
  #endif

  #if VECT_SIZE >= 4
  LUT3_CA_LANE (s2);
  LUT3_CA_LANE (s3);
  #endif

  #if VECT_SIZE >= 8
  LUT3_CA_LANE (s4);
  LUT3_CA_LANE (s5);
  LUT3_CA_LANE (s6);
  LUT3_CA_LANE (s7);
  #endif

  #if VECT_SIZE >= 16
  LUT3_CA_LANE (s8);
  LUT3_CA_LANE (s9);
  LUT3_CA_LANE (sa);
  LUT3_CA_LANE (sb);
  LUT3_CA_LANE (sc);
  LUT3_CA_LANE (sd);
  LUT3_CA_LANE (se);
  LUT3_CA_LANE (sf);
  #endif

  #undef LUT3_CA_LANE
  #undef LUT3_CA_OP

  return r;
}
# endif
# endif
# ifdef IS_GENERIC