1
0
mirror of https://github.com/hashcat/hashcat.git synced 2025-07-14 18:48:21 +00:00

reorder functions for better overview

This commit is contained in:
jsteube 2019-03-24 15:23:11 +01:00
parent 194fd7e6d1
commit 13097fefc7
2 changed files with 122 additions and 103 deletions

View File

@ -586,7 +586,7 @@ DECLSPEC u32 hc_bytealign_S (const u32 a, const u32 b, const int c)
} }
#if HAS_VPERM #if HAS_VPERM
DECLSPEC u32x hc_byte_perm (const u32x a, const u32x b, const u32x c) DECLSPEC u32x hc_byte_perm (const u32x a, const u32x b, const int c)
{ {
u32x r; u32x r;
@ -595,51 +595,51 @@ DECLSPEC u32x hc_byte_perm (const u32x a, const u32x b, const u32x c)
#endif #endif
#if VECT_SIZE >= 2 #if VECT_SIZE >= 2
__asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s0) : "v"(b.s0), "v"(a.s0), "v"(c.s0)); __asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s0) : "v"(b.s0), "v"(a.s0), "v"(c));
__asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s1) : "v"(b.s1), "v"(a.s1), "v"(c.s1)); __asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s1) : "v"(b.s1), "v"(a.s1), "v"(c));
#endif #endif
#if VECT_SIZE >= 4 #if VECT_SIZE >= 4
__asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s0) : "v"(b.s0), "v"(a.s0), "v"(c.s0)); __asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s0) : "v"(b.s0), "v"(a.s0), "v"(c));
__asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s1) : "v"(b.s1), "v"(a.s1), "v"(c.s1)); __asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s1) : "v"(b.s1), "v"(a.s1), "v"(c));
__asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s2) : "v"(b.s2), "v"(a.s2), "v"(c.s2)); __asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s2) : "v"(b.s2), "v"(a.s2), "v"(c));
__asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s3) : "v"(b.s3), "v"(a.s3), "v"(c.s3)); __asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s3) : "v"(b.s3), "v"(a.s3), "v"(c));
#endif #endif
#if VECT_SIZE >= 8 #if VECT_SIZE >= 8
__asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s0) : "v"(b.s0), "v"(a.s0), "v"(c.s0)); __asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s0) : "v"(b.s0), "v"(a.s0), "v"(c));
__asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s1) : "v"(b.s1), "v"(a.s1), "v"(c.s1)); __asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s1) : "v"(b.s1), "v"(a.s1), "v"(c));
__asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s2) : "v"(b.s2), "v"(a.s2), "v"(c.s2)); __asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s2) : "v"(b.s2), "v"(a.s2), "v"(c));
__asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s3) : "v"(b.s3), "v"(a.s3), "v"(c.s3)); __asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s3) : "v"(b.s3), "v"(a.s3), "v"(c));
__asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s4) : "v"(b.s4), "v"(a.s4), "v"(c.s4)); __asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s4) : "v"(b.s4), "v"(a.s4), "v"(c));
__asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s5) : "v"(b.s5), "v"(a.s5), "v"(c.s5)); __asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s5) : "v"(b.s5), "v"(a.s5), "v"(c));
__asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s6) : "v"(b.s6), "v"(a.s6), "v"(c.s6)); __asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s6) : "v"(b.s6), "v"(a.s6), "v"(c));
__asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s7) : "v"(b.s7), "v"(a.s7), "v"(c.s7)); __asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s7) : "v"(b.s7), "v"(a.s7), "v"(c));
#endif #endif
#if VECT_SIZE >= 16 #if VECT_SIZE >= 16
__asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s0) : "v"(b.s0), "v"(a.s0), "v"(c.s0)); __asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s0) : "v"(b.s0), "v"(a.s0), "v"(c));
__asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s1) : "v"(b.s1), "v"(a.s1), "v"(c.s1)); __asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s1) : "v"(b.s1), "v"(a.s1), "v"(c));
__asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s2) : "v"(b.s2), "v"(a.s2), "v"(c.s2)); __asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s2) : "v"(b.s2), "v"(a.s2), "v"(c));
__asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s3) : "v"(b.s3), "v"(a.s3), "v"(c.s3)); __asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s3) : "v"(b.s3), "v"(a.s3), "v"(c));
__asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s4) : "v"(b.s4), "v"(a.s4), "v"(c.s4)); __asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s4) : "v"(b.s4), "v"(a.s4), "v"(c));
__asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s5) : "v"(b.s5), "v"(a.s5), "v"(c.s5)); __asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s5) : "v"(b.s5), "v"(a.s5), "v"(c));
__asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s6) : "v"(b.s6), "v"(a.s6), "v"(c.s6)); __asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s6) : "v"(b.s6), "v"(a.s6), "v"(c));
__asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s7) : "v"(b.s7), "v"(a.s7), "v"(c.s7)); __asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s7) : "v"(b.s7), "v"(a.s7), "v"(c));
__asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s8) : "v"(b.s8), "v"(a.s8), "v"(c.s8)); __asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s8) : "v"(b.s8), "v"(a.s8), "v"(c));
__asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s9) : "v"(b.s9), "v"(a.s9), "v"(c.s9)); __asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s9) : "v"(b.s9), "v"(a.s9), "v"(c));
__asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.sa) : "v"(b.sa), "v"(a.sa), "v"(c.sa)); __asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.sa) : "v"(b.sa), "v"(a.sa), "v"(c));
__asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.sb) : "v"(b.sb), "v"(a.sb), "v"(c.sb)); __asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.sb) : "v"(b.sb), "v"(a.sb), "v"(c));
__asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.sc) : "v"(b.sc), "v"(a.sc), "v"(c.sc)); __asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.sc) : "v"(b.sc), "v"(a.sc), "v"(c));
__asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.sd) : "v"(b.sd), "v"(a.sd), "v"(c.sd)); __asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.sd) : "v"(b.sd), "v"(a.sd), "v"(c));
__asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.se) : "v"(b.se), "v"(a.se), "v"(c.se)); __asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.se) : "v"(b.se), "v"(a.se), "v"(c));
__asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.sf) : "v"(b.sf), "v"(a.sf), "v"(c.sf)); __asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.sf) : "v"(b.sf), "v"(a.sf), "v"(c));
#endif #endif
return r; return r;
} }
DECLSPEC u32 hc_byte_perm_S (const u32 a, const u32 b, const u32 c) DECLSPEC u32 hc_byte_perm_S (const u32 a, const u32 b, const int c)
{ {
u32 r; u32 r;
@ -970,46 +970,46 @@ DECLSPEC u64x hc_rotl64 (const u64x a, const int n)
return rotate (a, (u64x) n); return rotate (a, (u64x) n);
} }
DECLSPEC u32x hc_byte_perm (const u32x a, const u32x b, const u32x c) DECLSPEC u32x hc_byte_perm (const u32x a, const u32x b, const int c)
{ {
u32x r; u32x r;
#if VECT_SIZE == 1 #if VECT_SIZE == 1
asm volatile ("prmt.b32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(c) ); asm volatile ("prmt.b32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#endif #endif
#if VECT_SIZE >= 2 #if VECT_SIZE >= 2
asm volatile ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s0) : "r"(a.s0), "r"(b.s0), "r"(c.s0)); asm volatile ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s0) : "r"(a.s0), "r"(b.s0), "r"(c));
asm volatile ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s1) : "r"(a.s1), "r"(b.s1), "r"(c.s1)); asm volatile ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s1) : "r"(a.s1), "r"(b.s1), "r"(c));
#endif #endif
#if VECT_SIZE >= 4 #if VECT_SIZE >= 4
asm volatile ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s2) : "r"(a.s2), "r"(b.s2), "r"(c.s2)); asm volatile ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s2) : "r"(a.s2), "r"(b.s2), "r"(c));
asm volatile ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s3) : "r"(a.s3), "r"(b.s3), "r"(c.s3)); asm volatile ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s3) : "r"(a.s3), "r"(b.s3), "r"(c));
#endif #endif
#if VECT_SIZE >= 8 #if VECT_SIZE >= 8
asm volatile ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s4) : "r"(a.s4), "r"(b.s4), "r"(c.s4)); asm volatile ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s4) : "r"(a.s4), "r"(b.s4), "r"(c));
asm volatile ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s5) : "r"(a.s5), "r"(b.s5), "r"(c.s5)); asm volatile ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s5) : "r"(a.s5), "r"(b.s5), "r"(c));
asm volatile ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s6) : "r"(a.s6), "r"(b.s6), "r"(c.s6)); asm volatile ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s6) : "r"(a.s6), "r"(b.s6), "r"(c));
asm volatile ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s7) : "r"(a.s7), "r"(b.s7), "r"(c.s7)); asm volatile ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s7) : "r"(a.s7), "r"(b.s7), "r"(c));
#endif #endif
#if VECT_SIZE >= 16 #if VECT_SIZE >= 16
asm volatile ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s8) : "r"(a.s8), "r"(b.s8), "r"(c.s8)); asm volatile ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s8) : "r"(a.s8), "r"(b.s8), "r"(c));
asm volatile ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s9) : "r"(a.s9), "r"(b.s9), "r"(c.s9)); asm volatile ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s9) : "r"(a.s9), "r"(b.s9), "r"(c));
asm volatile ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.sa) : "r"(a.sa), "r"(b.sa), "r"(c.sa)); asm volatile ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.sa) : "r"(a.sa), "r"(b.sa), "r"(c));
asm volatile ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.sb) : "r"(a.sb), "r"(b.sb), "r"(c.sb)); asm volatile ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.sb) : "r"(a.sb), "r"(b.sb), "r"(c));
asm volatile ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.sc) : "r"(a.sc), "r"(b.sc), "r"(c.sc)); asm volatile ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.sc) : "r"(a.sc), "r"(b.sc), "r"(c));
asm volatile ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.sd) : "r"(a.sd), "r"(b.sd), "r"(c.sd)); asm volatile ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.sd) : "r"(a.sd), "r"(b.sd), "r"(c));
asm volatile ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.se) : "r"(a.se), "r"(b.se), "r"(c.se)); asm volatile ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.se) : "r"(a.se), "r"(b.se), "r"(c));
asm volatile ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.sf) : "r"(a.sf), "r"(b.sf), "r"(c.sf)); asm volatile ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.sf) : "r"(a.sf), "r"(b.sf), "r"(c));
#endif #endif
return r; return r;
} }
DECLSPEC u32 hc_byte_perm_S (const u32 a, const u32 b, const u32 c) DECLSPEC u32 hc_byte_perm_S (const u32 a, const u32 b, const int c)
{ {
u32 r; u32 r;
@ -1072,41 +1072,43 @@ DECLSPEC u32x hc_bytealign (const u32x a, const u32x b, const int c)
#if CUDA_ARCH >= 350 #if CUDA_ARCH >= 350
const int c38 = (c & 3) * 8;
#if VECT_SIZE == 1 #if VECT_SIZE == 1
asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r) : "r"(b), "r"(a), "r"((c & 3) * 8)); asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r) : "r"(b), "r"(a), "r"(c38));
#endif #endif
#if VECT_SIZE >= 2 #if VECT_SIZE >= 2
asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.s0) : "r"(b.s0), "r"(a.s0), "r"((c & 3) * 8)); asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.s0) : "r"(b.s0), "r"(a.s0), "r"(c38));
asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.s1) : "r"(b.s1), "r"(a.s1), "r"((c & 3) * 8)); asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.s1) : "r"(b.s1), "r"(a.s1), "r"(c38));
#endif #endif
#if VECT_SIZE >= 4 #if VECT_SIZE >= 4
asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.s2) : "r"(b.s2), "r"(a.s2), "r"((c & 3) * 8)); asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.s2) : "r"(b.s2), "r"(a.s2), "r"(c38));
asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.s3) : "r"(b.s3), "r"(a.s3), "r"((c & 3) * 8)); asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.s3) : "r"(b.s3), "r"(a.s3), "r"(c38));
#endif #endif
#if VECT_SIZE >= 8 #if VECT_SIZE >= 8
asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.s4) : "r"(b.s4), "r"(a.s4), "r"((c & 3) * 8)); asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.s4) : "r"(b.s4), "r"(a.s4), "r"(c38));
asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.s5) : "r"(b.s5), "r"(a.s5), "r"((c & 3) * 8)); asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.s5) : "r"(b.s5), "r"(a.s5), "r"(c38));
asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.s6) : "r"(b.s6), "r"(a.s6), "r"((c & 3) * 8)); asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.s6) : "r"(b.s6), "r"(a.s6), "r"(c38));
asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.s7) : "r"(b.s7), "r"(a.s7), "r"((c & 3) * 8)); asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.s7) : "r"(b.s7), "r"(a.s7), "r"(c38));
#endif #endif
#if VECT_SIZE >= 16 #if VECT_SIZE >= 16
asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.s8) : "r"(b.s8), "r"(a.s8), "r"((c & 3) * 8)); asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.s8) : "r"(b.s8), "r"(a.s8), "r"(c38));
asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.s9) : "r"(b.s9), "r"(a.s9), "r"((c & 3) * 8)); asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.s9) : "r"(b.s9), "r"(a.s9), "r"(c38));
asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.sa) : "r"(b.sa), "r"(a.sa), "r"((c & 3) * 8)); asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.sa) : "r"(b.sa), "r"(a.sa), "r"(c38));
asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.sb) : "r"(b.sb), "r"(a.sb), "r"((c & 3) * 8)); asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.sb) : "r"(b.sb), "r"(a.sb), "r"(c38));
asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.sc) : "r"(b.sc), "r"(a.sc), "r"((c & 3) * 8)); asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.sc) : "r"(b.sc), "r"(a.sc), "r"(c38));
asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.sd) : "r"(b.sd), "r"(a.sd), "r"((c & 3) * 8)); asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.sd) : "r"(b.sd), "r"(a.sd), "r"(c38));
asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.se) : "r"(b.se), "r"(a.se), "r"((c & 3) * 8)); asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.se) : "r"(b.se), "r"(a.se), "r"(c38));
asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.sf) : "r"(b.sf), "r"(a.sf), "r"((c & 3) * 8)); asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.sf) : "r"(b.sf), "r"(a.sf), "r"(c38));
#endif #endif
#else #else
r = hc_byte_perm (b, a, ((u32x) (0x76543210) >> ((c & 3) * 4)) & 0xffff); r = hc_byte_perm (b, a, (0x76543210 >> ((c & 3) * 4)) & 0xffff);
#endif #endif
@ -1119,7 +1121,9 @@ DECLSPEC u32 hc_bytealign_S (const u32 a, const u32 b, const int c)
#if CUDA_ARCH >= 350 #if CUDA_ARCH >= 350
asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r) : "r"(b), "r"(a), "r"((c & 3) * 8)); const int c38 = (c & 3) * 8;
asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r) : "r"(b), "r"(a), "r"(c38));
#else #else

View File

@ -82,18 +82,40 @@
#define KERN_ATTR_VECTOR() KERN_ATTR (GLOBAL_AS, CONSTANT_AS const u32x * restrict words_buf_r, void, void, void) #define KERN_ATTR_VECTOR() KERN_ATTR (GLOBAL_AS, CONSTANT_AS const u32x * restrict words_buf_r, void, void, void)
#define KERN_ATTR_VECTOR_ESALT(e) KERN_ATTR (GLOBAL_AS, CONSTANT_AS const u32x * restrict words_buf_r, void, void, e) #define KERN_ATTR_VECTOR_ESALT(e) KERN_ATTR (GLOBAL_AS, CONSTANT_AS const u32x * restrict words_buf_r, void, void, e)
DECLSPEC u32x hc_add3 (const u32x a, const u32x b, const u32x c); // union based packing
DECLSPEC u32 hc_add3_S (const u32 a, const u32 b, const u32 c);
DECLSPEC u32x hc_bfe (const u32x a, const u32x b, const u32x c); DECLSPEC u8 v8a_from_v32_S (const u32 v32);
DECLSPEC u32 hc_bfe_S (const u32 a, const u32 b, const u32 c); DECLSPEC u8 v8b_from_v32_S (const u32 v32);
DECLSPEC u32x hc_bytealign (const u32x a, const u32x b, const int c); DECLSPEC u8 v8c_from_v32_S (const u32 v32);
DECLSPEC u32 hc_bytealign_S (const u32 a, const u32 b, const int c); DECLSPEC u8 v8d_from_v32_S (const u32 v32);
DECLSPEC u32x hc_bytealign_be (const u32x a, const u32x b, const int c);
DECLSPEC u32 hc_bytealign_be_S (const u32 a, const u32 b, const int c); DECLSPEC u16 v16a_from_v32_S (const u32 v32);
DECLSPEC u32x hc_byte_perm (const u32x a, const u32x b, const u32x c); DECLSPEC u16 v16b_from_v32_S (const u32 v32);
DECLSPEC u32 hc_byte_perm_S (const u32 a, const u32 b, const u32 c);
DECLSPEC u32x hc_lop_0x96 (const u32x a, const u32x b, const u32x c); DECLSPEC u32 v32a_from_v64_S (const u64 v64);
DECLSPEC u32 hc_lop_0x96_S (const u32 a, const u32 b, const u32 c); DECLSPEC u32 v32b_from_v64_S (const u64 v64);
DECLSPEC u32 v32_from_v16ab_S (const u16 v16a, const u16 v16b);
DECLSPEC u64 v64_from_v32ab_S (const u32 v32a, const u32 v32b);
// inline asm packing
DECLSPEC u32 unpack_v8a_from_v32_S (const u32 v32);
DECLSPEC u32 unpack_v8b_from_v32_S (const u32 v32);
DECLSPEC u32 unpack_v8c_from_v32_S (const u32 v32);
DECLSPEC u32 unpack_v8d_from_v32_S (const u32 v32);
// opencl intern based packing
DECLSPEC u32x l32_from_64 (u64x a);
DECLSPEC u32x h32_from_64 (u64x a);
DECLSPEC u32 l32_from_64_S (u64 a);
DECLSPEC u32 h32_from_64_S (u64 a);
DECLSPEC u64x hl32_to_64 (const u32x a, const u32x b);
DECLSPEC u64 hl32_to_64_S (const u32 a, const u32 b);
// bit operations
DECLSPEC u32x hc_rotl32 (const u32x a, const int n); DECLSPEC u32x hc_rotl32 (const u32x a, const int n);
DECLSPEC u32x hc_rotl32 (const u32x a, const int n); DECLSPEC u32x hc_rotl32 (const u32x a, const int n);
@ -117,30 +139,23 @@ DECLSPEC u32 hc_swap32_S (const u32 v);
DECLSPEC u64x hc_swap64 (const u64x v); DECLSPEC u64x hc_swap64 (const u64x v);
DECLSPEC u64 hc_swap64_S (const u64 v); DECLSPEC u64 hc_swap64_S (const u64 v);
DECLSPEC u32x l32_from_64 (u64x a); // byte operations
DECLSPEC u32x h32_from_64 (u64x a);
DECLSPEC u32 l32_from_64_S (u64 a);
DECLSPEC u32 h32_from_64_S (u64 a);
DECLSPEC u64x hl32_to_64 (const u32x a, const u32x b); DECLSPEC u32x hc_bytealign (const u32x a, const u32x b, const int c);
DECLSPEC u64 hl32_to_64_S (const u32 a, const u32 b); DECLSPEC u32 hc_bytealign_S (const u32 a, const u32 b, const int c);
DECLSPEC u32x hc_bytealign_be (const u32x a, const u32x b, const int c);
DECLSPEC u32 hc_bytealign_be_S (const u32 a, const u32 b, const int c);
DECLSPEC u32x hc_byte_perm (const u32x a, const u32x b, const int c);
DECLSPEC u32 hc_byte_perm_S (const u32 a, const u32 b, const int c);
DECLSPEC u8 v8a_from_v32_S (const u32 v32); DECLSPEC u32x hc_add3 (const u32x a, const u32x b, const u32x c);
DECLSPEC u8 v8b_from_v32_S (const u32 v32); DECLSPEC u32 hc_add3_S (const u32 a, const u32 b, const u32 c);
DECLSPEC u8 v8c_from_v32_S (const u32 v32); DECLSPEC u32x hc_bfe (const u32x a, const u32x b, const u32x c);
DECLSPEC u8 v8d_from_v32_S (const u32 v32); DECLSPEC u32 hc_bfe_S (const u32 a, const u32 b, const u32 c);
DECLSPEC u16 v16a_from_v32_S (const u32 v32); DECLSPEC u32x hc_lop_0x96 (const u32x a, const u32x b, const u32x c);
DECLSPEC u16 v16b_from_v32_S (const u32 v32); DECLSPEC u32 hc_lop_0x96_S (const u32 a, const u32 b, const u32 c);
DECLSPEC u32 v32a_from_v64_S (const u64 v64);
DECLSPEC u32 v32b_from_v64_S (const u64 v64);
DECLSPEC u32 v32_from_v16ab_S (const u16 v16a, const u16 v16b); // legacy common code
DECLSPEC u64 v64_from_v32ab_S (const u32 v32a, const u32 v32b);
DECLSPEC u32 unpack_v8a_from_v32_S (const u32 v32);
DECLSPEC u32 unpack_v8b_from_v32_S (const u32 v32);
DECLSPEC u32 unpack_v8c_from_v32_S (const u32 v32);
DECLSPEC u32 unpack_v8d_from_v32_S (const u32 v32);
DECLSPEC int ffz (const u32 v); DECLSPEC int ffz (const u32 v);
DECLSPEC int hash_comp (const u32 *d1, GLOBAL_AS const u32 *d2); DECLSPEC int hash_comp (const u32 *d1, GLOBAL_AS const u32 *d2);