diff --git a/OpenCL/amp_a0.cl b/OpenCL/amp_a0.cl
index 2ae36b581..e09e3f6d8 100644
--- a/OpenCL/amp_a0.cl
+++ b/OpenCL/amp_a0.cl
@@ -6,6 +6,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
diff --git a/OpenCL/amp_a1.cl b/OpenCL/amp_a1.cl
index 5dda031bf..5ab241211 100644
--- a/OpenCL/amp_a1.cl
+++ b/OpenCL/amp_a1.cl
@@ -6,6 +6,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #endif
 
diff --git a/OpenCL/amp_a3.cl b/OpenCL/amp_a3.cl
index d7ce0ea77..075e3c6f3 100644
--- a/OpenCL/amp_a3.cl
+++ b/OpenCL/amp_a3.cl
@@ -6,6 +6,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #endif
 
 KERNEL_FQ void amp (GLOBAL_AS pw_t *pws, GLOBAL_AS pw_t *pws_amp, GLOBAL_AS const kernel_rule_t *rules_buf, GLOBAL_AS const pw_t *combs_buf, CONSTANT_AS bf_t *bfs_buf, const u32 combs_mode, const u64 gid_max)
diff --git a/OpenCL/inc_cipher_aes.cl b/OpenCL/inc_cipher_aes.cl
index 4ca9f937e..716addc39 100644
--- a/OpenCL/inc_cipher_aes.cl
+++ b/OpenCL/inc_cipher_aes.cl
@@ -5,10 +5,11 @@
 
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.h"
 #include "inc_common.h"
 #include "inc_cipher_aes.h"
 
-CONSTANT_AS u32a te0[256] =
+CONSTANT_VK u32a te0[256] =
 {
   0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
   0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554,
@@ -76,7 +77,7 @@ CONSTANT_AS u32a te0[256] =
   0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a,
 };
 
-CONSTANT_AS u32a te1[256] =
+CONSTANT_VK u32a te1[256] =
 {
   0xa5c66363, 0x84f87c7c, 0x99ee7777, 0x8df67b7b,
   0x0dfff2f2, 0xbdd66b6b, 0xb1de6f6f, 0x5491c5c5,
@@ -144,7 +145,7 @@ CONSTANT_AS u32a te1[256] =
   0xcb7bb0b0, 0xfca85454, 0xd66dbbbb, 0x3a2c1616,
 };
 
-CONSTANT_AS u32a te2[256] =
+CONSTANT_VK u32a te2[256] =
 {
   0x63a5c663, 0x7c84f87c, 0x7799ee77, 0x7b8df67b,
   0xf20dfff2, 0x6bbdd66b, 0x6fb1de6f, 0xc55491c5,
@@ -212,7 +213,7 @@ CONSTANT_AS u32a te2[256] =
   0xb0cb7bb0, 0x54fca854, 0xbbd66dbb, 0x163a2c16,
 };
 
-CONSTANT_AS u32a te3[256] =
+CONSTANT_VK u32a te3[256] =
 {
   0x6363a5c6, 0x7c7c84f8, 0x777799ee, 0x7b7b8df6,
   0xf2f20dff, 0x6b6bbdd6, 0x6f6fb1de, 0xc5c55491,
@@ -280,7 +281,7 @@ CONSTANT_AS u32a te3[256] =
   0xb0b0cb7b, 0x5454fca8, 0xbbbbd66d, 0x16163a2c,
 };
 
-CONSTANT_AS u32a te4[256] =
+CONSTANT_VK u32a te4[256] =
 {
   0x63636363, 0x7c7c7c7c, 0x77777777, 0x7b7b7b7b,
   0xf2f2f2f2, 0x6b6b6b6b, 0x6f6f6f6f, 0xc5c5c5c5,
@@ -348,7 +349,7 @@ CONSTANT_AS u32a te4[256] =
   0xb0b0b0b0, 0x54545454, 0xbbbbbbbb, 0x16161616,
 };
 
-CONSTANT_AS u32a td0[256] =
+CONSTANT_VK u32a td0[256] =
 {
   0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96,
   0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393,
@@ -416,7 +417,7 @@ CONSTANT_AS u32a td0[256] =
   0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742,
 };
 
-CONSTANT_AS u32a td1[256] =
+CONSTANT_VK u32a td1[256] =
 {
   0x5051f4a7, 0x537e4165, 0xc31a17a4, 0x963a275e,
   0xcb3bab6b, 0xf11f9d45, 0xabacfa58, 0x934be303,
@@ -484,7 +485,7 @@ CONSTANT_AS u32a td1[256] =
   0x617bcb84, 0x70d532b6, 0x74486c5c, 0x42d0b857,
 };
 
-CONSTANT_AS u32a td2[256] =
+CONSTANT_VK u32a td2[256] =
 {
   0xa75051f4, 0x65537e41, 0xa4c31a17, 0x5e963a27,
   0x6bcb3bab, 0x45f11f9d, 0x58abacfa, 0x03934be3,
@@ -552,7 +553,7 @@ CONSTANT_AS u32a td2[256] =
   0x84617bcb, 0xb670d532, 0x5c74486c, 0x5742d0b8,
 };
 
-CONSTANT_AS u32a td3[256] =
+CONSTANT_VK u32a td3[256] =
 {
   0xf4a75051, 0x4165537e, 0x17a4c31a, 0x275e963a,
   0xab6bcb3b, 0x9d45f11f, 0xfa58abac, 0xe303934b,
@@ -620,7 +621,7 @@ CONSTANT_AS u32a td3[256] =
   0xcb84617b, 0x32b670d5, 0x6c5c7448, 0xb85742d0,
 };
 
-CONSTANT_AS u32a td4[256] =
+CONSTANT_VK u32a td4[256] =
 {
   0x52525252, 0x09090909, 0x6a6a6a6a, 0xd5d5d5d5,
   0x30303030, 0x36363636, 0xa5a5a5a5, 0x38383838,
diff --git a/OpenCL/inc_cipher_camellia.cl b/OpenCL/inc_cipher_camellia.cl
index 7e08c163b..08c73e6f4 100644
--- a/OpenCL/inc_cipher_camellia.cl
+++ b/OpenCL/inc_cipher_camellia.cl
@@ -17,10 +17,11 @@
 
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.h"
 #include "inc_common.h"
 #include "inc_cipher_camellia.h"
 
-CONSTANT_AS u32a c_sbox[256] =
+CONSTANT_VK u32a c_sbox[256] =
 {
   0x70, 0x82, 0x2c, 0xec, 0xb3, 0x27, 0xc0, 0xe5,
   0xe4, 0x85, 0x57, 0x35, 0xea, 0x0c, 0xae, 0x41,
diff --git a/OpenCL/inc_cipher_des.cl b/OpenCL/inc_cipher_des.cl
index ec8a1e611..6c2e7fca8 100644
--- a/OpenCL/inc_cipher_des.cl
+++ b/OpenCL/inc_cipher_des.cl
@@ -5,10 +5,11 @@
 
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.h"
 #include "inc_common.h"
 #include "inc_cipher_des.h"
 
-CONSTANT_AS u32a c_SPtrans[8][64] =
+CONSTANT_VK u32a c_SPtrans[8][64] =
 {
   {
     /* nibble 0 */
@@ -164,7 +165,7 @@ CONSTANT_AS u32a c_SPtrans[8][64] =
   },
 };
 
-CONSTANT_AS u32a c_skb[8][64] =
+CONSTANT_VK u32a c_skb[8][64] =
 {
   {
     0x00000000, 0x00000010, 0x20000000, 0x20000010,
@@ -322,6 +323,9 @@ DECLSPEC void _des_crypt_encrypt (u32 *out, const u32 *in, const u32 *Kc, const
   r = hc_rotl32_S (r, 3u);
   l = hc_rotl32_S (l, 3u);
 
+  #ifdef _unroll
+  #pragma unroll
+  #endif
   for (u32 i = 0; i < 16; i += 2)
   {
     u32 u;
@@ -371,6 +375,9 @@ DECLSPEC void _des_crypt_decrypt (u32 *out, const u32 *in, const u32 *Kc, const
   r = hc_rotl32_S (r, 3u);
   l = hc_rotl32_S (l, 3u);
 
+  #ifdef _unroll
+  #pragma unroll
+  #endif
   for (u32 i = 16; i > 0; i -= 2)
   {
     u32 u;
@@ -426,6 +433,9 @@ DECLSPEC void _des_crypt_keysetup (u32 c, u32 d, u32 *Kc, u32 *Kd, SHM_TYPE u32
 
   c = c & 0x0fffffff;
 
+  #ifdef _unroll
+  #pragma unroll
+  #endif
   for (u32 i = 0; i < 16; i++)
   {
     if ((i < 2) || (i == 8) || (i == 15))
@@ -487,6 +497,9 @@ DECLSPEC void _des_crypt_encrypt_vect (u32x *out, const u32x *in, const u32x *Kc
   r = hc_rotl32 (r, 3u);
   l = hc_rotl32 (l, 3u);
 
+  #ifdef _unroll
+  #pragma unroll
+  #endif
   for (u32 i = 0; i < 16; i += 2)
   {
     u32x u;
@@ -536,6 +549,9 @@ DECLSPEC void _des_crypt_decrypt_vect (u32x *out, const u32x *in, const u32x *Kc
   r = hc_rotl32 (r, 3u);
   l = hc_rotl32 (l, 3u);
 
+  #ifdef _unroll
+  #pragma unroll
+  #endif
   for (u32 i = 16; i > 0; i -= 2)
   {
     u32x u;
@@ -591,6 +607,9 @@ DECLSPEC void _des_crypt_keysetup_vect (u32x c, u32x d, u32x *Kc, u32x *Kd, SHM_
 
   c = c & 0x0fffffff;
 
+  #ifdef _unroll
+  #pragma unroll
+  #endif
   for (u32 i = 0; i < 16; i++)
   {
     if ((i < 2) || (i == 8) || (i == 15))
diff --git a/OpenCL/inc_cipher_des.h b/OpenCL/inc_cipher_des.h
index d1bc516e7..c93ff422c 100644
--- a/OpenCL/inc_cipher_des.h
+++ b/OpenCL/inc_cipher_des.h
@@ -92,13 +92,13 @@
 #if   VECT_SIZE == 1
 #define DES_BOX(i,n,S) (S)[(n)][(i)]
 #elif VECT_SIZE == 2
-#define DES_BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1])
+#define DES_BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1])
 #elif VECT_SIZE == 4
-#define DES_BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3])
+#define DES_BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3])
 #elif VECT_SIZE == 8
-#define DES_BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7])
+#define DES_BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7])
 #elif VECT_SIZE == 16
-#define DES_BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf])
+#define DES_BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf])
 #endif
 
 DECLSPEC void _des_crypt_encrypt (u32 *out, const u32 *in, const u32 *Kc, const u32 *Kd, SHM_TYPE u32 (*s_SPtrans)[64]);
diff --git a/OpenCL/inc_cipher_kuznyechik.cl b/OpenCL/inc_cipher_kuznyechik.cl
index 6147bf8d0..3d9638e23 100644
--- a/OpenCL/inc_cipher_kuznyechik.cl
+++ b/OpenCL/inc_cipher_kuznyechik.cl
@@ -14,10 +14,11 @@
 
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.h"
 #include "inc_common.h"
 #include "inc_cipher_kuznyechik.h"
 
-CONSTANT_AS u32a k_sbox[256] =
+CONSTANT_VK u32a k_sbox[256] =
 {
   0xfc, 0xee, 0xdd, 0x11, 0xcf, 0x6e, 0x31, 0x16,
   0xfb, 0xc4, 0xfa, 0xda, 0x23, 0xc5, 0x04, 0x4d,
@@ -53,7 +54,7 @@ CONSTANT_AS u32a k_sbox[256] =
   0xd1, 0x66, 0xaf, 0xc2, 0x39, 0x4b, 0x63, 0xb6
 };
 
-CONSTANT_AS u32a k_sbox_inv[256] =
+CONSTANT_VK u32a k_sbox_inv[256] =
 {
   0xa5, 0x2d, 0x32, 0x8f, 0x0e, 0x30, 0x38, 0xc0,
   0x54, 0xe6, 0x9e, 0x39, 0x55, 0x7e, 0x52, 0x91,
@@ -91,12 +92,23 @@ CONSTANT_AS u32a k_sbox_inv[256] =
 
 #define extract_byte(x,n) (((x) >> (8 * (n))) & 0xff)
 
-#define k_lookup(w,sbox)                      \
-  for (int i = 0; i < 4; i++)                 \
-    w[i] = sbox[extract_byte (w[i], 0)] <<  0 \
-         | sbox[extract_byte (w[i], 1)] <<  8 \
-         | sbox[extract_byte (w[i], 2)] << 16 \
-         | sbox[extract_byte (w[i], 3)] << 24
+#define k_lookup(w,sbox)                        \
+    w[0] = sbox[extract_byte (w[0], 0)] <<  0   \
+         | sbox[extract_byte (w[0], 1)] <<  8   \
+         | sbox[extract_byte (w[0], 2)] << 16   \
+         | sbox[extract_byte (w[0], 3)] << 24;  \
+    w[1] = sbox[extract_byte (w[1], 0)] <<  0   \
+         | sbox[extract_byte (w[1], 1)] <<  8   \
+         | sbox[extract_byte (w[1], 2)] << 16   \
+         | sbox[extract_byte (w[1], 3)] << 24;  \
+    w[2] = sbox[extract_byte (w[2], 0)] <<  0   \
+         | sbox[extract_byte (w[2], 1)] <<  8   \
+         | sbox[extract_byte (w[2], 2)] << 16   \
+         | sbox[extract_byte (w[2], 3)] << 24;  \
+    w[3] = sbox[extract_byte (w[3], 0)] <<  0   \
+         | sbox[extract_byte (w[3], 1)] <<  8   \
+         | sbox[extract_byte (w[3], 2)] << 16   \
+         | sbox[extract_byte (w[3], 3)] << 24;
 
 #define k_xor(n)                      \
   for (int i = (n); i > 0; i /= 2)    \
diff --git a/OpenCL/inc_cipher_serpent.cl b/OpenCL/inc_cipher_serpent.cl
index 5bdb3c3d4..b4e21e2e0 100644
--- a/OpenCL/inc_cipher_serpent.cl
+++ b/OpenCL/inc_cipher_serpent.cl
@@ -18,6 +18,7 @@
 
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.h"
 #include "inc_common.h"
 #include "inc_cipher_serpent.h"
 
diff --git a/OpenCL/inc_cipher_twofish.cl b/OpenCL/inc_cipher_twofish.cl
index 2875f5fa9..5ba55e341 100644
--- a/OpenCL/inc_cipher_twofish.cl
+++ b/OpenCL/inc_cipher_twofish.cl
@@ -21,10 +21,11 @@
 
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.h"
 #include "inc_common.h"
 #include "inc_cipher_twofish.h"
 
-CONSTANT_AS u32a q_tab[2][256] =
+CONSTANT_VK u32a q_tab[2][256] =
 {
   {
     0xA9, 0x67, 0xB3, 0xE8, 0x04, 0xFD, 0xA3, 0x76, 0x9A, 0x92, 0x80, 0x78,
@@ -76,7 +77,7 @@ CONSTANT_AS u32a q_tab[2][256] =
   }
 };
 
-CONSTANT_AS u32a  m_tab[4][256] =
+CONSTANT_VK u32a  m_tab[4][256] =
 {
   { 0xBCBC3275, 0xECEC21F3, 0x202043C6, 0xB3B3C9F4, 0xDADA03DB, 0x02028B7B,
     0xE2E22BFB, 0x9E9EFAC8, 0xC9C9EC4A, 0xD4D409D3, 0x18186BE6, 0x1E1E9F6B,
@@ -392,6 +393,9 @@ DECLSPEC void twofish128_set_key (u32 *sk, u32 *lk, const u32 *ukey)
   sk[1] = mds_rem (me_key[0], mo_key[0]);
   sk[0] = mds_rem (me_key[1], mo_key[1]);
 
+  #ifdef _unroll
+  #pragma unroll
+  #endif
   for (int i = 0; i < 40; i += 2)
   {
     u32 a = 0x01010101 * i;
@@ -517,6 +521,9 @@ DECLSPEC void twofish256_set_key (u32 *sk, u32 *lk, const u32 *ukey)
   sk[1] = mds_rem (me_key[2], mo_key[2]);
   sk[0] = mds_rem (me_key[3], mo_key[3]);
 
+  #ifdef _unroll
+  #pragma unroll
+  #endif
   for (int i = 0; i < 40; i += 2)
   {
     u32 a = 0x01010101 * i;
diff --git a/OpenCL/inc_common.cl b/OpenCL/inc_common.cl
index d497d349e..9e1831d2f 100644
--- a/OpenCL/inc_common.cl
+++ b/OpenCL/inc_common.cl
@@ -5,6 +5,7 @@
 
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.h"
 #include "inc_common.h"
 
 /**
@@ -17,7 +18,7 @@ DECLSPEC u8 v8a_from_v32_S (const u32 v32)
 
   v.v32 = v32;
 
-  return v.v8a;
+  return v.v8.a;
 }
 
 DECLSPEC u8 v8b_from_v32_S (const u32 v32)
@@ -26,7 +27,7 @@ DECLSPEC u8 v8b_from_v32_S (const u32 v32)
 
   v.v32 = v32;
 
-  return v.v8b;
+  return v.v8.b;
 }
 
 DECLSPEC u8 v8c_from_v32_S (const u32 v32)
@@ -35,7 +36,7 @@ DECLSPEC u8 v8c_from_v32_S (const u32 v32)
 
   v.v32 = v32;
 
-  return v.v8c;
+  return v.v8.c;
 }
 
 DECLSPEC u8 v8d_from_v32_S (const u32 v32)
@@ -44,7 +45,7 @@ DECLSPEC u8 v8d_from_v32_S (const u32 v32)
 
   v.v32 = v32;
 
-  return v.v8d;
+  return v.v8.d;
 }
 
 DECLSPEC u16 v16a_from_v32_S (const u32 v32)
@@ -53,7 +54,7 @@ DECLSPEC u16 v16a_from_v32_S (const u32 v32)
 
   v.v32 = v32;
 
-  return v.v16a;
+  return v.v16.a;
 }
 
 DECLSPEC u16 v16b_from_v32_S (const u32 v32)
@@ -62,15 +63,15 @@ DECLSPEC u16 v16b_from_v32_S (const u32 v32)
 
   v.v32 = v32;
 
-  return v.v16b;
+  return v.v16.b;
 }
 
 DECLSPEC u32 v32_from_v16ab_S (const u16 v16a, const u16 v16b)
 {
   vconv32_t v;
 
-  v.v16a = v16a;
-  v.v16b = v16b;
+  v.v16.a = v16a;
+  v.v16.b = v16b;
 
   return v.v32;
 }
@@ -81,7 +82,7 @@ DECLSPEC u32 v32a_from_v64_S (const u64 v64)
 
   v.v64 = v64;
 
-  return v.v32a;
+  return v.v32.a;
 }
 
 DECLSPEC u32 v32b_from_v64_S (const u64 v64)
@@ -90,15 +91,15 @@ DECLSPEC u32 v32b_from_v64_S (const u64 v64)
 
   v.v64 = v64;
 
-  return v.v32b;
+  return v.v32.b;
 }
 
 DECLSPEC u64 v64_from_v32ab_S (const u32 v32a, const u32 v32b)
 {
   vconv64_t v;
 
-  v.v32a = v32a;
-  v.v32b = v32b;
+  v.v32.a = v32a;
+  v.v32.b = v32b;
 
   return v.v64;
 }
@@ -304,26 +305,32 @@ DECLSPEC u64x hl32_to_64 (const u32x a, const u32x b)
 
 DECLSPEC u32x hc_rotl32 (const u32x a, const int n)
 {
-  #ifdef _CPU_OPENCL_EMU_H
+  #if   defined _CPU_OPENCL_EMU_H
+  return rotl32 (a, n);
+  #elif defined IS_CUDA
   return rotl32 (a, n);
   #else
-  return rotate (a, (u32x) (n));
+  return rotate (a, make_u32x (n));
   #endif
 }
 
 DECLSPEC u32x hc_rotr32 (const u32x a, const int n)
 {
-  #ifdef _CPU_OPENCL_EMU_H
+  #if   defined _CPU_OPENCL_EMU_H
+  return rotr32 (a, n);
+  #elif defined IS_CUDA
   return rotr32 (a, n);
   #else
-  return rotate (a, (u32x) (32 - n));
+  return rotate (a, make_u32x (32 - n));
   #endif
 }
 
 DECLSPEC u32 hc_rotl32_S (const u32 a, const int n)
 {
-  #ifdef _CPU_OPENCL_EMU_H
+  #if   defined _CPU_OPENCL_EMU_H
   return rotl32 (a, n);
+  #elif defined IS_CUDA
+  return rotl32_S (a, n);
   #else
   return rotate (a, (u32) (n));
   #endif
@@ -331,8 +338,10 @@ DECLSPEC u32 hc_rotl32_S (const u32 a, const int n)
 
 DECLSPEC u32 hc_rotr32_S (const u32 a, const int n)
 {
-  #ifdef _CPU_OPENCL_EMU_H
+  #if   defined _CPU_OPENCL_EMU_H
   return rotr32 (a, n);
+  #elif defined IS_CUDA
+  return rotr32_S (a, n);
   #else
   return rotate (a, (u32) (32 - n));
   #endif
@@ -340,26 +349,32 @@ DECLSPEC u32 hc_rotr32_S (const u32 a, const int n)
 
 DECLSPEC u64x hc_rotl64 (const u64x a, const int n)
 {
-  #ifdef _CPU_OPENCL_EMU_H
+  #if   defined _CPU_OPENCL_EMU_H
+  return rotl64 (a, n);
+  #elif defined IS_CUDA
   return rotl64 (a, n);
   #else
-  return rotate (a, (u64x) (n));
+  return rotate (a, make_u64x (n));
   #endif
 }
 
 DECLSPEC u64x hc_rotr64 (const u64x a, const int n)
 {
-  #ifdef _CPU_OPENCL_EMU_H
+  #if   defined _CPU_OPENCL_EMU_H
+  return rotr64 (a, n);
+  #elif defined IS_CUDA
   return rotr64 (a, n);
   #else
-  return rotate (a, (u64x) (64 - n));
+  return rotate (a, make_u64x (64 - n));
   #endif
 }
 
 DECLSPEC u64 hc_rotl64_S (const u64 a, const int n)
 {
-  #ifdef _CPU_OPENCL_EMU_H
+  #if   defined _CPU_OPENCL_EMU_H
   return rotl64 (a, n);
+  #elif defined IS_CUDA
+  return rotl64_S (a, n);
   #else
   return rotate (a, (u64) (n));
   #endif
@@ -367,8 +382,10 @@ DECLSPEC u64 hc_rotl64_S (const u64 a, const int n)
 
 DECLSPEC u64 hc_rotr64_S (const u64 a, const int n)
 {
-  #ifdef _CPU_OPENCL_EMU_H
+  #if   defined _CPU_OPENCL_EMU_H
   return rotr64 (a, n);
+  #elif defined IS_CUDA
+  return rotr64_S (a, n);
   #else
   return rotate (a, (u64) (64 - n));
   #endif
@@ -454,9 +471,9 @@ DECLSPEC u32x hc_swap32 (const u32x v)
   #endif
 
   #else
-  r = bitselect (rotate (v, (u32x) (24)),
-                 rotate (v, (u32x) ( 8)),
-                            (u32x) (0x00ff00ff));
+  r = bitselect (rotate (v, make_u32x (24)),
+                 rotate (v, make_u32x ( 8)),
+                            make_u32x (0x00ff00ff));
   #endif
   #endif
 
@@ -672,13 +689,13 @@ DECLSPEC u64x hc_swap64 (const u64x v)
   #endif
 
   #else
-  r = bitselect (bitselect (rotate (v, (u64x) (24)),
-                            rotate (v, (u64x) ( 8)),
-                                       (u64x) (0x000000ff000000ff)),
-                 bitselect (rotate (v, (u64x) (56)),
-                            rotate (v, (u64x) (40)),
-                                       (u64x) (0x00ff000000ff0000)),
-                                       (u64x) (0xffff0000ffff0000));
+  r = bitselect (bitselect (rotate (v, make_u64x (24)),
+                            rotate (v, make_u64x ( 8)),
+                                       make_u64x (0x000000ff000000ff)),
+                 bitselect (rotate (v, make_u64x (56)),
+                            rotate (v, make_u64x (40)),
+                                       make_u64x (0x00ff000000ff0000)),
+                                       make_u64x (0xffff0000ffff0000));
   #endif
   #endif
 
@@ -730,7 +747,7 @@ DECLSPEC u64 hc_swap64_S (const u64 v)
 
 DECLSPEC u32x hc_bfe (const u32x a, const u32x b, const u32x c)
 {
-  #define BIT(x)      ((u32x) (1u) << (x))
+  #define BIT(x)      (make_u32x (1u) << (x))
   #define BIT_MASK(x) (BIT (x) - 1)
   #define BFE(x,y,z)  (((x) >> (y)) & BIT_MASK (z))
 
@@ -1164,7 +1181,7 @@ DECLSPEC u32 hc_lop_0x96_S (const u32 a, const u32 b, const u32 c)
 
 DECLSPEC u32x hc_bfe (const u32x a, const u32x b, const u32x c)
 {
-  #define BIT(x)      ((u32x) (1u) << (x))
+  #define BIT(x)      (make_u32x (1u) << (x))
   #define BIT_MASK(x) (BIT (x) - 1)
   #define BFE(x,y,z)  (((x) >> (y)) & BIT_MASK (z))
 
@@ -1415,8 +1432,8 @@ DECLSPEC int is_valid_hex_8 (const u8 v)
 {
   // direct lookup table is slower thanks to CMOV
 
-  if ((v >= '0') && (v <= '9')) return 1;
-  if ((v >= 'a') && (v <= 'f')) return 1;
+  if ((v >= (u8) '0') && (v <= (u8) '9')) return 1;
+  if ((v >= (u8) 'a') && (v <= (u8) 'f')) return 1;
 
   return 0;
 }
@@ -1433,10 +1450,10 @@ DECLSPEC int is_valid_hex_32 (const u32 v)
 
 DECLSPEC int is_valid_base58_8 (const u8 v)
 {
-  if (v > 'z') return 0;
-  if (v < '1') return 0;
-  if ((v > '9') && (v < 'A')) return 0;
-  if ((v > 'Z') && (v < 'a')) return 0;
+  if (v > (u8) 'z') return 0;
+  if (v < (u8) '1') return 0;
+  if ((v > (u8) '9') && (v < (u8) 'A')) return 0;
+  if ((v > (u8) 'Z') && (v < (u8) 'a')) return 0;
 
   return 1;
 }
@@ -60860,7 +60877,23 @@ KERNEL_FQ void gpu_memset (GLOBAL_AS uint4 *buf, const u32 value, const u64 gid_
 
   if (gid >= gid_max) return;
 
-  buf[gid] = (uint4) (value);
+  uint4 r;
+
+  #if   defined IS_NATIVE
+  r = value;
+  #elif defined IS_OPENCL
+  r.s0 = value;
+  r.s1 = value;
+  r.s2 = value;
+  r.s3 = value;
+  #elif defined IS_CUDA
+  r.x = value;
+  r.y = value;
+  r.z = value;
+  r.w = value;
+  #endif
+
+  buf[gid] = r;
 }
 
 KERNEL_FQ void gpu_atinit (GLOBAL_AS pw_t *buf, const u64 gid_max)
diff --git a/OpenCL/inc_common.h b/OpenCL/inc_common.h
index bdcb16d38..58a6bbeb5 100644
--- a/OpenCL/inc_common.h
+++ b/OpenCL/inc_common.h
@@ -26,6 +26,44 @@
  *   - P19: Type of the esalt_bufs structure with additional data, or void.
  */
 
+#ifdef IS_CUDA
+#define KERN_ATTR(p2,p4,p5,p6,p19)                              \
+  MAYBE_UNUSED GLOBAL_AS       pw_t          *pws,              \
+  MAYBE_UNUSED p2        const kernel_rule_t *g_rules_buf,      \
+  MAYBE_UNUSED GLOBAL_AS const pw_t          *combs_buf,        \
+  MAYBE_UNUSED p4,                                              \
+  MAYBE_UNUSED GLOBAL_AS p5                  *tmps,             \
+  MAYBE_UNUSED GLOBAL_AS p6                  *hooks,            \
+  MAYBE_UNUSED GLOBAL_AS const u32           *bitmaps_buf_s1_a, \
+  MAYBE_UNUSED GLOBAL_AS const u32           *bitmaps_buf_s1_b, \
+  MAYBE_UNUSED GLOBAL_AS const u32           *bitmaps_buf_s1_c, \
+  MAYBE_UNUSED GLOBAL_AS const u32           *bitmaps_buf_s1_d, \
+  MAYBE_UNUSED GLOBAL_AS const u32           *bitmaps_buf_s2_a, \
+  MAYBE_UNUSED GLOBAL_AS const u32           *bitmaps_buf_s2_b, \
+  MAYBE_UNUSED GLOBAL_AS const u32           *bitmaps_buf_s2_c, \
+  MAYBE_UNUSED GLOBAL_AS const u32           *bitmaps_buf_s2_d, \
+  MAYBE_UNUSED GLOBAL_AS       plain_t       *plains_buf,       \
+  MAYBE_UNUSED GLOBAL_AS const digest_t      *digests_buf,      \
+  MAYBE_UNUSED GLOBAL_AS       u32           *hashes_shown,     \
+  MAYBE_UNUSED GLOBAL_AS const salt_t        *salt_bufs,        \
+  MAYBE_UNUSED GLOBAL_AS const p19           *esalt_bufs,       \
+  MAYBE_UNUSED GLOBAL_AS       u32           *d_return_buf,     \
+  MAYBE_UNUSED GLOBAL_AS       void          *d_extra0_buf,     \
+  MAYBE_UNUSED GLOBAL_AS       void          *d_extra1_buf,     \
+  MAYBE_UNUSED GLOBAL_AS       void          *d_extra2_buf,     \
+  MAYBE_UNUSED GLOBAL_AS       void          *d_extra3_buf,     \
+  MAYBE_UNUSED           const u32            bitmap_mask,      \
+  MAYBE_UNUSED           const u32            bitmap_shift1,    \
+  MAYBE_UNUSED           const u32            bitmap_shift2,    \
+  MAYBE_UNUSED           const u32            salt_pos,         \
+  MAYBE_UNUSED           const u32            loop_pos,         \
+  MAYBE_UNUSED           const u32            loop_cnt,         \
+  MAYBE_UNUSED           const u32            il_cnt,           \
+  MAYBE_UNUSED           const u32            digests_cnt,      \
+  MAYBE_UNUSED           const u32            digests_offset,   \
+  MAYBE_UNUSED           const u32            combs_mode,       \
+  MAYBE_UNUSED           const u64            gid_max
+#else
 #define KERN_ATTR(p2,p4,p5,p6,p19)                              \
   MAYBE_UNUSED GLOBAL_AS       pw_t          *pws,              \
   MAYBE_UNUSED p2        const kernel_rule_t *rules_buf,        \
@@ -62,7 +100,7 @@
   MAYBE_UNUSED           const u32            digests_offset,   \
   MAYBE_UNUSED           const u32            combs_mode,       \
   MAYBE_UNUSED           const u64            gid_max
-
+#endif
 /*
  * Shortcut macros for usage in the actual kernels
  *
@@ -71,16 +109,29 @@
  * do not use rules or tmps, etc.
  */
 
-#define KERN_ATTR_BASIC()         KERN_ATTR (GLOBAL_AS,   GLOBAL_AS   const bf_t      *bfs_buf,     void, void, void)
-#define KERN_ATTR_BITSLICE()      KERN_ATTR (GLOBAL_AS,   CONSTANT_AS const bs_word_t *words_buf_r, void, void, void)
-#define KERN_ATTR_ESALT(e)        KERN_ATTR (GLOBAL_AS,   GLOBAL_AS   const bf_t      *bfs_buf,     void, void, e)
-#define KERN_ATTR_RULES()         KERN_ATTR (CONSTANT_AS, GLOBAL_AS   const bf_t      *bfs_buf,     void, void, void)
-#define KERN_ATTR_RULES_ESALT(e)  KERN_ATTR (CONSTANT_AS, GLOBAL_AS   const bf_t      *bfs_buf,     void, void, e)
-#define KERN_ATTR_TMPS(t)         KERN_ATTR (GLOBAL_AS,   GLOBAL_AS   const bf_t      *bfs_buf,     t,    void, void)
-#define KERN_ATTR_TMPS_ESALT(t,e) KERN_ATTR (GLOBAL_AS,   GLOBAL_AS   const bf_t      *bfs_buf,     t,    void, e)
-#define KERN_ATTR_TMPS_HOOKS(t,h) KERN_ATTR (GLOBAL_AS,   GLOBAL_AS   const bf_t      *bfs_buf,     t,    h,    void)
-#define KERN_ATTR_VECTOR()        KERN_ATTR (GLOBAL_AS,   CONSTANT_AS const u32x      *words_buf_r, void, void, void)
-#define KERN_ATTR_VECTOR_ESALT(e) KERN_ATTR (GLOBAL_AS,   CONSTANT_AS const u32x      *words_buf_r, void, void, e)
+#ifdef IS_CUDA
+#define KERN_ATTR_BASIC()         KERN_ATTR (GLOBAL_AS,   GLOBAL_AS   const bf_t      *g_bfs_buf,     void, void, void)
+#define KERN_ATTR_BITSLICE()      KERN_ATTR (GLOBAL_AS,   GLOBAL_AS   const bs_word_t *g_words_buf_s, void, void, void)
+#define KERN_ATTR_ESALT(e)        KERN_ATTR (GLOBAL_AS,   GLOBAL_AS   const bf_t      *g_bfs_buf,     void, void, e)
+#define KERN_ATTR_RULES()         KERN_ATTR (GLOBAL_AS,   GLOBAL_AS   const bf_t      *g_bfs_buf,     void, void, void)
+#define KERN_ATTR_RULES_ESALT(e)  KERN_ATTR (GLOBAL_AS,   GLOBAL_AS   const bf_t      *g_bfs_buf,     void, void, e)
+#define KERN_ATTR_TMPS(t)         KERN_ATTR (GLOBAL_AS,   GLOBAL_AS   const bf_t      *g_bfs_buf,     t,    void, void)
+#define KERN_ATTR_TMPS_ESALT(t,e) KERN_ATTR (GLOBAL_AS,   GLOBAL_AS   const bf_t      *g_bfs_buf,     t,    void, e)
+#define KERN_ATTR_TMPS_HOOKS(t,h) KERN_ATTR (GLOBAL_AS,   GLOBAL_AS   const bf_t      *g_bfs_buf,     t,    h,    void)
+#define KERN_ATTR_VECTOR()        KERN_ATTR (GLOBAL_AS,   GLOBAL_AS   const u32x      *g_words_buf_r, void, void, void)
+#define KERN_ATTR_VECTOR_ESALT(e) KERN_ATTR (GLOBAL_AS,   GLOBAL_AS   const u32x      *g_words_buf_r, void, void, e)
+#else
+#define KERN_ATTR_BASIC()         KERN_ATTR (GLOBAL_AS,   CONSTANT_AS const bf_t      *bfs_buf,       void, void, void)
+#define KERN_ATTR_BITSLICE()      KERN_ATTR (GLOBAL_AS,   CONSTANT_AS const bs_word_t *words_buf_s,   void, void, void)
+#define KERN_ATTR_ESALT(e)        KERN_ATTR (GLOBAL_AS,   CONSTANT_AS const bf_t      *bfs_buf,       void, void, e)
+#define KERN_ATTR_RULES()         KERN_ATTR (CONSTANT_AS, GLOBAL_AS   const bf_t      *bfs_buf,       void, void, void)
+#define KERN_ATTR_RULES_ESALT(e)  KERN_ATTR (CONSTANT_AS, GLOBAL_AS   const bf_t      *bfs_buf,       void, void, e)
+#define KERN_ATTR_TMPS(t)         KERN_ATTR (GLOBAL_AS,   CONSTANT_AS const bf_t      *bfs_buf,       t,    void, void)
+#define KERN_ATTR_TMPS_ESALT(t,e) KERN_ATTR (GLOBAL_AS,   CONSTANT_AS const bf_t      *bfs_buf,       t,    void, e)
+#define KERN_ATTR_TMPS_HOOKS(t,h) KERN_ATTR (GLOBAL_AS,   CONSTANT_AS const bf_t      *bfs_buf,       t,    h,    void)
+#define KERN_ATTR_VECTOR()        KERN_ATTR (GLOBAL_AS,   CONSTANT_AS const u32x      *words_buf_r,   void, void, void)
+#define KERN_ATTR_VECTOR_ESALT(e) KERN_ATTR (GLOBAL_AS,   CONSTANT_AS const u32x      *words_buf_r,   void, void, e)
+#endif
 
 // union based packing
 
diff --git a/OpenCL/inc_diskcryptor_xts.cl b/OpenCL/inc_diskcryptor_xts.cl
index 2abfdd21d..e643e879f 100644
--- a/OpenCL/inc_diskcryptor_xts.cl
+++ b/OpenCL/inc_diskcryptor_xts.cl
@@ -1,3 +1,17 @@
+/**
+ * Author......: See docs/credits.txt
+ * License.....: MIT
+ */
+
+#include "inc_vendor.h"
+#include "inc_types.h"
+#include "inc_platform.h"
+#include "inc_common.h"
+#include "inc_cipher_aes.h"
+#include "inc_cipher_serpent.h"
+#include "inc_cipher_twofish.h"
+#include "inc_diskcryptor_xts.h"
+
 DECLSPEC void dcrp_xts_mul2 (u32 *in, u32 *out)
 {
   const u32 c = in[3] >> 31;
diff --git a/OpenCL/inc_hash_md4.cl b/OpenCL/inc_hash_md4.cl
index 3b9113907..28720b25c 100644
--- a/OpenCL/inc_hash_md4.cl
+++ b/OpenCL/inc_hash_md4.cl
@@ -5,6 +5,7 @@
 
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.h"
 #include "inc_common.h"
 #include "inc_hash_md4.h"
 
diff --git a/OpenCL/inc_hash_md5.cl b/OpenCL/inc_hash_md5.cl
index 66c58d282..2fee96f96 100644
--- a/OpenCL/inc_hash_md5.cl
+++ b/OpenCL/inc_hash_md5.cl
@@ -5,6 +5,7 @@
 
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.h"
 #include "inc_common.h"
 #include "inc_hash_md5.h"
 
diff --git a/OpenCL/inc_hash_ripemd160.cl b/OpenCL/inc_hash_ripemd160.cl
index 73715e9ea..703f0dcc8 100644
--- a/OpenCL/inc_hash_ripemd160.cl
+++ b/OpenCL/inc_hash_ripemd160.cl
@@ -5,6 +5,7 @@
 
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.h"
 #include "inc_common.h"
 #include "inc_hash_ripemd160.h"
 
diff --git a/OpenCL/inc_hash_sha1.cl b/OpenCL/inc_hash_sha1.cl
index dcc6ceb04..6ec45c6ba 100644
--- a/OpenCL/inc_hash_sha1.cl
+++ b/OpenCL/inc_hash_sha1.cl
@@ -5,6 +5,7 @@
 
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.h"
 #include "inc_common.h"
 #include "inc_hash_sha1.h"
 
diff --git a/OpenCL/inc_hash_sha224.cl b/OpenCL/inc_hash_sha224.cl
index ed4f81b0f..e93206e44 100644
--- a/OpenCL/inc_hash_sha224.cl
+++ b/OpenCL/inc_hash_sha224.cl
@@ -5,10 +5,11 @@
 
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.h"
 #include "inc_common.h"
 #include "inc_hash_sha224.h"
 
-CONSTANT_AS u32a k_sha224[64] =
+CONSTANT_VK u32a k_sha224[64] =
 {
   SHA224C00, SHA224C01, SHA224C02, SHA224C03,
   SHA224C04, SHA224C05, SHA224C06, SHA224C07,
diff --git a/OpenCL/inc_hash_sha256.cl b/OpenCL/inc_hash_sha256.cl
index 464984b15..de2bd5897 100644
--- a/OpenCL/inc_hash_sha256.cl
+++ b/OpenCL/inc_hash_sha256.cl
@@ -5,10 +5,11 @@
 
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.h"
 #include "inc_common.h"
 #include "inc_hash_sha256.h"
 
-CONSTANT_AS u32a k_sha256[64] =
+CONSTANT_VK u32a k_sha256[64] =
 {
   SHA256C00, SHA256C01, SHA256C02, SHA256C03,
   SHA256C04, SHA256C05, SHA256C06, SHA256C07,
diff --git a/OpenCL/inc_hash_sha384.cl b/OpenCL/inc_hash_sha384.cl
index 8817ab772..ea26ec734 100644
--- a/OpenCL/inc_hash_sha384.cl
+++ b/OpenCL/inc_hash_sha384.cl
@@ -5,10 +5,11 @@
 
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.h"
 #include "inc_common.h"
 #include "inc_hash_sha384.h"
 
-CONSTANT_AS u64a k_sha384[80] =
+CONSTANT_VK u64a k_sha384[80] =
 {
   SHA512C00, SHA512C01, SHA512C02, SHA512C03,
   SHA512C04, SHA512C05, SHA512C06, SHA512C07,
diff --git a/OpenCL/inc_hash_sha512.cl b/OpenCL/inc_hash_sha512.cl
index 02e2a41df..783a66fbe 100644
--- a/OpenCL/inc_hash_sha512.cl
+++ b/OpenCL/inc_hash_sha512.cl
@@ -5,10 +5,11 @@
 
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.h"
 #include "inc_common.h"
 #include "inc_hash_sha512.h"
 
-CONSTANT_AS u64a k_sha512[80] =
+CONSTANT_VK u64a k_sha512[80] =
 {
   SHA512C00, SHA512C01, SHA512C02, SHA512C03,
   SHA512C04, SHA512C05, SHA512C06, SHA512C07,
diff --git a/OpenCL/inc_hash_streebog256.cl b/OpenCL/inc_hash_streebog256.cl
index e14ad4007..92534db1d 100644
--- a/OpenCL/inc_hash_streebog256.cl
+++ b/OpenCL/inc_hash_streebog256.cl
@@ -5,10 +5,11 @@
 
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.h"
 #include "inc_common.h"
 #include "inc_hash_streebog256.h"
 
-CONSTANT_AS u64a sbob256_sl64[8][256] =
+CONSTANT_VK u64a sbob256_sl64[8][256] =
 {
   {
     0xd031c397ce553fe6, 0x16ba5b01b006b525, 0xa89bade6296e70c8, 0x6a1f525d77d3435b,
@@ -540,7 +541,7 @@ CONSTANT_AS u64a sbob256_sl64[8][256] =
   },
 };
 
-CONSTANT_AS u64a sbob256_rc64[12][8] =
+CONSTANT_VK u64a sbob256_rc64[12][8] =
 {
   {
     0xe9daca1eda5b08b1, 0x1f7c65c0812fcbeb, 0x16d0452e43766a2f, 0xfcc485758db84e71,
@@ -1362,7 +1363,7 @@ DECLSPEC void streebog256_add_vector (u64x *x, const u64x *y)
     const u64x right = hc_swap64 (y[i]);
     const u64x sum   = left + right + carry;
 
-    carry = (sum < left) ? (u64x) 1 : (u64x) 0;
+    carry = (sum < left) ? make_u64x (1) : make_u64x (0);
 
     x[i] = hc_swap64 (sum);
   }
@@ -1710,7 +1711,7 @@ DECLSPEC void streebog256_final_vector (streebog256_ctx_vector_t *ctx)
   streebog256_g_vector (ctx->h, ctx->n, m, ctx->s_sbob_sl64);
 
   u64x sizebuf[8] = { 0 };
-  sizebuf[7] = hc_swap64 ((u64x) (pos << 3));
+  sizebuf[7] = hc_swap64 (make_u64x (pos << 3));
 
   streebog256_add_vector (ctx->n, sizebuf);
 
diff --git a/OpenCL/inc_hash_streebog256.h b/OpenCL/inc_hash_streebog256.h
index 1e593c3f2..71e87cf96 100644
--- a/OpenCL/inc_hash_streebog256.h
+++ b/OpenCL/inc_hash_streebog256.h
@@ -10,17 +10,17 @@
 #define BOX(S,n,i)        ((S)[(n)][(i)])
 
 #elif VECT_SIZE == 2
-#define BOX(S,n,i) (u64x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1])
+#define BOX(S,n,i) make_u64x ((S)[(n)][(i).s0], (S)[(n)][(i).s1])
 
 #elif VECT_SIZE == 4
-#define BOX(S,n,i) (u64x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3])
+#define BOX(S,n,i) make_u64x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3])
 
 #elif VECT_SIZE == 8
-#define BOX(S,n,i) (u64x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], \
+#define BOX(S,n,i) make_u64x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], \
                            (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7])
 
 #elif VECT_SIZE == 16
-#define BOX(S,n,i) (u64x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], \
+#define BOX(S,n,i) make_u64x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], \
                            (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], \
                            (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], \
                            (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf])
diff --git a/OpenCL/inc_hash_streebog512.cl b/OpenCL/inc_hash_streebog512.cl
index e32c4169d..08580981d 100644
--- a/OpenCL/inc_hash_streebog512.cl
+++ b/OpenCL/inc_hash_streebog512.cl
@@ -5,10 +5,11 @@
 
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.h"
 #include "inc_common.h"
 #include "inc_hash_streebog512.h"
 
-CONSTANT_AS u64a sbob512_sl64[8][256] =
+CONSTANT_VK u64a sbob512_sl64[8][256] =
 {
   {
     0xd031c397ce553fe6, 0x16ba5b01b006b525, 0xa89bade6296e70c8, 0x6a1f525d77d3435b,
@@ -540,7 +541,7 @@ CONSTANT_AS u64a sbob512_sl64[8][256] =
   },
 };
 
-CONSTANT_AS u64a sbob512_rc64[12][8] =
+CONSTANT_VK u64a sbob512_rc64[12][8] =
 {
   {
     0xe9daca1eda5b08b1, 0x1f7c65c0812fcbeb, 0x16d0452e43766a2f, 0xfcc485758db84e71,
@@ -1381,7 +1382,7 @@ DECLSPEC void streebog512_add_vector (u64x *x, const u64x *y)
     const u64x right = hc_swap64 (y[i]);
     const u64x sum   = left + right + carry;
 
-    carry = (sum < left) ? (u64x) 1 : (u64x) 0;
+    carry = (sum < left) ? make_u64x (1) : make_u64x (0);
 
     x[i] = hc_swap64 (sum);
   }
@@ -1729,7 +1730,7 @@ DECLSPEC void streebog512_final_vector (streebog512_ctx_vector_t *ctx)
   streebog512_g_vector (ctx->h, ctx->n, m, ctx->s_sbob_sl64);
 
   u64x sizebuf[8] = { 0 };
-  sizebuf[7] = hc_swap64 ((u64x) (pos << 3));
+  sizebuf[7] = hc_swap64 (make_u64x (pos << 3));
 
   streebog512_add_vector (ctx->n, sizebuf);
 
diff --git a/OpenCL/inc_hash_streebog512.h b/OpenCL/inc_hash_streebog512.h
index a11644ca2..4181674e8 100644
--- a/OpenCL/inc_hash_streebog512.h
+++ b/OpenCL/inc_hash_streebog512.h
@@ -10,17 +10,17 @@
 #define BOX(S,n,i)        ((S)[(n)][(i)])
 
 #elif VECT_SIZE == 2
-#define BOX(S,n,i) (u64x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1])
+#define BOX(S,n,i) make_u64x ((S)[(n)][(i).s0], (S)[(n)][(i).s1])
 
 #elif VECT_SIZE == 4
-#define BOX(S,n,i) (u64x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3])
+#define BOX(S,n,i) make_u64x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3])
 
 #elif VECT_SIZE == 8
-#define BOX(S,n,i) (u64x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], \
+#define BOX(S,n,i) make_u64x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], \
                            (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7])
 
 #elif VECT_SIZE == 16
-#define BOX(S,n,i) (u64x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], \
+#define BOX(S,n,i) make_u64x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], \
                            (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], \
                            (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], \
                            (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf])
diff --git a/OpenCL/inc_hash_whirlpool.cl b/OpenCL/inc_hash_whirlpool.cl
index 0b61a7b9d..b4933e6f7 100644
--- a/OpenCL/inc_hash_whirlpool.cl
+++ b/OpenCL/inc_hash_whirlpool.cl
@@ -5,10 +5,11 @@
 
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.h"
 #include "inc_common.h"
 #include "inc_hash_whirlpool.h"
 
-CONSTANT_AS u32a Ch[8][256] =
+CONSTANT_VK u32a Ch[8][256] =
 {
   {
     0x18186018, 0x23238c23, 0xc6c63fc6, 0xe8e887e8,
@@ -540,7 +541,7 @@ CONSTANT_AS u32a Ch[8][256] =
   }
 };
 
-CONSTANT_AS u32a Cl[8][256] =
+CONSTANT_VK u32a Cl[8][256] =
 {
   {
     0xc07830d8, 0x05af4626, 0x7ef991b8, 0x136fcdfb,
@@ -1072,7 +1073,7 @@ CONSTANT_AS u32a Cl[8][256] =
   },
 };
 
-CONSTANT_AS u32a rch[R + 1] =
+CONSTANT_VK u32a rch[R + 1] =
 {
   0x00000000,
   0x1823c6e8,
@@ -1087,7 +1088,7 @@ CONSTANT_AS u32a rch[R + 1] =
   0xca2dbf07,
 };
 
-CONSTANT_AS u32a rcl[R + 1] =
+CONSTANT_VK u32a rcl[R + 1] =
 {
   0x00000000,
   0x87b8014f,
diff --git a/OpenCL/inc_hash_whirlpool.h b/OpenCL/inc_hash_whirlpool.h
index 5f25fbdc4..882bdd75b 100644
--- a/OpenCL/inc_hash_whirlpool.h
+++ b/OpenCL/inc_hash_whirlpool.h
@@ -11,13 +11,13 @@
 #if   VECT_SIZE == 1
 #define BOX(S,n,i) (S)[(n)][(i)]
 #elif VECT_SIZE == 2
-#define BOX(S,n,i) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1])
+#define BOX(S,n,i) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1])
 #elif VECT_SIZE == 4
-#define BOX(S,n,i) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3])
+#define BOX(S,n,i) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3])
 #elif VECT_SIZE == 8
-#define BOX(S,n,i) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7])
+#define BOX(S,n,i) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7])
 #elif VECT_SIZE == 16
-#define BOX(S,n,i) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf])
+#define BOX(S,n,i) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf])
 #endif
 
 #define BOX_S(S,n,i) (S)[(n)][(i)]
diff --git a/OpenCL/inc_luks_aes.cl b/OpenCL/inc_luks_aes.cl
index 6306edd94..8406ec1eb 100644
--- a/OpenCL/inc_luks_aes.cl
+++ b/OpenCL/inc_luks_aes.cl
@@ -5,6 +5,7 @@
 
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.h"
 #include "inc_common.h"
 #include "inc_cipher_aes.h"
 #include "inc_luks_af.h"
diff --git a/OpenCL/inc_luks_af.cl b/OpenCL/inc_luks_af.cl
index 058143ca2..53a03e8cf 100644
--- a/OpenCL/inc_luks_af.cl
+++ b/OpenCL/inc_luks_af.cl
@@ -5,6 +5,7 @@
 
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.h"
 #include "inc_common.h"
 #include "inc_hash_ripemd160.h"
 #include "inc_hash_sha1.h"
diff --git a/OpenCL/inc_luks_essiv.cl b/OpenCL/inc_luks_essiv.cl
index 4465617ab..151284bd0 100644
--- a/OpenCL/inc_luks_essiv.cl
+++ b/OpenCL/inc_luks_essiv.cl
@@ -5,6 +5,7 @@
 
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.h"
 #include "inc_common.h"
 #include "inc_hash_sha256.h"
 #include "inc_luks_essiv.h"
diff --git a/OpenCL/inc_luks_serpent.cl b/OpenCL/inc_luks_serpent.cl
index d4b1cf96c..1f75fc6a7 100644
--- a/OpenCL/inc_luks_serpent.cl
+++ b/OpenCL/inc_luks_serpent.cl
@@ -5,6 +5,7 @@
 
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.h"
 #include "inc_common.h"
 #include "inc_luks_serpent.h"
 
diff --git a/OpenCL/inc_luks_twofish.cl b/OpenCL/inc_luks_twofish.cl
index 9c496f2cf..2d2d7d34c 100644
--- a/OpenCL/inc_luks_twofish.cl
+++ b/OpenCL/inc_luks_twofish.cl
@@ -5,6 +5,7 @@
 
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.h"
 #include "inc_common.h"
 #include "inc_luks_twofish.h"
 
diff --git a/OpenCL/inc_luks_xts.cl b/OpenCL/inc_luks_xts.cl
index 5bd6e6410..a3b87abcc 100644
--- a/OpenCL/inc_luks_xts.cl
+++ b/OpenCL/inc_luks_xts.cl
@@ -5,6 +5,7 @@
 
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.h"
 #include "inc_common.h"
 #include "inc_luks_xts.h"
 
diff --git a/OpenCL/inc_platform.cl b/OpenCL/inc_platform.cl
new file mode 100644
index 000000000..16761cc27
--- /dev/null
+++ b/OpenCL/inc_platform.cl
@@ -0,0 +1,115 @@
+/**
+ * Author......: See docs/credits.txt
+ * License.....: MIT
+ */
+
+#include "inc_vendor.h"
+#include "inc_types.h"
+#include "inc_platform.h"
+
+#ifdef IS_NATIVE
+#define SYNC_THREADS()
+#endif
+
+#ifdef IS_CUDA
+
+#if ATTACK_EXEC == 11
+
+CONSTANT_VK u32 generic_constant[8192]; // 32k
+
+#if   ATTACK_KERN == 0
+#define bfs_buf     g_bfs_buf
+#define rules_buf   ((const kernel_rule_t *) generic_constant)
+#define words_buf_s g_words_buf_s
+#define words_buf_r g_words_buf_r
+#elif ATTACK_KERN == 1
+#define bfs_buf     g_bfs_buf
+#define rules_buf   g_rules_buf
+#define words_buf_s g_words_buf_s
+#define words_buf_r g_words_buf_r
+#elif ATTACK_KERN == 3
+#define rules_buf   g_rules_buf
+#define bfs_buf     ((const bf_t *)      generic_constant)
+#define words_buf_s ((const bs_word_t *) generic_constant)
+#define words_buf_r ((const u32x *)      generic_constant)
+#endif
+
+#endif
+
+DECLSPEC u32 atomic_dec (u32 *p)
+{
+  return atomicSub (p, 1);
+}
+
+DECLSPEC u32 atomic_inc (u32 *p)
+{
+  return atomicAdd (p, 1);
+}
+
+DECLSPEC u32 atomic_or (u32 *p, u32 val)
+{
+  return atomicOr (p, val);
+}
+
+DECLSPEC size_t get_global_id  (const u32 dimindx __attribute__((unused)))
+{
+  return (blockIdx.x * blockDim.x) + threadIdx.x;
+}
+
+DECLSPEC size_t get_local_id (const u32 dimindx __attribute__((unused)))
+{
+  return threadIdx.x;
+}
+
+DECLSPEC size_t get_local_size (const u32 dimindx __attribute__((unused)))
+{
+  // verify
+  return blockDim.x;
+}
+
+DECLSPEC u32x rotl32 (const u32x a, const int n)
+{
+  return ((a << n) | ((a >> (32 - n))));
+}
+
+DECLSPEC u32x rotr32 (const u32x a, const int n)
+{
+  return ((a >> n) | ((a << (32 - n))));
+}
+
+DECLSPEC u32 rotl32_S (const u32 a, const int n)
+{
+  return ((a << n) | ((a >> (32 - n))));
+}
+
+DECLSPEC u32 rotr32_S (const u32 a, const int n)
+{
+  return ((a >> n) | ((a << (32 - n))));
+}
+
+DECLSPEC u64x rotl64 (const u64x a, const int n)
+{
+  return ((a << n) | ((a >> (64 - n))));
+}
+
+DECLSPEC u64x rotr64 (const u64x a, const int n)
+{
+  return ((a >> n) | ((a << (64 - n))));
+}
+
+DECLSPEC u64 rotl64_S (const u64 a, const int n)
+{
+  return ((a << n) | ((a >> (64 - n))));
+}
+
+DECLSPEC u64 rotr64_S (const u64 a, const int n)
+{
+  return ((a >> n) | ((a << (64 - n))));
+}
+
+#define SYNC_THREADS() __syncthreads ()
+#endif
+
+#ifdef IS_OPENCL
+#define SYNC_THREADS() barrier (CLK_LOCAL_MEM_FENCE)
+#endif
diff --git a/OpenCL/inc_platform.h b/OpenCL/inc_platform.h
new file mode 100644
index 000000000..30069cda5
--- /dev/null
+++ b/OpenCL/inc_platform.h
@@ -0,0 +1,30 @@
+/**
+ * Author......: See docs/credits.txt
+ * License.....: MIT
+ */
+
+#ifndef _INC_PLATFORM_H
+#define _INC_PLATFORM_H
+
+#ifdef IS_CUDA
+DECLSPEC u32    atomic_dec      (u32 *p);
+DECLSPEC u32    atomic_inc      (u32 *p);
+DECLSPEC u32    atomic_or       (u32 *p, u32 val);
+DECLSPEC size_t get_global_id   (const u32 dimindx __attribute__((unused)));
+DECLSPEC size_t get_local_id    (const u32 dimindx __attribute__((unused)));
+DECLSPEC size_t get_local_size  (const u32 dimindx __attribute__((unused)));
+
+DECLSPEC u32x rotl32   (const u32x a, const int n);
+DECLSPEC u32x rotr32   (const u32x a, const int n);
+DECLSPEC u32  rotl32_S (const u32  a, const int n);
+DECLSPEC u32  rotr32_S (const u32  a, const int n);
+DECLSPEC u64x rotl64   (const u64x a, const int n);
+DECLSPEC u64x rotr64   (const u64x a, const int n);
+DECLSPEC u64  rotl64_S (const u64  a, const int n);
+DECLSPEC u64  rotr64_S (const u64  a, const int n);
+
+//#define rotate(a,n) (((a) << (n)) | ((a) >> (32 - (n))))
+#define bitselect(a,b,c) ((a) ^ ((c) & ((b) ^ (a))))
+#endif
+
+#endif // _INC_PLATFORM_H
diff --git a/OpenCL/inc_rp.cl b/OpenCL/inc_rp.cl
index ffa84f4ab..d67127317 100644
--- a/OpenCL/inc_rp.cl
+++ b/OpenCL/inc_rp.cl
@@ -5,6 +5,7 @@
 
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.h"
 #include "inc_common.h"
 #include "inc_rp.h"
 
@@ -14,7 +15,7 @@
 
 #ifdef REAL_SHM
 #define COPY_PW(x)                \
-  LOCAL_AS pw_t s_pws[64];         \
+  LOCAL_VK pw_t s_pws[64];         \
   s_pws[get_local_id (0)] = (x);
 #else
 #define COPY_PW(x)                \
diff --git a/OpenCL/inc_rp_optimized.cl b/OpenCL/inc_rp_optimized.cl
index 53a5d4d38..6a21bd688 100644
--- a/OpenCL/inc_rp_optimized.cl
+++ b/OpenCL/inc_rp_optimized.cl
@@ -5,6 +5,7 @@
 
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.h"
 #include "inc_common.h"
 #include "inc_rp_optimized.h"
 
@@ -2354,7 +2355,7 @@ DECLSPEC u32 apply_rules_optimized (CONSTANT_AS const u32 *cmds, u32 *buf0, u32
   return out_len;
 }
 
-DECLSPEC u32x apply_rules_vect_optimized (const u32 *pw_buf0, const u32 *pw_buf1, const u32 pw_len, CONSTANT_AS const kernel_rule_t *rules_buf, const u32 il_pos, u32x *buf0, u32x *buf1)
+DECLSPEC u32x apply_rules_vect_optimized (const u32 *pw_buf0, const u32 *pw_buf1, const u32 pw_len, CONSTANT_AS const kernel_rule_t *kernel_rules, const u32 il_pos, u32x *buf0, u32x *buf1)
 {
   #if VECT_SIZE == 1
 
@@ -2367,7 +2368,7 @@ DECLSPEC u32x apply_rules_vect_optimized (const u32 *pw_buf0, const u32 *pw_buf1
   buf1[2] = pw_buf1[2];
   buf1[3] = pw_buf1[3];
 
-  return apply_rules_optimized (rules_buf[il_pos].cmds, buf0, buf1, pw_len);
+  return apply_rules_optimized (kernel_rules[il_pos].cmds, buf0, buf1, pw_len);
 
   #else
 
@@ -2390,7 +2391,7 @@ DECLSPEC u32x apply_rules_vect_optimized (const u32 *pw_buf0, const u32 *pw_buf1
     tmp1[2] = pw_buf1[2];
     tmp1[3] = pw_buf1[3];
 
-    const u32 tmp_len = apply_rules_optimized (rules_buf[il_pos + i].cmds, tmp0, tmp1, pw_len);
+    const u32 tmp_len = apply_rules_optimized (kernel_rules[il_pos + i].cmds, tmp0, tmp1, pw_len);
 
     switch (i)
     {
diff --git a/OpenCL/inc_rp_optimized.h b/OpenCL/inc_rp_optimized.h
index c52113da0..b6a133086 100644
--- a/OpenCL/inc_rp_optimized.h
+++ b/OpenCL/inc_rp_optimized.h
@@ -123,6 +123,6 @@ DECLSPEC u32 toggle_on_register (const u32 in, const u32 r);
 DECLSPEC u32 rule_op_mangle_title_sep (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 *buf0, MAYBE_UNUSED u32 *buf1, const u32 in_len);
 DECLSPEC u32 apply_rule_optimized (const u32 name, const u32 p0, const u32 p1, u32 *buf0, u32 *buf1, const u32 in_len);
 DECLSPEC u32 apply_rules_optimized (CONSTANT_AS const u32 *cmds, u32 *buf0, u32 *buf1, const u32 len);
-DECLSPEC u32x apply_rules_vect_optimized (const u32 *pw_buf0, const u32 *pw_buf1, const u32 pw_len, CONSTANT_AS const kernel_rule_t *rules_buf, const u32 il_pos, u32x *buf0, u32x *buf1);
+DECLSPEC u32x apply_rules_vect_optimized (const u32 *pw_buf0, const u32 *pw_buf1, const u32 pw_len, CONSTANT_AS const kernel_rule_t *kernel_rules, const u32 il_pos, u32x *buf0, u32x *buf1);
 
 #endif // _INC_RP_OPTIMIZED_H
diff --git a/OpenCL/inc_simd.cl b/OpenCL/inc_simd.cl
index 26f413979..7bedf35cb 100644
--- a/OpenCL/inc_simd.cl
+++ b/OpenCL/inc_simd.cl
@@ -5,23 +5,24 @@
 
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.h"
 #include "inc_common.h"
 #include "inc_simd.h"
 
 // attack-mode 0
 
-DECLSPEC u32x ix_create_bft (GLOBAL_AS const bf_t *bfs_buf, const u32 il_pos)
+DECLSPEC u32x ix_create_bft (CONSTANT_AS const bf_t *arr, const u32 il_pos)
 {
   #if   VECT_SIZE == 1
-  const u32x ix = (u32x) (bfs_buf[il_pos + 0].i);
+  const u32x ix = make_u32x (arr[il_pos + 0].i);
   #elif VECT_SIZE == 2
-  const u32x ix = (u32x) (bfs_buf[il_pos + 0].i, bfs_buf[il_pos + 1].i);
+  const u32x ix = make_u32x (arr[il_pos + 0].i, arr[il_pos + 1].i);
   #elif VECT_SIZE == 4
-  const u32x ix = (u32x) (bfs_buf[il_pos + 0].i, bfs_buf[il_pos + 1].i, bfs_buf[il_pos + 2].i, bfs_buf[il_pos + 3].i);
+  const u32x ix = make_u32x (arr[il_pos + 0].i, arr[il_pos + 1].i, arr[il_pos + 2].i, arr[il_pos + 3].i);
   #elif VECT_SIZE == 8
-  const u32x ix = (u32x) (bfs_buf[il_pos + 0].i, bfs_buf[il_pos + 1].i, bfs_buf[il_pos + 2].i, bfs_buf[il_pos + 3].i, bfs_buf[il_pos + 4].i, bfs_buf[il_pos + 5].i, bfs_buf[il_pos + 6].i, bfs_buf[il_pos + 7].i);
+  const u32x ix = make_u32x (arr[il_pos + 0].i, arr[il_pos + 1].i, arr[il_pos + 2].i, arr[il_pos + 3].i, arr[il_pos + 4].i, arr[il_pos + 5].i, arr[il_pos + 6].i, arr[il_pos + 7].i);
   #elif VECT_SIZE == 16
-  const u32x ix = (u32x) (bfs_buf[il_pos + 0].i, bfs_buf[il_pos + 1].i, bfs_buf[il_pos + 2].i, bfs_buf[il_pos + 3].i, bfs_buf[il_pos + 4].i, bfs_buf[il_pos + 5].i, bfs_buf[il_pos + 6].i, bfs_buf[il_pos + 7].i, bfs_buf[il_pos + 8].i, bfs_buf[il_pos + 9].i, bfs_buf[il_pos + 10].i, bfs_buf[il_pos + 11].i, bfs_buf[il_pos + 12].i, bfs_buf[il_pos + 13].i, bfs_buf[il_pos + 14].i, bfs_buf[il_pos + 15].i);
+  const u32x ix = make_u32x (arr[il_pos + 0].i, arr[il_pos + 1].i, arr[il_pos + 2].i, arr[il_pos + 3].i, arr[il_pos + 4].i, arr[il_pos + 5].i, arr[il_pos + 6].i, arr[il_pos + 7].i, arr[il_pos + 8].i, arr[il_pos + 9].i, arr[il_pos + 10].i, arr[il_pos + 11].i, arr[il_pos + 12].i, arr[il_pos + 13].i, arr[il_pos + 14].i, arr[il_pos + 15].i);
   #endif
 
   return ix;
@@ -29,35 +30,35 @@ DECLSPEC u32x ix_create_bft (GLOBAL_AS const bf_t *bfs_buf, const u32 il_pos)
 
 // attack-mode 1
 
-DECLSPEC u32x pwlenx_create_combt (GLOBAL_AS const pw_t *combs_buf, const u32 il_pos)
+DECLSPEC u32x pwlenx_create_combt (GLOBAL_AS const pw_t *arr, const u32 il_pos)
 {
   #if   VECT_SIZE == 1
-  const u32x pw_lenx = (u32x) (combs_buf[il_pos + 0].pw_len);
+  const u32x pw_lenx = make_u32x (arr[il_pos + 0].pw_len);
   #elif VECT_SIZE == 2
-  const u32x pw_lenx = (u32x) (combs_buf[il_pos + 0].pw_len, combs_buf[il_pos + 1].pw_len);
+  const u32x pw_lenx = make_u32x (arr[il_pos + 0].pw_len, arr[il_pos + 1].pw_len);
   #elif VECT_SIZE == 4
-  const u32x pw_lenx = (u32x) (combs_buf[il_pos + 0].pw_len, combs_buf[il_pos + 1].pw_len, combs_buf[il_pos + 2].pw_len, combs_buf[il_pos + 3].pw_len);
+  const u32x pw_lenx = make_u32x (arr[il_pos + 0].pw_len, arr[il_pos + 1].pw_len, arr[il_pos + 2].pw_len, arr[il_pos + 3].pw_len);
   #elif VECT_SIZE == 8
-  const u32x pw_lenx = (u32x) (combs_buf[il_pos + 0].pw_len, combs_buf[il_pos + 1].pw_len, combs_buf[il_pos + 2].pw_len, combs_buf[il_pos + 3].pw_len, combs_buf[il_pos + 4].pw_len, combs_buf[il_pos + 5].pw_len, combs_buf[il_pos + 6].pw_len, combs_buf[il_pos + 7].pw_len);
+  const u32x pw_lenx = make_u32x (arr[il_pos + 0].pw_len, arr[il_pos + 1].pw_len, arr[il_pos + 2].pw_len, arr[il_pos + 3].pw_len, arr[il_pos + 4].pw_len, arr[il_pos + 5].pw_len, arr[il_pos + 6].pw_len, arr[il_pos + 7].pw_len);
   #elif VECT_SIZE == 16
-  const u32x pw_lenx = (u32x) (combs_buf[il_pos + 0].pw_len, combs_buf[il_pos + 1].pw_len, combs_buf[il_pos + 2].pw_len, combs_buf[il_pos + 3].pw_len, combs_buf[il_pos + 4].pw_len, combs_buf[il_pos + 5].pw_len, combs_buf[il_pos + 6].pw_len, combs_buf[il_pos + 7].pw_len, combs_buf[il_pos + 8].pw_len, combs_buf[il_pos + 9].pw_len, combs_buf[il_pos + 10].pw_len, combs_buf[il_pos + 11].pw_len, combs_buf[il_pos + 12].pw_len, combs_buf[il_pos + 13].pw_len, combs_buf[il_pos + 14].pw_len, combs_buf[il_pos + 15].pw_len);
+  const u32x pw_lenx = make_u32x (arr[il_pos + 0].pw_len, arr[il_pos + 1].pw_len, arr[il_pos + 2].pw_len, arr[il_pos + 3].pw_len, arr[il_pos + 4].pw_len, arr[il_pos + 5].pw_len, arr[il_pos + 6].pw_len, arr[il_pos + 7].pw_len, arr[il_pos + 8].pw_len, arr[il_pos + 9].pw_len, arr[il_pos + 10].pw_len, arr[il_pos + 11].pw_len, arr[il_pos + 12].pw_len, arr[il_pos + 13].pw_len, arr[il_pos + 14].pw_len, arr[il_pos + 15].pw_len);
   #endif
 
   return pw_lenx;
 }
 
-DECLSPEC u32x ix_create_combt (GLOBAL_AS const pw_t *combs_buf, const u32 il_pos, const int idx)
+DECLSPEC u32x ix_create_combt (GLOBAL_AS const pw_t *arr, const u32 il_pos, const int idx)
 {
   #if   VECT_SIZE == 1
-  const u32x ix = (u32x) (combs_buf[il_pos + 0].i[idx]);
+  const u32x ix = make_u32x (arr[il_pos + 0].i[idx]);
   #elif VECT_SIZE == 2
-  const u32x ix = (u32x) (combs_buf[il_pos + 0].i[idx], combs_buf[il_pos + 1].i[idx]);
+  const u32x ix = make_u32x (arr[il_pos + 0].i[idx], arr[il_pos + 1].i[idx]);
   #elif VECT_SIZE == 4
-  const u32x ix = (u32x) (combs_buf[il_pos + 0].i[idx], combs_buf[il_pos + 1].i[idx], combs_buf[il_pos + 2].i[idx], combs_buf[il_pos + 3].i[idx]);
+  const u32x ix = make_u32x (arr[il_pos + 0].i[idx], arr[il_pos + 1].i[idx], arr[il_pos + 2].i[idx], arr[il_pos + 3].i[idx]);
   #elif VECT_SIZE == 8
-  const u32x ix = (u32x) (combs_buf[il_pos + 0].i[idx], combs_buf[il_pos + 1].i[idx], combs_buf[il_pos + 2].i[idx], combs_buf[il_pos + 3].i[idx], combs_buf[il_pos + 4].i[idx], combs_buf[il_pos + 5].i[idx], combs_buf[il_pos + 6].i[idx], combs_buf[il_pos + 7].i[idx]);
+  const u32x ix = make_u32x (arr[il_pos + 0].i[idx], arr[il_pos + 1].i[idx], arr[il_pos + 2].i[idx], arr[il_pos + 3].i[idx], arr[il_pos + 4].i[idx], arr[il_pos + 5].i[idx], arr[il_pos + 6].i[idx], arr[il_pos + 7].i[idx]);
   #elif VECT_SIZE == 16
-  const u32x ix = (u32x) (combs_buf[il_pos + 0].i[idx], combs_buf[il_pos + 1].i[idx], combs_buf[il_pos + 2].i[idx], combs_buf[il_pos + 3].i[idx], combs_buf[il_pos + 4].i[idx], combs_buf[il_pos + 5].i[idx], combs_buf[il_pos + 6].i[idx], combs_buf[il_pos + 7].i[idx], combs_buf[il_pos + 8].i[idx], combs_buf[il_pos + 9].i[idx], combs_buf[il_pos + 10].i[idx], combs_buf[il_pos + 11].i[idx], combs_buf[il_pos + 12].i[idx], combs_buf[il_pos + 13].i[idx], combs_buf[il_pos + 14].i[idx], combs_buf[il_pos + 15].i[idx]);
+  const u32x ix = make_u32x (arr[il_pos + 0].i[idx], arr[il_pos + 1].i[idx], arr[il_pos + 2].i[idx], arr[il_pos + 3].i[idx], arr[il_pos + 4].i[idx], arr[il_pos + 5].i[idx], arr[il_pos + 6].i[idx], arr[il_pos + 7].i[idx], arr[il_pos + 8].i[idx], arr[il_pos + 9].i[idx], arr[il_pos + 10].i[idx], arr[il_pos + 11].i[idx], arr[il_pos + 12].i[idx], arr[il_pos + 13].i[idx], arr[il_pos + 14].i[idx], arr[il_pos + 15].i[idx]);
   #endif
 
   return ix;
diff --git a/OpenCL/inc_simd.h b/OpenCL/inc_simd.h
index f8e87e4d2..89ba41dab 100644
--- a/OpenCL/inc_simd.h
+++ b/OpenCL/inc_simd.h
@@ -1050,51 +1050,51 @@
 #define MATCHES_NONE_VS(a,b) !(MATCHES_ONE_VS ((a), (b)))
 
 #if   VECT_SIZE == 1
-#define packv(arr,var,gid,idx) (u32x) ((arr)[((gid) *  1) + 0].var[(idx)])
+#define packv(arr,var,gid,idx) make_u32x ((arr)[((gid) *  1) + 0].var[(idx)])
 #elif VECT_SIZE == 2
-#define packv(arr,var,gid,idx) (u32x) ((arr)[((gid) *  2) + 0].var[(idx)], (arr)[((gid) *  2) + 1].var[(idx)])
+#define packv(arr,var,gid,idx) make_u32x ((arr)[((gid) *  2) + 0].var[(idx)], (arr)[((gid) *  2) + 1].var[(idx)])
 #elif VECT_SIZE == 4
-#define packv(arr,var,gid,idx) (u32x) ((arr)[((gid) *  4) + 0].var[(idx)], (arr)[((gid) *  4) + 1].var[(idx)], (arr)[((gid) *  4) + 2].var[(idx)], (arr)[((gid) *  4) + 3].var[(idx)])
+#define packv(arr,var,gid,idx) make_u32x ((arr)[((gid) *  4) + 0].var[(idx)], (arr)[((gid) *  4) + 1].var[(idx)], (arr)[((gid) *  4) + 2].var[(idx)], (arr)[((gid) *  4) + 3].var[(idx)])
 #elif VECT_SIZE == 8
-#define packv(arr,var,gid,idx) (u32x) ((arr)[((gid) *  8) + 0].var[(idx)], (arr)[((gid) *  8) + 1].var[(idx)], (arr)[((gid) *  8) + 2].var[(idx)], (arr)[((gid) *  8) + 3].var[(idx)], (arr)[((gid) *  8) + 4].var[(idx)], (arr)[((gid) *  8) + 5].var[(idx)], (arr)[((gid) *  8) + 6].var[(idx)], (arr)[((gid) *  8) + 7].var[(idx)])
+#define packv(arr,var,gid,idx) make_u32x ((arr)[((gid) *  8) + 0].var[(idx)], (arr)[((gid) *  8) + 1].var[(idx)], (arr)[((gid) *  8) + 2].var[(idx)], (arr)[((gid) *  8) + 3].var[(idx)], (arr)[((gid) *  8) + 4].var[(idx)], (arr)[((gid) *  8) + 5].var[(idx)], (arr)[((gid) *  8) + 6].var[(idx)], (arr)[((gid) *  8) + 7].var[(idx)])
 #elif VECT_SIZE == 16
-#define packv(arr,var,gid,idx) (u32x) ((arr)[((gid) * 16) + 0].var[(idx)], (arr)[((gid) * 16) + 1].var[(idx)], (arr)[((gid) * 16) + 2].var[(idx)], (arr)[((gid) * 16) + 3].var[(idx)], (arr)[((gid) * 16) + 4].var[(idx)], (arr)[((gid) * 16) + 5].var[(idx)], (arr)[((gid) * 16) + 6].var[(idx)], (arr)[((gid) * 16) + 7].var[(idx)], (arr)[((gid) * 16) + 8].var[(idx)], (arr)[((gid) * 16) + 9].var[(idx)], (arr)[((gid) * 16) + 10].var[(idx)], (arr)[((gid) * 16) + 11].var[(idx)], (arr)[((gid) * 16) + 12].var[(idx)], (arr)[((gid) * 16) + 13].var[(idx)], (arr)[((gid) * 16) + 14].var[(idx)], (arr)[((gid) * 16) + 15].var[(idx)])
+#define packv(arr,var,gid,idx) make_u32x ((arr)[((gid) * 16) + 0].var[(idx)], (arr)[((gid) * 16) + 1].var[(idx)], (arr)[((gid) * 16) + 2].var[(idx)], (arr)[((gid) * 16) + 3].var[(idx)], (arr)[((gid) * 16) + 4].var[(idx)], (arr)[((gid) * 16) + 5].var[(idx)], (arr)[((gid) * 16) + 6].var[(idx)], (arr)[((gid) * 16) + 7].var[(idx)], (arr)[((gid) * 16) + 8].var[(idx)], (arr)[((gid) * 16) + 9].var[(idx)], (arr)[((gid) * 16) + 10].var[(idx)], (arr)[((gid) * 16) + 11].var[(idx)], (arr)[((gid) * 16) + 12].var[(idx)], (arr)[((gid) * 16) + 13].var[(idx)], (arr)[((gid) * 16) + 14].var[(idx)], (arr)[((gid) * 16) + 15].var[(idx)])
 #endif
 
 #if   VECT_SIZE == 1
-#define pack64v(arr,var,gid,idx) (u64x) ((arr)[((gid) *  1) + 0].var[(idx)])
+#define pack64v(arr,var,gid,idx) make_u64x ((arr)[((gid) *  1) + 0].var[(idx)])
 #elif VECT_SIZE == 2
-#define pack64v(arr,var,gid,idx) (u64x) ((arr)[((gid) *  2) + 0].var[(idx)], (arr)[((gid) *  2) + 1].var[(idx)])
+#define pack64v(arr,var,gid,idx) make_u64x ((arr)[((gid) *  2) + 0].var[(idx)], (arr)[((gid) *  2) + 1].var[(idx)])
 #elif VECT_SIZE == 4
-#define pack64v(arr,var,gid,idx) (u64x) ((arr)[((gid) *  4) + 0].var[(idx)], (arr)[((gid) *  4) + 1].var[(idx)], (arr)[((gid) *  4) + 2].var[(idx)], (arr)[((gid) *  4) + 3].var[(idx)])
+#define pack64v(arr,var,gid,idx) make_u64x ((arr)[((gid) *  4) + 0].var[(idx)], (arr)[((gid) *  4) + 1].var[(idx)], (arr)[((gid) *  4) + 2].var[(idx)], (arr)[((gid) *  4) + 3].var[(idx)])
 #elif VECT_SIZE == 8
-#define pack64v(arr,var,gid,idx) (u64x) ((arr)[((gid) *  8) + 0].var[(idx)], (arr)[((gid) *  8) + 1].var[(idx)], (arr)[((gid) *  8) + 2].var[(idx)], (arr)[((gid) *  8) + 3].var[(idx)], (arr)[((gid) *  8) + 4].var[(idx)], (arr)[((gid) *  8) + 5].var[(idx)], (arr)[((gid) *  8) + 6].var[(idx)], (arr)[((gid) *  8) + 7].var[(idx)])
+#define pack64v(arr,var,gid,idx) make_u64x ((arr)[((gid) *  8) + 0].var[(idx)], (arr)[((gid) *  8) + 1].var[(idx)], (arr)[((gid) *  8) + 2].var[(idx)], (arr)[((gid) *  8) + 3].var[(idx)], (arr)[((gid) *  8) + 4].var[(idx)], (arr)[((gid) *  8) + 5].var[(idx)], (arr)[((gid) *  8) + 6].var[(idx)], (arr)[((gid) *  8) + 7].var[(idx)])
 #elif VECT_SIZE == 16
-#define pack64v(arr,var,gid,idx) (u64x) ((arr)[((gid) * 16) + 0].var[(idx)], (arr)[((gid) * 16) + 1].var[(idx)], (arr)[((gid) * 16) + 2].var[(idx)], (arr)[((gid) * 16) + 3].var[(idx)], (arr)[((gid) * 16) + 4].var[(idx)], (arr)[((gid) * 16) + 5].var[(idx)], (arr)[((gid) * 16) + 6].var[(idx)], (arr)[((gid) * 16) + 7].var[(idx)], (arr)[((gid) * 16) + 8].var[(idx)], (arr)[((gid) * 16) + 9].var[(idx)], (arr)[((gid) * 16) + 10].var[(idx)], (arr)[((gid) * 16) + 11].var[(idx)], (arr)[((gid) * 16) + 12].var[(idx)], (arr)[((gid) * 16) + 13].var[(idx)], (arr)[((gid) * 16) + 14].var[(idx)], (arr)[((gid) * 16) + 15].var[(idx)])
+#define pack64v(arr,var,gid,idx) make_u64x ((arr)[((gid) * 16) + 0].var[(idx)], (arr)[((gid) * 16) + 1].var[(idx)], (arr)[((gid) * 16) + 2].var[(idx)], (arr)[((gid) * 16) + 3].var[(idx)], (arr)[((gid) * 16) + 4].var[(idx)], (arr)[((gid) * 16) + 5].var[(idx)], (arr)[((gid) * 16) + 6].var[(idx)], (arr)[((gid) * 16) + 7].var[(idx)], (arr)[((gid) * 16) + 8].var[(idx)], (arr)[((gid) * 16) + 9].var[(idx)], (arr)[((gid) * 16) + 10].var[(idx)], (arr)[((gid) * 16) + 11].var[(idx)], (arr)[((gid) * 16) + 12].var[(idx)], (arr)[((gid) * 16) + 13].var[(idx)], (arr)[((gid) * 16) + 14].var[(idx)], (arr)[((gid) * 16) + 15].var[(idx)])
 #endif
 
 #if   VECT_SIZE == 1
-#define packvf(arr,var,gid) (u32x) ((arr)[((gid) *  1) + 0].var)
+#define packvf(arr,var,gid) make_u32x ((arr)[((gid) *  1) + 0].var)
 #elif VECT_SIZE == 2
-#define packvf(arr,var,gid) (u32x) ((arr)[((gid) *  2) + 0].var, (arr)[((gid) *  2) + 1].var)
+#define packvf(arr,var,gid) make_u32x ((arr)[((gid) *  2) + 0].var, (arr)[((gid) *  2) + 1].var)
 #elif VECT_SIZE == 4
-#define packvf(arr,var,gid) (u32x) ((arr)[((gid) *  4) + 0].var, (arr)[((gid) *  4) + 1].var, (arr)[((gid) *  4) + 2].var, (arr)[((gid) *  4) + 3].var)
+#define packvf(arr,var,gid) make_u32x ((arr)[((gid) *  4) + 0].var, (arr)[((gid) *  4) + 1].var, (arr)[((gid) *  4) + 2].var, (arr)[((gid) *  4) + 3].var)
 #elif VECT_SIZE == 8
-#define packvf(arr,var,gid) (u32x) ((arr)[((gid) *  8) + 0].var, (arr)[((gid) *  8) + 1].var, (arr)[((gid) *  8) + 2].var, (arr)[((gid) *  8) + 3].var, (arr)[((gid) *  8) + 4].var, (arr)[((gid) *  8) + 5].var, (arr)[((gid) *  8) + 6].var, (arr)[((gid) *  8) + 7].var)
+#define packvf(arr,var,gid) make_u32x ((arr)[((gid) *  8) + 0].var, (arr)[((gid) *  8) + 1].var, (arr)[((gid) *  8) + 2].var, (arr)[((gid) *  8) + 3].var, (arr)[((gid) *  8) + 4].var, (arr)[((gid) *  8) + 5].var, (arr)[((gid) *  8) + 6].var, (arr)[((gid) *  8) + 7].var)
 #elif VECT_SIZE == 16
-#define packvf(arr,var,gid) (u32x) ((arr)[((gid) * 16) + 0].var, (arr)[((gid) * 16) + 1].var, (arr)[((gid) * 16) + 2].var, (arr)[((gid) * 16) + 3].var, (arr)[((gid) * 16) + 4].var, (arr)[((gid) * 16) + 5].var, (arr)[((gid) * 16) + 6].var, (arr)[((gid) * 16) + 7].var, (arr)[((gid) * 16) + 8].var, (arr)[((gid) * 16) + 9].var, (arr)[((gid) * 16) + 10].var, (arr)[((gid) * 16) + 11].var, (arr)[((gid) * 16) + 12].var, (arr)[((gid) * 16) + 13].var, (arr)[((gid) * 16) + 14].var, (arr)[((gid) * 16) + 15].var)
+#define packvf(arr,var,gid) make_u32x ((arr)[((gid) * 16) + 0].var, (arr)[((gid) * 16) + 1].var, (arr)[((gid) * 16) + 2].var, (arr)[((gid) * 16) + 3].var, (arr)[((gid) * 16) + 4].var, (arr)[((gid) * 16) + 5].var, (arr)[((gid) * 16) + 6].var, (arr)[((gid) * 16) + 7].var, (arr)[((gid) * 16) + 8].var, (arr)[((gid) * 16) + 9].var, (arr)[((gid) * 16) + 10].var, (arr)[((gid) * 16) + 11].var, (arr)[((gid) * 16) + 12].var, (arr)[((gid) * 16) + 13].var, (arr)[((gid) * 16) + 14].var, (arr)[((gid) * 16) + 15].var)
 #endif
 
 #if   VECT_SIZE == 1
-#define pack64vf(arr,var,gid) (u64x) ((arr)[((gid) *  1) + 0].var)
+#define pack64vf(arr,var,gid) make_u64x ((arr)[((gid) *  1) + 0].var)
 #elif VECT_SIZE == 2
-#define pack64vf(arr,var,gid) (u64x) ((arr)[((gid) *  2) + 0].var, (arr)[((gid) *  2) + 1].var)
+#define pack64vf(arr,var,gid) make_u64x ((arr)[((gid) *  2) + 0].var, (arr)[((gid) *  2) + 1].var)
 #elif VECT_SIZE == 4
-#define pack64vf(arr,var,gid) (u64x) ((arr)[((gid) *  4) + 0].var, (arr)[((gid) *  4) + 1].var, (arr)[((gid) *  4) + 2].var, (arr)[((gid) *  4) + 3].var)
+#define pack64vf(arr,var,gid) make_u64x ((arr)[((gid) *  4) + 0].var, (arr)[((gid) *  4) + 1].var, (arr)[((gid) *  4) + 2].var, (arr)[((gid) *  4) + 3].var)
 #elif VECT_SIZE == 8
-#define pack64vf(arr,var,gid) (u64x) ((arr)[((gid) *  8) + 0].var, (arr)[((gid) *  8) + 1].var, (arr)[((gid) *  8) + 2].var, (arr)[((gid) *  8) + 3].var, (arr)[((gid) *  8) + 4].var, (arr)[((gid) *  8) + 5].var, (arr)[((gid) *  8) + 6].var, (arr)[((gid) *  8) + 7].var)
+#define pack64vf(arr,var,gid) make_u64x ((arr)[((gid) *  8) + 0].var, (arr)[((gid) *  8) + 1].var, (arr)[((gid) *  8) + 2].var, (arr)[((gid) *  8) + 3].var, (arr)[((gid) *  8) + 4].var, (arr)[((gid) *  8) + 5].var, (arr)[((gid) *  8) + 6].var, (arr)[((gid) *  8) + 7].var)
 #elif VECT_SIZE == 16
-#define pack64vf(arr,var,gid) (u64x) ((arr)[((gid) * 16) + 0].var, (arr)[((gid) * 16) + 1].var, (arr)[((gid) * 16) + 2].var, (arr)[((gid) * 16) + 3].var, (arr)[((gid) * 16) + 4].var, (arr)[((gid) * 16) + 5].var, (arr)[((gid) * 16) + 6].var, (arr)[((gid) * 16) + 7].var, (arr)[((gid) * 16) + 8].var, (arr)[((gid) * 16) + 9].var, (arr)[((gid) * 16) + 10].var, (arr)[((gid) * 16) + 11].var, (arr)[((gid) * 16) + 12].var, (arr)[((gid) * 16) + 13].var, (arr)[((gid) * 16) + 14].var, (arr)[((gid) * 16) + 15].var)
+#define pack64vf(arr,var,gid) make_u64x ((arr)[((gid) * 16) + 0].var, (arr)[((gid) * 16) + 1].var, (arr)[((gid) * 16) + 2].var, (arr)[((gid) * 16) + 3].var, (arr)[((gid) * 16) + 4].var, (arr)[((gid) * 16) + 5].var, (arr)[((gid) * 16) + 6].var, (arr)[((gid) * 16) + 7].var, (arr)[((gid) * 16) + 8].var, (arr)[((gid) * 16) + 9].var, (arr)[((gid) * 16) + 10].var, (arr)[((gid) * 16) + 11].var, (arr)[((gid) * 16) + 12].var, (arr)[((gid) * 16) + 13].var, (arr)[((gid) * 16) + 14].var, (arr)[((gid) * 16) + 15].var)
 #endif
 
 #if   VECT_SIZE == 1
@@ -1133,8 +1133,8 @@
 #define unpackv_xor(arr,var,gid,idx,val) (arr)[((gid) * 16) + 0].var[(idx)] ^= val.s0; (arr)[((gid) * 16) + 1].var[(idx)] ^= val.s1; (arr)[((gid) * 16) + 2].var[(idx)] ^= val.s2; (arr)[((gid) * 16) + 3].var[(idx)] ^= val.s3; (arr)[((gid) * 16) + 4].var[(idx)] ^= val.s4; (arr)[((gid) * 16) + 5].var[(idx)] ^= val.s5; (arr)[((gid) * 16) + 6].var[(idx)] ^= val.s6; (arr)[((gid) * 16) + 7].var[(idx)] ^= val.s7; (arr)[((gid) * 16) + 8].var[(idx)] ^= val.s8; (arr)[((gid) * 16) + 9].var[(idx)] ^= val.s9; (arr)[((gid) * 16) + 10].var[(idx)] ^= val.sa; (arr)[((gid) * 16) + 11].var[(idx)] ^= val.sb; (arr)[((gid) * 16) + 12].var[(idx)] ^= val.sc; (arr)[((gid) * 16) + 13].var[(idx)] ^= val.sd; (arr)[((gid) * 16) + 14].var[(idx)] ^= val.se; (arr)[((gid) * 16) + 15].var[(idx)] ^= val.sf;
 #endif
 
-DECLSPEC u32x ix_create_bft (GLOBAL_AS const bf_t *bfs_buf, const u32 il_pos);
-DECLSPEC u32x pwlenx_create_combt (GLOBAL_AS const pw_t *combs_buf, const u32 il_pos);
-DECLSPEC u32x ix_create_combt (GLOBAL_AS const pw_t *combs_buf, const u32 il_pos, const int idx);
+DECLSPEC u32x ix_create_bft       (CONSTANT_AS const bf_t *arr, const u32 il_pos);
+DECLSPEC u32x pwlenx_create_combt (GLOBAL_AS   const pw_t *arr, const u32 il_pos);
+DECLSPEC u32x ix_create_combt     (GLOBAL_AS   const pw_t *arr, const u32 il_pos, const int idx);
 
 #endif
diff --git a/OpenCL/inc_truecrypt_crc32.cl b/OpenCL/inc_truecrypt_crc32.cl
index db0204b12..2bc30bffa 100644
--- a/OpenCL/inc_truecrypt_crc32.cl
+++ b/OpenCL/inc_truecrypt_crc32.cl
@@ -5,10 +5,11 @@
 
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.h"
 #include "inc_common.h"
 #include "inc_truecrypt_crc32.h"
 
-CONSTANT_AS u32a crc32tab[0x100] =
+CONSTANT_VK u32a crc32tab[0x100] =
 {
   0x00000000, 0x77073096, 0xee0e612c, 0x990951ba,
   0x076dc419, 0x706af48f, 0xe963a535, 0x9e6495a3,
diff --git a/OpenCL/inc_truecrypt_keyfile.cl b/OpenCL/inc_truecrypt_keyfile.cl
index faf287b5e..4822c811a 100644
--- a/OpenCL/inc_truecrypt_keyfile.cl
+++ b/OpenCL/inc_truecrypt_keyfile.cl
@@ -5,6 +5,7 @@
 
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.h"
 #include "inc_common.h"
 #include "inc_truecrypt_keyfile.h"
 
diff --git a/OpenCL/inc_truecrypt_xts.cl b/OpenCL/inc_truecrypt_xts.cl
index 4e72d4e79..ceeff8842 100644
--- a/OpenCL/inc_truecrypt_xts.cl
+++ b/OpenCL/inc_truecrypt_xts.cl
@@ -5,6 +5,7 @@
 
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.h"
 #include "inc_common.h"
 #include "inc_cipher_aes.h"
 #include "inc_cipher_serpent.h"
diff --git a/OpenCL/inc_types.h b/OpenCL/inc_types.h
index b9eb3bd03..dca2a3b5a 100644
--- a/OpenCL/inc_types.h
+++ b/OpenCL/inc_types.h
@@ -6,6 +6,14 @@
 #ifndef _INC_TYPES_H
 #define _INC_TYPES_H
 
+#ifdef IS_CUDA
+//https://docs.nvidia.com/cuda/nvrtc/index.html#integer-size
+typedef unsigned char      uchar;
+typedef unsigned short     ushort;
+typedef unsigned int       uint;
+typedef unsigned long long ulong;
+#endif
+
 #ifdef KERNEL_STATIC
 typedef uchar  u8;
 typedef ushort u16;
@@ -43,11 +51,757 @@ typedef u8   u8x;
 typedef u16  u16x;
 typedef u32  u32x;
 typedef u64  u64x;
+
+#define make_u8x  (u8)
+#define make_u16x (u16)
+#define make_u32x (u32)
+#define make_u64x (u64)
+
+#else
+#ifdef IS_CUDA
+
+#if VECT_SIZE == 2
+
+struct __device_builtin__ __builtin_align__(2) u8x
+{
+  u8 s0;
+  u8 s1;
+
+  inline __device__  u8x (const u8 a, const u8 b) : s0(a), s1(b) { }
+  inline __device__  u8x (const u8 a)             : s0(a), s1(a) { }
+
+  inline __device__  u8x (void) : s0(0), s1(0) { }
+  inline __device__ ~u8x (void) { }
+};
+
+struct __device_builtin__ __builtin_align__(4) u16x
+{
+  u16 s0;
+  u16 s1;
+
+  inline __device__  u16x (const u16 a, const u16 b) : s0(a), s1(b) { }
+  inline __device__  u16x (const u16 a)              : s0(a), s1(a) { }
+
+  inline __device__  u16x (void) : s0(0), s1(0) { }
+  inline __device__ ~u16x (void) { }
+};
+
+struct __device_builtin__ __builtin_align__(8) u32x
+{
+  u32 s0;
+  u32 s1;
+
+  inline __device__  u32x (const u32 a, const u32 b) : s0(a), s1(b) { }
+  inline __device__  u32x (const u32 a)              : s0(a), s1(a) { }
+
+  inline __device__  u32x (void) : s0(0), s1(0) { }
+  inline __device__ ~u32x (void) { }
+};
+
+struct __device_builtin__ __builtin_align__(16) u64x
+{
+  u64 s0;
+  u64 s1;
+
+  inline __device__  u64x (const u64 a, const u64 b) : s0(a), s1(b) { }
+  inline __device__  u64x (const u64 a)              : s0(a), s1(a) { }
+
+  inline __device__  u64x (void) : s0(0), s1(0) { }
+  inline __device__ ~u64x (void) { }
+};
+
+inline __device__ bool operator != (const u32x a, const u32  b) { return ((a.s0 != b)    && (a.s1 != b));    }
+inline __device__ bool operator != (const u32x a, const u32x b) { return ((a.s0 != b.s0) && (a.s1 != b.s1)); }
+
+inline __device__ void operator ^= (u32x &a, const u32  b) { a.s0 ^= b;    a.s1 ^= b;     }
+inline __device__ void operator ^= (u32x &a, const u32x b) { a.s0 ^= b.s0; a.s1 ^= b.s1;  }
+
+inline __device__ void operator |= (u32x &a, const u32  b) { a.s0 |= b;    a.s1 |= b;     }
+inline __device__ void operator |= (u32x &a, const u32x b) { a.s0 |= b.s0; a.s1 |= b.s1;  }
+
+inline __device__ void operator &= (u32x &a, const u32  b) { a.s0 &= b;    a.s1 &= b;     }
+inline __device__ void operator &= (u32x &a, const u32x b) { a.s0 &= b.s0; a.s1 &= b.s1;  }
+
+inline __device__ void operator += (u32x &a, const u32  b) { a.s0 += b;    a.s1 += b;     }
+inline __device__ void operator += (u32x &a, const u32x b) { a.s0 += b.s0; a.s1 += b.s1;  }
+
+inline __device__ void operator -= (u32x &a, const u32  b) { a.s0 -= b;    a.s1 -= b;     }
+inline __device__ void operator -= (u32x &a, const u32x b) { a.s0 -= b.s0; a.s1 -= b.s1;  }
+
+inline __device__ void operator *= (u32x &a, const u32  b) { a.s0 *= b;    a.s1 *= b;     }
+inline __device__ void operator *= (u32x &a, const u32x b) { a.s0 *= b.s0; a.s1 *= b.s1;  }
+
+inline __device__ void operator >>= (u32x &a, const u32  b) { a.s0 >>= b;    a.s1 >>= b;     }
+inline __device__ void operator >>= (u32x &a, const u32x b) { a.s0 >>= b.s0; a.s1 >>= b.s1;  }
+
+inline __device__ void operator <<= (u32x &a, const u32  b) { a.s0 <<= b;    a.s1 <<= b;     }
+inline __device__ void operator <<= (u32x &a, const u32x b) { a.s0 <<= b.s0; a.s1 <<= b.s1;  }
+
+inline __device__ u32x operator << (const u32x a, const u32  b) { return u32x ((a.s0 << b),    (a.s1 << b)   );  }
+inline __device__ u32x operator << (const u32x a, const u32x b) { return u32x ((a.s0 << b.s0), (a.s1 << b.s1));  }
+
+inline __device__ u32x operator >> (const u32x a, const u32  b) { return u32x ((a.s0 >> b),    (a.s1 >> b)   );  }
+inline __device__ u32x operator >> (const u32x a, const u32x b) { return u32x ((a.s0 >> b.s0), (a.s1 >> b.s1));  }
+
+inline __device__ u32x operator ^  (const u32x a, const u32  b) { return u32x ((a.s0 ^  b),    (a.s1 ^  b)   );  }
+inline __device__ u32x operator ^  (const u32x a, const u32x b) { return u32x ((a.s0 ^  b.s0), (a.s1 ^  b.s1));  }
+
+inline __device__ u32x operator |  (const u32x a, const u32  b) { return u32x ((a.s0 |  b),    (a.s1 |  b)   );  }
+inline __device__ u32x operator |  (const u32x a, const u32x b) { return u32x ((a.s0 |  b.s0), (a.s1 |  b.s1));  }
+
+inline __device__ u32x operator &  (const u32x a, const u32  b) { return u32x ((a.s0 &  b),    (a.s1 &  b)   );  }
+inline __device__ u32x operator &  (const u32x a, const u32x b) { return u32x ((a.s0 &  b.s0), (a.s1 &  b.s1));  }
+
+inline __device__ u32x operator +  (const u32x a, const u32  b) { return u32x ((a.s0 +  b),    (a.s1 +  b)   );  }
+inline __device__ u32x operator +  (const u32x a, const u32x b) { return u32x ((a.s0 +  b.s0), (a.s1 +  b.s1));  }
+
+inline __device__ u32x operator -  (const u32x a, const u32  b) { return u32x ((a.s0 -  b),    (a.s1 -  b)   );  }
+inline __device__ u32x operator -  (const u32x a, const u32x b) { return u32x ((a.s0 -  b.s0), (a.s1 -  b.s1));  }
+
+inline __device__ u32x operator *  (const u32x a, const u32  b) { return u32x ((a.s0 *  b),    (a.s1 *  b)   );  }
+inline __device__ u32x operator *  (const u32x a, const u32x b) { return u32x ((a.s0 *  b.s0), (a.s1 *  b.s1));  }
+
+inline __device__ u32x operator ~  (const u32x a) { return u32x (~a.s0, ~a.s1); }
+
+inline __device__ bool operator != (const u64x a, const u64  b) { return ((a.s0 != b)    && (a.s1 != b));    }
+inline __device__ bool operator != (const u64x a, const u64x b) { return ((a.s0 != b.s0) && (a.s1 != b.s1)); }
+
+inline __device__ void operator ^= (u64x &a, const u64  b) { a.s0 ^= b;    a.s1 ^= b;     }
+inline __device__ void operator ^= (u64x &a, const u64x b) { a.s0 ^= b.s0; a.s1 ^= b.s1;  }
+
+inline __device__ void operator |= (u64x &a, const u64  b) { a.s0 |= b;    a.s1 |= b;     }
+inline __device__ void operator |= (u64x &a, const u64x b) { a.s0 |= b.s0; a.s1 |= b.s1;  }
+
+inline __device__ void operator &= (u64x &a, const u64  b) { a.s0 &= b;    a.s1 &= b;     }
+inline __device__ void operator &= (u64x &a, const u64x b) { a.s0 &= b.s0; a.s1 &= b.s1;  }
+
+inline __device__ void operator += (u64x &a, const u64  b) { a.s0 += b;    a.s1 += b;     }
+inline __device__ void operator += (u64x &a, const u64x b) { a.s0 += b.s0; a.s1 += b.s1;  }
+
+inline __device__ void operator -= (u64x &a, const u64  b) { a.s0 -= b;    a.s1 -= b;     }
+inline __device__ void operator -= (u64x &a, const u64x b) { a.s0 -= b.s0; a.s1 -= b.s1;  }
+
+inline __device__ void operator *= (u64x &a, const u64  b) { a.s0 *= b;    a.s1 *= b;     }
+inline __device__ void operator *= (u64x &a, const u64x b) { a.s0 *= b.s0; a.s1 *= b.s1;  }
+
+inline __device__ void operator >>= (u64x &a, const u64  b) { a.s0 >>= b;    a.s1 >>= b;     }
+inline __device__ void operator >>= (u64x &a, const u64x b) { a.s0 >>= b.s0; a.s1 >>= b.s1;  }
+
+inline __device__ void operator <<= (u64x &a, const u64  b) { a.s0 <<= b;    a.s1 <<= b;     }
+inline __device__ void operator <<= (u64x &a, const u64x b) { a.s0 <<= b.s0; a.s1 <<= b.s1;  }
+
+inline __device__ u64x operator << (const u64x a, const u64  b) { return u64x ((a.s0 << b),    (a.s1 << b)   );  }
+inline __device__ u64x operator << (const u64x a, const u64x b) { return u64x ((a.s0 << b.s0), (a.s1 << b.s1));  }
+
+inline __device__ u64x operator >> (const u64x a, const u64  b) { return u64x ((a.s0 >> b),    (a.s1 >> b)   );  }
+inline __device__ u64x operator >> (const u64x a, const u64x b) { return u64x ((a.s0 >> b.s0), (a.s1 >> b.s1));  }
+
+inline __device__ u64x operator ^  (const u64x a, const u64  b) { return u64x ((a.s0 ^  b),    (a.s1 ^  b)   );  }
+inline __device__ u64x operator ^  (const u64x a, const u64x b) { return u64x ((a.s0 ^  b.s0), (a.s1 ^  b.s1));  }
+
+inline __device__ u64x operator |  (const u64x a, const u64  b) { return u64x ((a.s0 |  b),    (a.s1 |  b)   );  }
+inline __device__ u64x operator |  (const u64x a, const u64x b) { return u64x ((a.s0 |  b.s0), (a.s1 |  b.s1));  }
+
+inline __device__ u64x operator &  (const u64x a, const u64  b) { return u64x ((a.s0 &  b),    (a.s1 &  b)   );  }
+inline __device__ u64x operator &  (const u64x a, const u64x b) { return u64x ((a.s0 &  b.s0), (a.s1 &  b.s1));  }
+
+inline __device__ u64x operator +  (const u64x a, const u64  b) { return u64x ((a.s0 +  b),    (a.s1 +  b)   );  }
+inline __device__ u64x operator +  (const u64x a, const u64x b) { return u64x ((a.s0 +  b.s0), (a.s1 +  b.s1));  }
+
+inline __device__ u64x operator -  (const u64x a, const u64  b) { return u64x ((a.s0 -  b),    (a.s1 -  b)   );  }
+inline __device__ u64x operator -  (const u64x a, const u64x b) { return u64x ((a.s0 -  b.s0), (a.s1 -  b.s1));  }
+
+inline __device__ u64x operator *  (const u64x a, const u64  b) { return u64x ((a.s0 *  b),    (a.s1 *  b)   );  }
+inline __device__ u64x operator *  (const u64x a, const u64x b) { return u64x ((a.s0 *  b.s0), (a.s1 *  b.s1));  }
+
+inline __device__ u64x operator ~  (const u64x a) { return u64x (~a.s0, ~a.s1); }
+
+#endif
+
+#if VECT_SIZE == 4
+
+struct __device_builtin__ __builtin_align__(4) u8x
+{
+  u8 s0;
+  u8 s1;
+  u8 s2;
+  u8 s3;
+
+  inline __device__  u8x (const u8 a, const u8 b, const u8 c, const u8 d) : s0(a), s1(b), s2(c), s3(d) { }
+  inline __device__  u8x (const u8 a)                                     : s0(a), s1(a), s2(a), s3(a) { }
+
+  inline __device__  u8x (void) : s0(0), s1(0), s2(0), s3(0) { }
+  inline __device__ ~u8x (void) { }
+};
+
+struct __device_builtin__ __builtin_align__(8) u16x
+{
+  u16 s0;
+  u16 s1;
+  u16 s2;
+  u16 s3;
+
+  inline __device__  u16x (const u16 a, const u16 b, const u16 c, const u16 d) : s0(a), s1(b), s2(c), s3(d) { }
+  inline __device__  u16x (const u16 a)                                        : s0(a), s1(a), s2(a), s3(a) { }
+
+  inline __device__  u16x (void) : s0(0), s1(0), s2(0), s3(0) { }
+  inline __device__ ~u16x (void) { }
+};
+
+struct __device_builtin__ __builtin_align__(16) u32x
+{
+  u32 s0;
+  u32 s1;
+  u32 s2;
+  u32 s3;
+
+  inline __device__  u32x (const u32 a, const u32 b, const u32 c, const u32 d) : s0(a), s1(b), s2(c), s3(d) { }
+  inline __device__  u32x (const u32 a)                                        : s0(a), s1(a), s2(a), s3(a) { }
+
+  inline __device__  u32x (void) : s0(0), s1(0), s2(0), s3(0) { }
+  inline __device__ ~u32x (void) { }
+};
+
+struct __device_builtin__ __builtin_align__(32) u64x
+{
+  u64 s0;
+  u64 s1;
+  u64 s2;
+  u64 s3;
+
+  inline __device__  u64x (const u64 a, const u64 b, const u64 c, const u64 d) : s0(a), s1(b), s2(c), s3(d) { }
+  inline __device__  u64x (const u64 a)                                        : s0(a), s1(a), s2(a), s3(a) { }
+
+  inline __device__  u64x (void) : s0(0), s1(0), s2(0), s3(0) { }
+  inline __device__ ~u64x (void) { }
+};
+
+inline __device__ bool operator != (const u32x a, const u32  b) { return ((a.s0 != b)    && (a.s1 != b)    && (a.s2 != b)    && (a.s3 != b)   ); }
+inline __device__ bool operator != (const u32x a, const u32x b) { return ((a.s0 != b.s0) && (a.s1 != b.s1) && (a.s2 != b.s2) && (a.s3 != b.s3)); }
+
+inline __device__ void operator ^= (u32x &a, const u32  b) { a.s0 ^= b;    a.s1 ^= b;    a.s2 ^= b;    a.s3 ^= b;     }
+inline __device__ void operator ^= (u32x &a, const u32x b) { a.s0 ^= b.s0; a.s1 ^= b.s1; a.s2 ^= b.s2; a.s3 ^= b.s3;  }
+
+inline __device__ void operator |= (u32x &a, const u32  b) { a.s0 |= b;    a.s1 |= b;    a.s2 |= b;    a.s3 |= b;     }
+inline __device__ void operator |= (u32x &a, const u32x b) { a.s0 |= b.s0; a.s1 |= b.s1; a.s2 |= b.s2; a.s3 |= b.s3;  }
+
+inline __device__ void operator &= (u32x &a, const u32  b) { a.s0 &= b;    a.s1 &= b;    a.s2 &= b;    a.s3 &= b;     }
+inline __device__ void operator &= (u32x &a, const u32x b) { a.s0 &= b.s0; a.s1 &= b.s1; a.s2 &= b.s2; a.s3 &= b.s3;  }
+
+inline __device__ void operator += (u32x &a, const u32  b) { a.s0 += b;    a.s1 += b;    a.s2 += b;    a.s3 += b;     }
+inline __device__ void operator += (u32x &a, const u32x b) { a.s0 += b.s0; a.s1 += b.s1; a.s2 += b.s2; a.s3 += b.s3;  }
+
+inline __device__ void operator -= (u32x &a, const u32  b) { a.s0 -= b;    a.s1 -= b;    a.s2 -= b;    a.s3 -= b;     }
+inline __device__ void operator -= (u32x &a, const u32x b) { a.s0 -= b.s0; a.s1 -= b.s1; a.s2 -= b.s2; a.s3 -= b.s3;  }
+
+inline __device__ void operator *= (u32x &a, const u32  b) { a.s0 *= b;    a.s1 *= b;    a.s2 *= b;    a.s3 *= b;     }
+inline __device__ void operator *= (u32x &a, const u32x b) { a.s0 *= b.s0; a.s1 *= b.s1; a.s2 *= b.s2; a.s3 *= b.s3;  }
+
+inline __device__ void operator >>= (u32x &a, const u32  b) { a.s0 >>= b;    a.s1 >>= b;    a.s2 >>= b;    a.s3 >>= b;     }
+inline __device__ void operator >>= (u32x &a, const u32x b) { a.s0 >>= b.s0; a.s1 >>= b.s1; a.s2 >>= b.s2; a.s3 >>= b.s3;  }
+
+inline __device__ void operator <<= (u32x &a, const u32  b) { a.s0 <<= b;    a.s1 <<= b;    a.s2 <<= b;    a.s3 <<= b;     }
+inline __device__ void operator <<= (u32x &a, const u32x b) { a.s0 <<= b.s0; a.s1 <<= b.s1; a.s2 <<= b.s2; a.s3 <<= b.s3;  }
+
+inline __device__ u32x operator << (const u32x a, const u32  b) { return u32x ((a.s0 << b),    (a.s1 << b)   , (a.s2 << b),    (a.s3 << b)   );  }
+inline __device__ u32x operator << (const u32x a, const u32x b) { return u32x ((a.s0 << b.s0), (a.s1 << b.s1), (a.s2 << b.s2), (a.s3 << b.s3));  }
+
+inline __device__ u32x operator >> (const u32x a, const u32  b) { return u32x ((a.s0 >> b),    (a.s1 >> b)   , (a.s2 >> b),    (a.s3 >> b)   );  }
+inline __device__ u32x operator >> (const u32x a, const u32x b) { return u32x ((a.s0 >> b.s0), (a.s1 >> b.s1), (a.s2 >> b.s2), (a.s3 >> b.s3));  }
+
+inline __device__ u32x operator ^  (const u32x a, const u32  b) { return u32x ((a.s0 ^  b),    (a.s1 ^  b)   , (a.s2 ^  b),    (a.s3 ^  b)   );  }
+inline __device__ u32x operator ^  (const u32x a, const u32x b) { return u32x ((a.s0 ^  b.s0), (a.s1 ^  b.s1), (a.s2 ^  b.s2), (a.s3 ^  b.s3));  }
+
+inline __device__ u32x operator |  (const u32x a, const u32  b) { return u32x ((a.s0 |  b),    (a.s1 |  b)   , (a.s2 |  b),    (a.s3 |  b)   );  }
+inline __device__ u32x operator |  (const u32x a, const u32x b) { return u32x ((a.s0 |  b.s0), (a.s1 |  b.s1), (a.s2 |  b.s2), (a.s3 |  b.s3));  }
+
+inline __device__ u32x operator &  (const u32x a, const u32  b) { return u32x ((a.s0 &  b),    (a.s1 &  b)   , (a.s2 &  b),    (a.s3 &  b)   );  }
+inline __device__ u32x operator &  (const u32x a, const u32x b) { return u32x ((a.s0 &  b.s0), (a.s1 &  b.s1), (a.s2 &  b.s2), (a.s3 &  b.s3));  }
+
+inline __device__ u32x operator +  (const u32x a, const u32  b) { return u32x ((a.s0 +  b),    (a.s1 +  b)   , (a.s2 +  b),    (a.s3 +  b)   );  }
+inline __device__ u32x operator +  (const u32x a, const u32x b) { return u32x ((a.s0 +  b.s0), (a.s1 +  b.s1), (a.s2 +  b.s2), (a.s3 +  b.s3));  }
+
+inline __device__ u32x operator -  (const u32x a, const u32  b) { return u32x ((a.s0 -  b),    (a.s1 -  b)   , (a.s2 -  b),    (a.s3 -  b)   );  }
+inline __device__ u32x operator -  (const u32x a, const u32x b) { return u32x ((a.s0 -  b.s0), (a.s1 -  b.s1), (a.s2 -  b.s2), (a.s3 -  b.s3));  }
+
+inline __device__ u32x operator *  (const u32x a, const u32  b) { return u32x ((a.s0 *  b),    (a.s1 *  b)   , (a.s2 *  b),    (a.s3 *  b)   );  }
+inline __device__ u32x operator *  (const u32x a, const u32x b) { return u32x ((a.s0 *  b.s0), (a.s1 *  b.s1), (a.s2 *  b.s2), (a.s3 *  b.s3));  }
+
+inline __device__ u32x operator ~  (const u32x a) { return u32x (~a.s0, ~a.s1, ~a.s2, ~a.s3); }
+
+inline __device__ bool operator != (const u64x a, const u64  b) { return ((a.s0 != b)    && (a.s1 != b)    && (a.s2 != b)    && (a.s3 != b)   ); }
+inline __device__ bool operator != (const u64x a, const u64x b) { return ((a.s0 != b.s0) && (a.s1 != b.s1) && (a.s2 != b.s2) && (a.s3 != b.s3)); }
+
+inline __device__ void operator ^= (u64x &a, const u64  b) { a.s0 ^= b;    a.s1 ^= b;    a.s2 ^= b;    a.s3 ^= b;     }
+inline __device__ void operator ^= (u64x &a, const u64x b) { a.s0 ^= b.s0; a.s1 ^= b.s1; a.s2 ^= b.s2; a.s3 ^= b.s3;  }
+
+inline __device__ void operator |= (u64x &a, const u64  b) { a.s0 |= b;    a.s1 |= b;    a.s2 |= b;    a.s3 |= b;     }
+inline __device__ void operator |= (u64x &a, const u64x b) { a.s0 |= b.s0; a.s1 |= b.s1; a.s2 |= b.s2; a.s3 |= b.s3;  }
+
+inline __device__ void operator &= (u64x &a, const u64  b) { a.s0 &= b;    a.s1 &= b;    a.s2 &= b;    a.s3 &= b;     }
+inline __device__ void operator &= (u64x &a, const u64x b) { a.s0 &= b.s0; a.s1 &= b.s1; a.s2 &= b.s2; a.s3 &= b.s3;  }
+
+inline __device__ void operator += (u64x &a, const u64  b) { a.s0 += b;    a.s1 += b;    a.s2 += b;    a.s3 += b;     }
+inline __device__ void operator += (u64x &a, const u64x b) { a.s0 += b.s0; a.s1 += b.s1; a.s2 += b.s2; a.s3 += b.s3;  }
+
+inline __device__ void operator -= (u64x &a, const u64  b) { a.s0 -= b;    a.s1 -= b;    a.s2 -= b;    a.s3 -= b;     }
+inline __device__ void operator -= (u64x &a, const u64x b) { a.s0 -= b.s0; a.s1 -= b.s1; a.s2 -= b.s2; a.s3 -= b.s3;  }
+
+inline __device__ void operator *= (u64x &a, const u64  b) { a.s0 *= b;    a.s1 *= b;    a.s2 *= b;    a.s3 *= b;     }
+inline __device__ void operator *= (u64x &a, const u64x b) { a.s0 *= b.s0; a.s1 *= b.s1; a.s2 *= b.s2; a.s3 *= b.s3;  }
+
+inline __device__ void operator >>= (u64x &a, const u64  b) { a.s0 >>= b;    a.s1 >>= b;    a.s2 >>= b;    a.s3 >>= b;     }
+inline __device__ void operator >>= (u64x &a, const u64x b) { a.s0 >>= b.s0; a.s1 >>= b.s1; a.s2 >>= b.s2; a.s3 >>= b.s3;  }
+
+inline __device__ void operator <<= (u64x &a, const u64  b) { a.s0 <<= b;    a.s1 <<= b;    a.s2 <<= b;    a.s3 <<= b;     }
+inline __device__ void operator <<= (u64x &a, const u64x b) { a.s0 <<= b.s0; a.s1 <<= b.s1; a.s2 <<= b.s2; a.s3 <<= b.s3;  }
+
+inline __device__ u64x operator << (const u64x a, const u64  b) { return u64x ((a.s0 << b),    (a.s1 << b)   , (a.s2 << b),    (a.s3 << b)   );  }
+inline __device__ u64x operator << (const u64x a, const u64x b) { return u64x ((a.s0 << b.s0), (a.s1 << b.s1), (a.s2 << b.s2), (a.s3 << b.s3));  }
+
+inline __device__ u64x operator >> (const u64x a, const u64  b) { return u64x ((a.s0 >> b),    (a.s1 >> b)   , (a.s2 >> b),    (a.s3 >> b)   );  }
+inline __device__ u64x operator >> (const u64x a, const u64x b) { return u64x ((a.s0 >> b.s0), (a.s1 >> b.s1), (a.s2 >> b.s2), (a.s3 >> b.s3));  }
+
+inline __device__ u64x operator ^  (const u64x a, const u64  b) { return u64x ((a.s0 ^  b),    (a.s1 ^  b)   , (a.s2 ^  b),    (a.s3 ^  b)   );  }
+inline __device__ u64x operator ^  (const u64x a, const u64x b) { return u64x ((a.s0 ^  b.s0), (a.s1 ^  b.s1), (a.s2 ^  b.s2), (a.s3 ^  b.s3));  }
+
+inline __device__ u64x operator |  (const u64x a, const u64  b) { return u64x ((a.s0 |  b),    (a.s1 |  b)   , (a.s2 |  b),    (a.s3 |  b)   );  }
+inline __device__ u64x operator |  (const u64x a, const u64x b) { return u64x ((a.s0 |  b.s0), (a.s1 |  b.s1), (a.s2 |  b.s2), (a.s3 |  b.s3));  }
+
+inline __device__ u64x operator &  (const u64x a, const u64  b) { return u64x ((a.s0 &  b),    (a.s1 &  b)   , (a.s2 &  b),    (a.s3 &  b)   );  }
+inline __device__ u64x operator &  (const u64x a, const u64x b) { return u64x ((a.s0 &  b.s0), (a.s1 &  b.s1), (a.s2 &  b.s2), (a.s3 &  b.s3));  }
+
+inline __device__ u64x operator +  (const u64x a, const u64  b) { return u64x ((a.s0 +  b),    (a.s1 +  b)   , (a.s2 +  b),    (a.s3 +  b)   );  }
+inline __device__ u64x operator +  (const u64x a, const u64x b) { return u64x ((a.s0 +  b.s0), (a.s1 +  b.s1), (a.s2 +  b.s2), (a.s3 +  b.s3));  }
+
+inline __device__ u64x operator -  (const u64x a, const u64  b) { return u64x ((a.s0 -  b),    (a.s1 -  b)   , (a.s2 -  b),    (a.s3 -  b)   );  }
+inline __device__ u64x operator -  (const u64x a, const u64x b) { return u64x ((a.s0 -  b.s0), (a.s1 -  b.s1), (a.s2 -  b.s2), (a.s3 -  b.s3));  }
+
+inline __device__ u64x operator *  (const u64x a, const u64  b) { return u64x ((a.s0 *  b),    (a.s1 *  b)   , (a.s2 *  b),    (a.s3 *  b)   );  }
+inline __device__ u64x operator *  (const u64x a, const u64x b) { return u64x ((a.s0 *  b.s0), (a.s1 *  b.s1), (a.s2 *  b.s2), (a.s3 *  b.s3));  }
+
+inline __device__ u64x operator ~  (const u64x a) { return u64x (~a.s0, ~a.s1, ~a.s2, ~a.s3); }
+
+#endif
+
+#if VECT_SIZE == 8
+
+struct __device_builtin__ __builtin_align__(8) u8x
+{
+  u8 s0;
+  u8 s1;
+  u8 s2;
+  u8 s3;
+  u8 s4;
+  u8 s5;
+  u8 s6;
+  u8 s7;
+
+  inline __device__  u8x (const u8 a, const u8 b, const u8 c, const u8 d, const u8 e, const u8 f, const u8 g, const u8 h) : s0(a), s1(b), s2(c), s3(d), s4(e), s5(f), s6(g), s7(h) { }
+  inline __device__  u8x (const u8 a)                                                                                     : s0(a), s1(a), s2(a), s3(a), s4(a), s5(a), s6(a), s7(a) { }
+
+  inline __device__  u8x (void) : s0(0), s1(0), s2(0), s3(0), s4(0), s5(0), s6(0), s7(0) { }
+  inline __device__ ~u8x (void) { }
+};
+
+struct __device_builtin__ __builtin_align__(16) u16x
+{
+  u16 s0;
+  u16 s1;
+  u16 s2;
+  u16 s3;
+  u16 s4;
+  u16 s5;
+  u16 s6;
+  u16 s7;
+
+  inline __device__  u16x (const u16 a, const u16 b, const u16 c, const u16 d, const u16 e, const u16 f, const u16 g, const u16 h) : s0(a), s1(b), s2(c), s3(d), s4(e), s5(f), s6(g), s7(h) { }
+  inline __device__  u16x (const u16 a)                                                                                            : s0(a), s1(a), s2(a), s3(a), s4(a), s5(a), s6(a), s7(a) { }
+
+  inline __device__  u16x (void) : s0(0), s1(0), s2(0), s3(0), s4(0), s5(0), s6(0), s7(0) { }
+  inline __device__ ~u16x (void) { }
+};
+
+struct __device_builtin__ __builtin_align__(32) u32x
+{
+  u32 s0;
+  u32 s1;
+  u32 s2;
+  u32 s3;
+  u32 s4;
+  u32 s5;
+  u32 s6;
+  u32 s7;
+
+  inline __device__  u32x (const u32 a, const u32 b, const u32 c, const u32 d, const u32 e, const u32 f, const u32 g, const u32 h) : s0(a), s1(b), s2(c), s3(d), s4(e), s5(f), s6(g), s7(h) { }
+  inline __device__  u32x (const u32 a)                                                                                            : s0(a), s1(a), s2(a), s3(a), s4(a), s5(a), s6(a), s7(a) { }
+
+  inline __device__  u32x (void) : s0(0), s1(0), s2(0), s3(0), s4(0), s5(0), s6(0), s7(0) { }
+  inline __device__ ~u32x (void) { }
+};
+
+struct __device_builtin__ __builtin_align__(64) u64x
+{
+  u64 s0;
+  u64 s1;
+  u64 s2;
+  u64 s3;
+  u64 s4;
+  u64 s5;
+  u64 s6;
+  u64 s7;
+
+  inline __device__  u64x (const u64 a, const u64 b, const u64 c, const u64 d, const u64 e, const u64 f, const u64 g, const u64 h) : s0(a), s1(b), s2(c), s3(d), s4(e), s5(f), s6(g), s7(h) { }
+  inline __device__  u64x (const u64 a)                                                                                            : s0(a), s1(a), s2(a), s3(a), s4(a), s5(a), s6(a), s7(a) { }
+
+  inline __device__  u64x (void) : s0(0), s1(0), s2(0), s3(0), s4(0), s5(0), s6(0), s7(0) { }
+  inline __device__ ~u64x (void) { }
+};
+
+inline __device__ bool operator != (const u32x a, const u32  b) { return ((a.s0 != b)    && (a.s1 != b)    && (a.s2 != b)    && (a.s3 != b)    && (a.s4 != b)    && (a.s5 != b)    && (a.s6 != b)    && (a.s7 != b)   ); }
+inline __device__ bool operator != (const u32x a, const u32x b) { return ((a.s0 != b.s0) && (a.s1 != b.s1) && (a.s2 != b.s2) && (a.s3 != b.s3) && (a.s4 != b.s4) && (a.s5 != b.s5) && (a.s6 != b.s6) && (a.s7 != b.s7)); }
+
+inline __device__ void operator ^= (u32x &a, const u32  b) { a.s0 ^= b;    a.s1 ^= b;    a.s2 ^= b;    a.s3 ^= b;    a.s4 ^= b;    a.s5 ^= b;    a.s6 ^= b;    a.s7 ^= b;     }
+inline __device__ void operator ^= (u32x &a, const u32x b) { a.s0 ^= b.s0; a.s1 ^= b.s1; a.s2 ^= b.s2; a.s3 ^= b.s3; a.s4 ^= b.s4; a.s5 ^= b.s5; a.s6 ^= b.s6; a.s7 ^= b.s7;  }
+
+inline __device__ void operator |= (u32x &a, const u32  b) { a.s0 |= b;    a.s1 |= b;    a.s2 |= b;    a.s3 |= b;    a.s4 |= b;    a.s5 |= b;    a.s6 |= b;    a.s7 |= b;     }
+inline __device__ void operator |= (u32x &a, const u32x b) { a.s0 |= b.s0; a.s1 |= b.s1; a.s2 |= b.s2; a.s3 |= b.s3; a.s4 |= b.s4; a.s5 |= b.s5; a.s6 |= b.s6; a.s7 |= b.s7;  }
+
+inline __device__ void operator &= (u32x &a, const u32  b) { a.s0 &= b;    a.s1 &= b;    a.s2 &= b;    a.s3 &= b;    a.s4 &= b;    a.s5 &= b;    a.s6 &= b;    a.s7 &= b;     }
+inline __device__ void operator &= (u32x &a, const u32x b) { a.s0 &= b.s0; a.s1 &= b.s1; a.s2 &= b.s2; a.s3 &= b.s3; a.s4 &= b.s4; a.s5 &= b.s5; a.s6 &= b.s6; a.s7 &= b.s7;  }
+
+inline __device__ void operator += (u32x &a, const u32  b) { a.s0 += b;    a.s1 += b;    a.s2 += b;    a.s3 += b;    a.s4 += b;    a.s5 += b;    a.s6 += b;    a.s7 += b;     }
+inline __device__ void operator += (u32x &a, const u32x b) { a.s0 += b.s0; a.s1 += b.s1; a.s2 += b.s2; a.s3 += b.s3; a.s4 += b.s4; a.s5 += b.s5; a.s6 += b.s6; a.s7 += b.s7;  }
+
+inline __device__ void operator -= (u32x &a, const u32  b) { a.s0 -= b;    a.s1 -= b;    a.s2 -= b;    a.s3 -= b;    a.s4 -= b;    a.s5 -= b;    a.s6 -= b;    a.s7 -= b;     }
+inline __device__ void operator -= (u32x &a, const u32x b) { a.s0 -= b.s0; a.s1 -= b.s1; a.s2 -= b.s2; a.s3 -= b.s3; a.s4 -= b.s4; a.s5 -= b.s5; a.s6 -= b.s6; a.s7 -= b.s7;  }
+
+inline __device__ void operator *= (u32x &a, const u32  b) { a.s0 *= b;    a.s1 *= b;    a.s2 *= b;    a.s3 *= b;    a.s4 *= b;    a.s5 *= b;    a.s6 *= b;    a.s7 *= b;     }
+inline __device__ void operator *= (u32x &a, const u32x b) { a.s0 *= b.s0; a.s1 *= b.s1; a.s2 *= b.s2; a.s3 *= b.s3; a.s4 *= b.s4; a.s5 *= b.s5; a.s6 *= b.s6; a.s7 *= b.s7;  }
+
+inline __device__ void operator >>= (u32x &a, const u32  b) { a.s0 >>= b;    a.s1 >>= b;    a.s2 >>= b;    a.s3 >>= b;    a.s4 >>= b;    a.s5 >>= b;    a.s6 >>= b;    a.s7 >>= b;     }
+inline __device__ void operator >>= (u32x &a, const u32x b) { a.s0 >>= b.s0; a.s1 >>= b.s1; a.s2 >>= b.s2; a.s3 >>= b.s3; a.s4 >>= b.s4; a.s5 >>= b.s5; a.s6 >>= b.s6; a.s7 >>= b.s7;  }
+
+inline __device__ void operator <<= (u32x &a, const u32  b) { a.s0 <<= b;    a.s1 <<= b;    a.s2 <<= b;    a.s3 <<= b;    a.s4 <<= b;    a.s5 <<= b;    a.s6 <<= b;    a.s7 <<= b;     }
+inline __device__ void operator <<= (u32x &a, const u32x b) { a.s0 <<= b.s0; a.s1 <<= b.s1; a.s2 <<= b.s2; a.s3 <<= b.s3; a.s4 <<= b.s4; a.s5 <<= b.s5; a.s6 <<= b.s6; a.s7 <<= b.s7;  }
+
+inline __device__ u32x operator << (const u32x a, const u32  b) { return u32x ((a.s0 << b),    (a.s1 << b)   , (a.s2 << b),    (a.s3 << b)   , (a.s4 << b),    (a.s5 << b)   , (a.s6 << b),    (a.s7 << b)   );  }
+inline __device__ u32x operator << (const u32x a, const u32x b) { return u32x ((a.s0 << b.s0), (a.s1 << b.s1), (a.s2 << b.s2), (a.s3 << b.s3), (a.s4 << b.s4), (a.s5 << b.s5), (a.s6 << b.s6), (a.s7 << b.s7));  }
+
+inline __device__ u32x operator >> (const u32x a, const u32  b) { return u32x ((a.s0 >> b),    (a.s1 >> b)   , (a.s2 >> b),    (a.s3 >> b)   , (a.s4 >> b),    (a.s5 >> b)   , (a.s6 >> b),    (a.s7 >> b)   );  }
+inline __device__ u32x operator >> (const u32x a, const u32x b) { return u32x ((a.s0 >> b.s0), (a.s1 >> b.s1), (a.s2 >> b.s2), (a.s3 >> b.s3), (a.s4 >> b.s4), (a.s5 >> b.s5), (a.s6 >> b.s6), (a.s7 >> b.s7));  }
+
+inline __device__ u32x operator ^  (const u32x a, const u32  b) { return u32x ((a.s0 ^  b),    (a.s1 ^  b)   , (a.s2 ^  b),    (a.s3 ^  b)   , (a.s4 ^  b),    (a.s5 ^  b)   , (a.s6 ^  b),    (a.s7 ^  b)   );  }
+inline __device__ u32x operator ^  (const u32x a, const u32x b) { return u32x ((a.s0 ^  b.s0), (a.s1 ^  b.s1), (a.s2 ^  b.s2), (a.s3 ^  b.s3), (a.s4 ^  b.s4), (a.s5 ^  b.s5), (a.s6 ^  b.s6), (a.s7 ^  b.s7));  }
+
+inline __device__ u32x operator |  (const u32x a, const u32  b) { return u32x ((a.s0 |  b),    (a.s1 |  b)   , (a.s2 |  b),    (a.s3 |  b)   , (a.s4 |  b),    (a.s5 |  b)   , (a.s6 |  b),    (a.s7 |  b)   );  }
+inline __device__ u32x operator |  (const u32x a, const u32x b) { return u32x ((a.s0 |  b.s0), (a.s1 |  b.s1), (a.s2 |  b.s2), (a.s3 |  b.s3), (a.s4 |  b.s4), (a.s5 |  b.s5), (a.s6 |  b.s6), (a.s7 |  b.s7));  }
+
+inline __device__ u32x operator &  (const u32x a, const u32  b) { return u32x ((a.s0 &  b),    (a.s1 &  b)   , (a.s2 &  b),    (a.s3 &  b)   , (a.s4 &  b),    (a.s5 &  b)   , (a.s6 &  b),    (a.s7 &  b)   );  }
+inline __device__ u32x operator &  (const u32x a, const u32x b) { return u32x ((a.s0 &  b.s0), (a.s1 &  b.s1), (a.s2 &  b.s2), (a.s3 &  b.s3), (a.s4 &  b.s4), (a.s5 &  b.s5), (a.s6 &  b.s6), (a.s7 &  b.s7));  }
+
+inline __device__ u32x operator +  (const u32x a, const u32  b) { return u32x ((a.s0 +  b),    (a.s1 +  b)   , (a.s2 +  b),    (a.s3 +  b)   , (a.s4 +  b),    (a.s5 +  b)   , (a.s6 +  b),    (a.s7 +  b)   );  }
+inline __device__ u32x operator +  (const u32x a, const u32x b) { return u32x ((a.s0 +  b.s0), (a.s1 +  b.s1), (a.s2 +  b.s2), (a.s3 +  b.s3), (a.s4 +  b.s4), (a.s5 +  b.s5), (a.s6 +  b.s6), (a.s7 +  b.s7));  }
+
+inline __device__ u32x operator -  (const u32x a, const u32  b) { return u32x ((a.s0 -  b),    (a.s1 -  b)   , (a.s2 -  b),    (a.s3 -  b)   , (a.s4 -  b),    (a.s5 -  b)   , (a.s6 -  b),    (a.s7 -  b)   );  }
+inline __device__ u32x operator -  (const u32x a, const u32x b) { return u32x ((a.s0 -  b.s0), (a.s1 -  b.s1), (a.s2 -  b.s2), (a.s3 -  b.s3), (a.s4 -  b.s4), (a.s5 -  b.s5), (a.s6 -  b.s6), (a.s7 -  b.s7));  }
+
+inline __device__ u32x operator *  (const u32x a, const u32  b) { return u32x ((a.s0 *  b),    (a.s1 *  b)   , (a.s2 *  b),    (a.s3 *  b)   , (a.s4 *  b),    (a.s5 *  b)   , (a.s6 *  b),    (a.s7 *  b)   );  }
+inline __device__ u32x operator *  (const u32x a, const u32x b) { return u32x ((a.s0 *  b.s0), (a.s1 *  b.s1), (a.s2 *  b.s2), (a.s3 *  b.s3), (a.s4 *  b.s4), (a.s5 *  b.s5), (a.s6 *  b.s6), (a.s7 *  b.s7));  }
+
+inline __device__ u32x operator ~  (const u32x a) { return u32x (~a.s0, ~a.s1, ~a.s2, ~a.s3, ~a.s4, ~a.s5, ~a.s6, ~a.s7); }
+
+inline __device__ bool operator != (const u64x a, const u64  b) { return ((a.s0 != b)    && (a.s1 != b)    && (a.s2 != b)    && (a.s3 != b)    && (a.s4 != b)    && (a.s5 != b)    && (a.s6 != b)    && (a.s7 != b)   ); }
+inline __device__ bool operator != (const u64x a, const u64x b) { return ((a.s0 != b.s0) && (a.s1 != b.s1) && (a.s2 != b.s2) && (a.s3 != b.s3) && (a.s4 != b.s4) && (a.s5 != b.s5) && (a.s6 != b.s6) && (a.s7 != b.s7)); }
+
+inline __device__ void operator ^= (u64x &a, const u64  b) { a.s0 ^= b;    a.s1 ^= b;    a.s2 ^= b;    a.s3 ^= b;    a.s4 ^= b;    a.s5 ^= b;    a.s6 ^= b;    a.s7 ^= b;     }
+inline __device__ void operator ^= (u64x &a, const u64x b) { a.s0 ^= b.s0; a.s1 ^= b.s1; a.s2 ^= b.s2; a.s3 ^= b.s3; a.s4 ^= b.s4; a.s5 ^= b.s5; a.s6 ^= b.s6; a.s7 ^= b.s7;  }
+
+inline __device__ void operator |= (u64x &a, const u64  b) { a.s0 |= b;    a.s1 |= b;    a.s2 |= b;    a.s3 |= b;    a.s4 |= b;    a.s5 |= b;    a.s6 |= b;    a.s7 |= b;     }
+inline __device__ void operator |= (u64x &a, const u64x b) { a.s0 |= b.s0; a.s1 |= b.s1; a.s2 |= b.s2; a.s3 |= b.s3; a.s4 |= b.s4; a.s5 |= b.s5; a.s6 |= b.s6; a.s7 |= b.s7;  }
+
+inline __device__ void operator &= (u64x &a, const u64  b) { a.s0 &= b;    a.s1 &= b;    a.s2 &= b;    a.s3 &= b;    a.s4 &= b;    a.s5 &= b;    a.s6 &= b;    a.s7 &= b;     }
+inline __device__ void operator &= (u64x &a, const u64x b) { a.s0 &= b.s0; a.s1 &= b.s1; a.s2 &= b.s2; a.s3 &= b.s3; a.s4 &= b.s4; a.s5 &= b.s5; a.s6 &= b.s6; a.s7 &= b.s7;  }
+
+inline __device__ void operator += (u64x &a, const u64  b) { a.s0 += b;    a.s1 += b;    a.s2 += b;    a.s3 += b;    a.s4 += b;    a.s5 += b;    a.s6 += b;    a.s7 += b;     }
+inline __device__ void operator += (u64x &a, const u64x b) { a.s0 += b.s0; a.s1 += b.s1; a.s2 += b.s2; a.s3 += b.s3; a.s4 += b.s4; a.s5 += b.s5; a.s6 += b.s6; a.s7 += b.s7;  }
+
+inline __device__ void operator -= (u64x &a, const u64  b) { a.s0 -= b;    a.s1 -= b;    a.s2 -= b;    a.s3 -= b;    a.s4 -= b;    a.s5 -= b;    a.s6 -= b;    a.s7 -= b;     }
+inline __device__ void operator -= (u64x &a, const u64x b) { a.s0 -= b.s0; a.s1 -= b.s1; a.s2 -= b.s2; a.s3 -= b.s3; a.s4 -= b.s4; a.s5 -= b.s5; a.s6 -= b.s6; a.s7 -= b.s7;  }
+
+inline __device__ void operator *= (u64x &a, const u64  b) { a.s0 *= b;    a.s1 *= b;    a.s2 *= b;    a.s3 *= b;    a.s4 *= b;    a.s5 *= b;    a.s6 *= b;    a.s7 *= b;     }
+inline __device__ void operator *= (u64x &a, const u64x b) { a.s0 *= b.s0; a.s1 *= b.s1; a.s2 *= b.s2; a.s3 *= b.s3; a.s4 *= b.s4; a.s5 *= b.s5; a.s6 *= b.s6; a.s7 *= b.s7;  }
+
+inline __device__ void operator >>= (u64x &a, const u64  b) { a.s0 >>= b;    a.s1 >>= b;    a.s2 >>= b;    a.s3 >>= b;    a.s4 >>= b;    a.s5 >>= b;    a.s6 >>= b;    a.s7 >>= b;     }
+inline __device__ void operator >>= (u64x &a, const u64x b) { a.s0 >>= b.s0; a.s1 >>= b.s1; a.s2 >>= b.s2; a.s3 >>= b.s3; a.s4 >>= b.s4; a.s5 >>= b.s5; a.s6 >>= b.s6; a.s7 >>= b.s7;  }
+
+inline __device__ void operator <<= (u64x &a, const u64  b) { a.s0 <<= b;    a.s1 <<= b;    a.s2 <<= b;    a.s3 <<= b;    a.s4 <<= b;    a.s5 <<= b;    a.s6 <<= b;    a.s7 <<= b;     }
+inline __device__ void operator <<= (u64x &a, const u64x b) { a.s0 <<= b.s0; a.s1 <<= b.s1; a.s2 <<= b.s2; a.s3 <<= b.s3; a.s4 <<= b.s4; a.s5 <<= b.s5; a.s6 <<= b.s6; a.s7 <<= b.s7;  }
+
+inline __device__ u64x operator << (const u64x a, const u64  b) { return u64x ((a.s0 << b),    (a.s1 << b)   , (a.s2 << b),    (a.s3 << b)   , (a.s4 << b),    (a.s5 << b)   , (a.s6 << b),    (a.s7 << b)   );  }
+inline __device__ u64x operator << (const u64x a, const u64x b) { return u64x ((a.s0 << b.s0), (a.s1 << b.s1), (a.s2 << b.s2), (a.s3 << b.s3), (a.s4 << b.s4), (a.s5 << b.s5), (a.s6 << b.s6), (a.s7 << b.s7));  }
+
+inline __device__ u64x operator >> (const u64x a, const u64  b) { return u64x ((a.s0 >> b),    (a.s1 >> b)   , (a.s2 >> b),    (a.s3 >> b)   , (a.s4 >> b),    (a.s5 >> b)   , (a.s6 >> b),    (a.s7 >> b)   );  }
+inline __device__ u64x operator >> (const u64x a, const u64x b) { return u64x ((a.s0 >> b.s0), (a.s1 >> b.s1), (a.s2 >> b.s2), (a.s3 >> b.s3), (a.s4 >> b.s4), (a.s5 >> b.s5), (a.s6 >> b.s6), (a.s7 >> b.s7));  }
+
+inline __device__ u64x operator ^  (const u64x a, const u64  b) { return u64x ((a.s0 ^  b),    (a.s1 ^  b)   , (a.s2 ^  b),    (a.s3 ^  b)   , (a.s4 ^  b),    (a.s5 ^  b)   , (a.s6 ^  b),    (a.s7 ^  b)   );  }
+inline __device__ u64x operator ^  (const u64x a, const u64x b) { return u64x ((a.s0 ^  b.s0), (a.s1 ^  b.s1), (a.s2 ^  b.s2), (a.s3 ^  b.s3), (a.s4 ^  b.s4), (a.s5 ^  b.s5), (a.s6 ^  b.s6), (a.s7 ^  b.s7));  }
+
+inline __device__ u64x operator |  (const u64x a, const u64  b) { return u64x ((a.s0 |  b),    (a.s1 |  b)   , (a.s2 |  b),    (a.s3 |  b)   , (a.s4 |  b),    (a.s5 |  b)   , (a.s6 |  b),    (a.s7 |  b)   );  }
+inline __device__ u64x operator |  (const u64x a, const u64x b) { return u64x ((a.s0 |  b.s0), (a.s1 |  b.s1), (a.s2 |  b.s2), (a.s3 |  b.s3), (a.s4 |  b.s4), (a.s5 |  b.s5), (a.s6 |  b.s6), (a.s7 |  b.s7));  }
+
+inline __device__ u64x operator &  (const u64x a, const u64  b) { return u64x ((a.s0 &  b),    (a.s1 &  b)   , (a.s2 &  b),    (a.s3 &  b)   , (a.s4 &  b),    (a.s5 &  b)   , (a.s6 &  b),    (a.s7 &  b)   );  }
+inline __device__ u64x operator &  (const u64x a, const u64x b) { return u64x ((a.s0 &  b.s0), (a.s1 &  b.s1), (a.s2 &  b.s2), (a.s3 &  b.s3), (a.s4 &  b.s4), (a.s5 &  b.s5), (a.s6 &  b.s6), (a.s7 &  b.s7));  }
+
+inline __device__ u64x operator +  (const u64x a, const u64  b) { return u64x ((a.s0 +  b),    (a.s1 +  b)   , (a.s2 +  b),    (a.s3 +  b)   , (a.s4 +  b),    (a.s5 +  b)   , (a.s6 +  b),    (a.s7 +  b)   );  }
+inline __device__ u64x operator +  (const u64x a, const u64x b) { return u64x ((a.s0 +  b.s0), (a.s1 +  b.s1), (a.s2 +  b.s2), (a.s3 +  b.s3), (a.s4 +  b.s4), (a.s5 +  b.s5), (a.s6 +  b.s6), (a.s7 +  b.s7));  }
+
+inline __device__ u64x operator -  (const u64x a, const u64  b) { return u64x ((a.s0 -  b),    (a.s1 -  b)   , (a.s2 -  b),    (a.s3 -  b)   , (a.s4 -  b),    (a.s5 -  b)   , (a.s6 -  b),    (a.s7 -  b)   );  }
+inline __device__ u64x operator -  (const u64x a, const u64x b) { return u64x ((a.s0 -  b.s0), (a.s1 -  b.s1), (a.s2 -  b.s2), (a.s3 -  b.s3), (a.s4 -  b.s4), (a.s5 -  b.s5), (a.s6 -  b.s6), (a.s7 -  b.s7));  }
+
+inline __device__ u64x operator *  (const u64x a, const u64  b) { return u64x ((a.s0 *  b),    (a.s1 *  b)   , (a.s2 *  b),    (a.s3 *  b)   , (a.s4 *  b),    (a.s5 *  b)   , (a.s6 *  b),    (a.s7 *  b)   );  }
+inline __device__ u64x operator *  (const u64x a, const u64x b) { return u64x ((a.s0 *  b.s0), (a.s1 *  b.s1), (a.s2 *  b.s2), (a.s3 *  b.s3), (a.s4 *  b.s4), (a.s5 *  b.s5), (a.s6 *  b.s6), (a.s7 *  b.s7));  }
+
+inline __device__ u64x operator ~  (const u64x a) { return u64x (~a.s0, ~a.s1, ~a.s2, ~a.s3, ~a.s4, ~a.s5, ~a.s6, ~a.s7); }
+
+#endif
+
+#if VECT_SIZE == 16
+
+struct __device_builtin__ __builtin_align__(16) u8x
+{
+  u8 s0;
+  u8 s1;
+  u8 s2;
+  u8 s3;
+  u8 s4;
+  u8 s5;
+  u8 s6;
+  u8 s7;
+  u8 s8;
+  u8 s9;
+  u8 sa;
+  u8 sb;
+  u8 sc;
+  u8 sd;
+  u8 se;
+  u8 sf;
+
+  inline __device__  u8x (const u8 a, const u8 b, const u8 c, const u8 d, const u8 e, const u8 f, const u8 g, const u8 h, const u8 i, const u8 j, const u8 k, const u8 l, const u8 m, const u8 n, const u8 o, const u8 p) : s0(a), s1(b), s2(c), s3(d), s4(e), s5(f), s6(g), s7(h), s8(i), s9(j), sa(k), sb(l), sc(m), sd(n), se(o), sf(p) { }
+  inline __device__  u8x (const u8 a)                                                                                                                                                                                     : s0(a), s1(a), s2(a), s3(a), s4(a), s5(a), s6(a), s7(a), s8(a), s9(a), sa(a), sb(a), sc(a), sd(a), se(a), sf(a) { }
+
+  inline __device__  u8x (void) : s0(0), s1(0), s2(0), s3(0), s4(0), s5(0), s6(0), s7(0), s8(0), s9(0), sa(0), sb(0), sc(0), sd(0), se(0), sf(0) { }
+  inline __device__ ~u8x (void) { }
+};
+
+struct __device_builtin__ __builtin_align__(32) u16x
+{
+  u16 s0;
+  u16 s1;
+  u16 s2;
+  u16 s3;
+  u16 s4;
+  u16 s5;
+  u16 s6;
+  u16 s7;
+  u16 s8;
+  u16 s9;
+  u16 sa;
+  u16 sb;
+  u16 sc;
+  u16 sd;
+  u16 se;
+  u16 sf;
+
+  inline __device__  u16x (const u16 a, const u16 b, const u16 c, const u16 d, const u16 e, const u16 f, const u16 g, const u16 h, const u16 i, const u16 j, const u16 k, const u16 l, const u16 m, const u16 n, const u16 o, const u16 p) : s0(a), s1(b), s2(c), s3(d), s4(e), s5(f), s6(g), s7(h), s8(i), s9(j), sa(k), sb(l), sc(m), sd(n), se(o), sf(p) { }
+  inline __device__  u16x (const u16 a)                                                                                                                                                                                     : s0(a), s1(a), s2(a), s3(a), s4(a), s5(a), s6(a), s7(a), s8(a), s9(a), sa(a), sb(a), sc(a), sd(a), se(a), sf(a) { }
+
+  inline __device__  u16x (void) : s0(0), s1(0), s2(0), s3(0), s4(0), s5(0), s6(0), s7(0), s8(0), s9(0), sa(0), sb(0), sc(0), sd(0), se(0), sf(0){ }
+  inline __device__ ~u16x (void) { }
+};
+
+struct __device_builtin__ __builtin_align__(64) u32x
+{
+  u32 s0;
+  u32 s1;
+  u32 s2;
+  u32 s3;
+  u32 s4;
+  u32 s5;
+  u32 s6;
+  u32 s7;
+  u32 s8;
+  u32 s9;
+  u32 sa;
+  u32 sb;
+  u32 sc;
+  u32 sd;
+  u32 se;
+  u32 sf;
+
+  inline __device__  u32x (const u32 a, const u32 b, const u32 c, const u32 d, const u32 e, const u32 f, const u32 g, const u32 h, const u32 i, const u32 j, const u32 k, const u32 l, const u32 m, const u32 n, const u32 o, const u32 p) : s0(a), s1(b), s2(c), s3(d), s4(e), s5(f), s6(g), s7(h), s8(i), s9(j), sa(k), sb(l), sc(m), sd(n), se(o), sf(p) { }
+  inline __device__  u32x (const u32 a)                                                                                                                                                                                     : s0(a), s1(a), s2(a), s3(a), s4(a), s5(a), s6(a), s7(a), s8(a), s9(a), sa(a), sb(a), sc(a), sd(a), se(a), sf(a) { }
+
+  inline __device__  u32x (void) : s0(0), s1(0), s2(0), s3(0), s4(0), s5(0), s6(0), s7(0), s8(0), s9(0), sa(0), sb(0), sc(0), sd(0), se(0), sf(0){ }
+  inline __device__ ~u32x (void) { }
+};
+
+struct __device_builtin__ __builtin_align__(128) u64x
+{
+  u64 s0;
+  u64 s1;
+  u64 s2;
+  u64 s3;
+  u64 s4;
+  u64 s5;
+  u64 s6;
+  u64 s7;
+  u64 s8;
+  u64 s9;
+  u64 sa;
+  u64 sb;
+  u64 sc;
+  u64 sd;
+  u64 se;
+  u64 sf;
+
+  inline __device__  u64x (const u64 a, const u64 b, const u64 c, const u64 d, const u64 e, const u64 f, const u64 g, const u64 h, const u64 i, const u64 j, const u64 k, const u64 l, const u64 m, const u64 n, const u64 o, const u64 p) : s0(a), s1(b), s2(c), s3(d), s4(e), s5(f), s6(g), s7(h), s8(i), s9(j), sa(k), sb(l), sc(m), sd(n), se(o), sf(p) { }
+  inline __device__  u64x (const u64 a)                                                                                                                                                                                     : s0(a), s1(a), s2(a), s3(a), s4(a), s5(a), s6(a), s7(a), s8(a), s9(a), sa(a), sb(a), sc(a), sd(a), se(a), sf(a) { }
+
+  inline __device__  u64x (void) : s0(0), s1(0), s2(0), s3(0), s4(0), s5(0), s6(0), s7(0), s8(0), s9(0), sa(0), sb(0), sc(0), sd(0), se(0), sf(0) { }
+  inline __device__ ~u64x (void) { }
+};
+
+inline __device__ bool operator != (const u32x a, const u32  b) { return ((a.s0 != b)    && (a.s1 != b)    && (a.s2 != b)    && (a.s3 != b)    && (a.s4 != b)    && (a.s5 != b)    && (a.s6 != b)    && (a.s7 != b)    && (a.s8 != b)    && (a.s9 != b)    && (a.sa != b)    && (a.sb != b)    && (a.sc != b)    && (a.sd != b)    && (a.se != b)    && (a.sf != b)   ); }
+inline __device__ bool operator != (const u32x a, const u32x b) { return ((a.s0 != b.s0) && (a.s1 != b.s1) && (a.s2 != b.s2) && (a.s3 != b.s3) && (a.s4 != b.s4) && (a.s5 != b.s5) && (a.s6 != b.s6) && (a.s7 != b.s7) && (a.s8 != b.s8) && (a.s9 != b.s9) && (a.sa != b.sa) && (a.sb != b.sb) && (a.sc != b.sc) && (a.sd != b.sd) && (a.se != b.se) && (a.sf != b.sf)); }
+
+inline __device__ void operator ^= (u32x &a, const u32  b) { a.s0 ^= b;    a.s1 ^= b;    a.s2 ^= b;    a.s3 ^= b;    a.s4 ^= b;    a.s5 ^= b;    a.s6 ^= b;    a.s7 ^= b;    a.s8 ^= b;    a.s9 ^= b;    a.sa ^= b;    a.sb ^= b;    a.sc ^= b;    a.sd ^= b;    a.se ^= b;    a.sf ^= b;    }
+inline __device__ void operator ^= (u32x &a, const u32x b) { a.s0 ^= b.s0; a.s1 ^= b.s1; a.s2 ^= b.s2; a.s3 ^= b.s3; a.s4 ^= b.s4; a.s5 ^= b.s5; a.s6 ^= b.s6; a.s7 ^= b.s7; a.s8 ^= b.s8; a.s9 ^= b.s9; a.sa ^= b.sa; a.sb ^= b.sb; a.sc ^= b.sc; a.sd ^= b.sd; a.se ^= b.se; a.sf ^= b.sf; }
+
+inline __device__ void operator |= (u32x &a, const u32  b) { a.s0 |= b;    a.s1 |= b;    a.s2 |= b;    a.s3 |= b;    a.s4 |= b;    a.s5 |= b;    a.s6 |= b;    a.s7 |= b;    a.s8 |= b;    a.s9 |= b;    a.sa |= b;    a.sb |= b;    a.sc |= b;    a.sd |= b;    a.se |= b;    a.sf |= b;    }
+inline __device__ void operator |= (u32x &a, const u32x b) { a.s0 |= b.s0; a.s1 |= b.s1; a.s2 |= b.s2; a.s3 |= b.s3; a.s4 |= b.s4; a.s5 |= b.s5; a.s6 |= b.s6; a.s7 |= b.s7; a.s8 |= b.s8; a.s9 |= b.s9; a.sa |= b.sa; a.sb |= b.sb; a.sc |= b.sc; a.sd |= b.sd; a.se |= b.se; a.sf |= b.sf; }
+
+inline __device__ void operator &= (u32x &a, const u32  b) { a.s0 &= b;    a.s1 &= b;    a.s2 &= b;    a.s3 &= b;    a.s4 &= b;    a.s5 &= b;    a.s6 &= b;    a.s7 &= b;    a.s8 &= b;    a.s9 &= b;    a.sa &= b;    a.sb &= b;    a.sc &= b;    a.sd &= b;    a.se &= b;    a.sf &= b;    }
+inline __device__ void operator &= (u32x &a, const u32x b) { a.s0 &= b.s0; a.s1 &= b.s1; a.s2 &= b.s2; a.s3 &= b.s3; a.s4 &= b.s4; a.s5 &= b.s5; a.s6 &= b.s6; a.s7 &= b.s7; a.s8 &= b.s8; a.s9 &= b.s9; a.sa &= b.sa; a.sb &= b.sb; a.sc &= b.sc; a.sd &= b.sd; a.se &= b.se; a.sf &= b.sf; }
+
+inline __device__ void operator += (u32x &a, const u32  b) { a.s0 += b;    a.s1 += b;    a.s2 += b;    a.s3 += b;    a.s4 += b;    a.s5 += b;    a.s6 += b;    a.s7 += b;    a.s8 += b;    a.s9 += b;    a.sa += b;    a.sb += b;    a.sc += b;    a.sd += b;    a.se += b;    a.sf += b;    }
+inline __device__ void operator += (u32x &a, const u32x b) { a.s0 += b.s0; a.s1 += b.s1; a.s2 += b.s2; a.s3 += b.s3; a.s4 += b.s4; a.s5 += b.s5; a.s6 += b.s6; a.s7 += b.s7; a.s8 += b.s8; a.s9 += b.s9; a.sa += b.sa; a.sb += b.sb; a.sc += b.sc; a.sd += b.sd; a.se += b.se; a.sf += b.sf; }
+
+inline __device__ void operator -= (u32x &a, const u32  b) { a.s0 -= b;    a.s1 -= b;    a.s2 -= b;    a.s3 -= b;    a.s4 -= b;    a.s5 -= b;    a.s6 -= b;    a.s7 -= b;    a.s8 -= b;    a.s9 -= b;    a.sa -= b;    a.sb -= b;    a.sc -= b;    a.sd -= b;    a.se -= b;    a.sf -= b;    }
+inline __device__ void operator -= (u32x &a, const u32x b) { a.s0 -= b.s0; a.s1 -= b.s1; a.s2 -= b.s2; a.s3 -= b.s3; a.s4 -= b.s4; a.s5 -= b.s5; a.s6 -= b.s6; a.s7 -= b.s7; a.s8 -= b.s8; a.s9 -= b.s9; a.sa -= b.sa; a.sb -= b.sb; a.sc -= b.sc; a.sd -= b.sd; a.se -= b.se; a.sf -= b.sf; }
+
+inline __device__ void operator *= (u32x &a, const u32  b) { a.s0 *= b;    a.s1 *= b;    a.s2 *= b;    a.s3 *= b;    a.s4 *= b;    a.s5 *= b;    a.s6 *= b;    a.s7 *= b;    a.s8 *= b;    a.s9 *= b;    a.sa *= b;    a.sb *= b;    a.sc *= b;    a.sd *= b;    a.se *= b;    a.sf *= b;    }
+inline __device__ void operator *= (u32x &a, const u32x b) { a.s0 *= b.s0; a.s1 *= b.s1; a.s2 *= b.s2; a.s3 *= b.s3; a.s4 *= b.s4; a.s5 *= b.s5; a.s6 *= b.s6; a.s7 *= b.s7; a.s8 *= b.s8; a.s9 *= b.s9; a.sa *= b.sa; a.sb *= b.sb; a.sc *= b.sc; a.sd *= b.sd; a.se *= b.se; a.sf *= b.sf; }
+
+inline __device__ void operator >>= (u32x &a, const u32  b) { a.s0 >>= b;    a.s1 >>= b;    a.s2 >>= b;    a.s3 >>= b;    a.s4 >>= b;    a.s5 >>= b;    a.s6 >>= b;    a.s7 >>= b;    a.s8 >>= b;    a.s9 >>= b;    a.sa >>= b;    a.sb >>= b;    a.sc >>= b;    a.sd >>= b;    a.se >>= b;    a.sf >>= b;    }
+inline __device__ void operator >>= (u32x &a, const u32x b) { a.s0 >>= b.s0; a.s1 >>= b.s1; a.s2 >>= b.s2; a.s3 >>= b.s3; a.s4 >>= b.s4; a.s5 >>= b.s5; a.s6 >>= b.s6; a.s7 >>= b.s7; a.s8 >>= b.s8; a.s9 >>= b.s9; a.sa >>= b.sa; a.sb >>= b.sb; a.sc >>= b.sc; a.sd >>= b.sd; a.se >>= b.se; a.sf >>= b.sf; }
+
+inline __device__ void operator <<= (u32x &a, const u32  b) { a.s0 <<= b;    a.s1 <<= b;    a.s2 <<= b;    a.s3 <<= b;    a.s4 <<= b;    a.s5 <<= b;    a.s6 <<= b;    a.s7 <<= b;    a.s8 <<= b;    a.s9 <<= b;    a.sa <<= b;    a.sb <<= b;    a.sc <<= b;    a.sd <<= b;    a.se <<= b;    a.sf <<= b;    }
+inline __device__ void operator <<= (u32x &a, const u32x b) { a.s0 <<= b.s0; a.s1 <<= b.s1; a.s2 <<= b.s2; a.s3 <<= b.s3; a.s4 <<= b.s4; a.s5 <<= b.s5; a.s6 <<= b.s6; a.s7 <<= b.s7; a.s8 <<= b.s8; a.s9 <<= b.s9; a.sa <<= b.sa; a.sb <<= b.sb; a.sc <<= b.sc; a.sd <<= b.sd; a.se <<= b.se; a.sf <<= b.sf; }
+
+inline __device__ u32x operator << (const u32x a, const u32  b) { return u32x ((a.s0 << b),    (a.s1 << b)   , (a.s2 << b),    (a.s3 << b)   , (a.s4 << b),    (a.s5 << b)   , (a.s6 << b),    (a.s7 << b),    (a.s8 << b),    (a.s9 << b)   , (a.sa << b),    (a.sb << b)   , (a.sc << b),    (a.sd << b)   , (a.se << b),    (a.sf << b)   );  }
+inline __device__ u32x operator << (const u32x a, const u32x b) { return u32x ((a.s0 << b.s0), (a.s1 << b.s1), (a.s2 << b.s2), (a.s3 << b.s3), (a.s4 << b.s4), (a.s5 << b.s5), (a.s6 << b.s6), (a.s7 << b.s7), (a.s8 << b.s8), (a.s9 << b.s9), (a.sa << b.sa), (a.sb << b.sb), (a.sc << b.sc), (a.sd << b.sd), (a.se << b.se), (a.sf << b.sf));  }
+
+inline __device__ u32x operator >> (const u32x a, const u32  b) { return u32x ((a.s0 >> b),    (a.s1 >> b)   , (a.s2 >> b),    (a.s3 >> b)   , (a.s4 >> b),    (a.s5 >> b)   , (a.s6 >> b),    (a.s7 >> b),    (a.s8 >> b),    (a.s9 >> b)   , (a.sa >> b),    (a.sb >> b)   , (a.sc >> b),    (a.sd >> b)   , (a.se >> b),    (a.sf >> b)   );  }
+inline __device__ u32x operator >> (const u32x a, const u32x b) { return u32x ((a.s0 >> b.s0), (a.s1 >> b.s1), (a.s2 >> b.s2), (a.s3 >> b.s3), (a.s4 >> b.s4), (a.s5 >> b.s5), (a.s6 >> b.s6), (a.s7 >> b.s7), (a.s8 >> b.s8), (a.s9 >> b.s9), (a.sa >> b.sa), (a.sb >> b.sb), (a.sc >> b.sc), (a.sd >> b.sd), (a.se >> b.se), (a.sf >> b.sf));  }
+
+inline __device__ u32x operator ^  (const u32x a, const u32  b) { return u32x ((a.s0 ^  b),    (a.s1 ^  b)   , (a.s2 ^  b),    (a.s3 ^  b)   , (a.s4 ^  b),    (a.s5 ^  b)   , (a.s6 ^  b),    (a.s7 ^  b),    (a.s8 ^  b),    (a.s9 ^  b)   , (a.sa ^  b),    (a.sb ^  b)   , (a.sc ^  b),    (a.sd ^  b)   , (a.se ^  b),    (a.sf ^  b)   );  }
+inline __device__ u32x operator ^  (const u32x a, const u32x b) { return u32x ((a.s0 ^  b.s0), (a.s1 ^  b.s1), (a.s2 ^  b.s2), (a.s3 ^  b.s3), (a.s4 ^  b.s4), (a.s5 ^  b.s5), (a.s6 ^  b.s6), (a.s7 ^  b.s7), (a.s8 ^  b.s8), (a.s9 ^  b.s9), (a.sa ^  b.sa), (a.sb ^  b.sb), (a.sc ^  b.sc), (a.sd ^  b.sd), (a.se ^  b.se), (a.sf ^  b.sf));  }
+
+inline __device__ u32x operator |  (const u32x a, const u32  b) { return u32x ((a.s0 |  b),    (a.s1 |  b)   , (a.s2 |  b),    (a.s3 |  b)   , (a.s4 |  b),    (a.s5 |  b)   , (a.s6 |  b),    (a.s7 |  b),    (a.s8 |  b),    (a.s9 |  b)   , (a.sa |  b),    (a.sb |  b)   , (a.sc |  b),    (a.sd |  b)   , (a.se |  b),    (a.sf |  b)   );  }
+inline __device__ u32x operator |  (const u32x a, const u32x b) { return u32x ((a.s0 |  b.s0), (a.s1 |  b.s1), (a.s2 |  b.s2), (a.s3 |  b.s3), (a.s4 |  b.s4), (a.s5 |  b.s5), (a.s6 |  b.s6), (a.s7 |  b.s7), (a.s8 |  b.s8), (a.s9 |  b.s9), (a.sa |  b.sa), (a.sb |  b.sb), (a.sc |  b.sc), (a.sd |  b.sd), (a.se |  b.se), (a.sf |  b.sf));  }
+
+inline __device__ u32x operator &  (const u32x a, const u32  b) { return u32x ((a.s0 &  b),    (a.s1 &  b)   , (a.s2 &  b),    (a.s3 &  b)   , (a.s4 &  b),    (a.s5 &  b)   , (a.s6 &  b),    (a.s7 &  b),    (a.s8 &  b),    (a.s9 &  b)   , (a.sa &  b),    (a.sb &  b)   , (a.sc &  b),    (a.sd &  b)   , (a.se &  b),    (a.sf &  b)   );  }
+inline __device__ u32x operator &  (const u32x a, const u32x b) { return u32x ((a.s0 &  b.s0), (a.s1 &  b.s1), (a.s2 &  b.s2), (a.s3 &  b.s3), (a.s4 &  b.s4), (a.s5 &  b.s5), (a.s6 &  b.s6), (a.s7 &  b.s7), (a.s8 &  b.s8), (a.s9 &  b.s9), (a.sa &  b.sa), (a.sb &  b.sb), (a.sc &  b.sc), (a.sd &  b.sd), (a.se &  b.se), (a.sf &  b.sf));  }
+
+inline __device__ u32x operator +  (const u32x a, const u32  b) { return u32x ((a.s0 +  b),    (a.s1 +  b)   , (a.s2 +  b),    (a.s3 +  b)   , (a.s4 +  b),    (a.s5 +  b)   , (a.s6 +  b),    (a.s7 +  b),    (a.s8 +  b),    (a.s9 +  b)   , (a.sa +  b),    (a.sb +  b)   , (a.sc +  b),    (a.sd +  b)   , (a.se +  b),    (a.sf +  b)   );  }
+inline __device__ u32x operator +  (const u32x a, const u32x b) { return u32x ((a.s0 +  b.s0), (a.s1 +  b.s1), (a.s2 +  b.s2), (a.s3 +  b.s3), (a.s4 +  b.s4), (a.s5 +  b.s5), (a.s6 +  b.s6), (a.s7 +  b.s7), (a.s8 +  b.s8), (a.s9 +  b.s9), (a.sa +  b.sa), (a.sb +  b.sb), (a.sc +  b.sc), (a.sd +  b.sd), (a.se +  b.se), (a.sf +  b.sf));  }
+
+inline __device__ u32x operator -  (const u32x a, const u32  b) { return u32x ((a.s0 -  b),    (a.s1 -  b)   , (a.s2 -  b),    (a.s3 -  b)   , (a.s4 -  b),    (a.s5 -  b)   , (a.s6 -  b),    (a.s7 -  b),    (a.s8 -  b),    (a.s9 -  b)   , (a.sa -  b),    (a.sb -  b)   , (a.sc -  b),    (a.sd -  b)   , (a.se -  b),    (a.sf -  b)   );  }
+inline __device__ u32x operator -  (const u32x a, const u32x b) { return u32x ((a.s0 -  b.s0), (a.s1 -  b.s1), (a.s2 -  b.s2), (a.s3 -  b.s3), (a.s4 -  b.s4), (a.s5 -  b.s5), (a.s6 -  b.s6), (a.s7 -  b.s7), (a.s8 -  b.s8), (a.s9 -  b.s9), (a.sa -  b.sa), (a.sb -  b.sb), (a.sc -  b.sc), (a.sd -  b.sd), (a.se -  b.se), (a.sf -  b.sf));  }
+
+inline __device__ u32x operator *  (const u32x a, const u32  b) { return u32x ((a.s0 *  b),    (a.s1 *  b)   , (a.s2 *  b),    (a.s3 *  b)   , (a.s4 *  b),    (a.s5 *  b)   , (a.s6 *  b),    (a.s7 *  b),    (a.s8 *  b),    (a.s9 *  b)   , (a.sa *  b),    (a.sb *  b)   , (a.sc *  b),    (a.sd *  b)   , (a.se *  b),    (a.sf *  b)   );  }
+inline __device__ u32x operator *  (const u32x a, const u32x b) { return u32x ((a.s0 *  b.s0), (a.s1 *  b.s1), (a.s2 *  b.s2), (a.s3 *  b.s3), (a.s4 *  b.s4), (a.s5 *  b.s5), (a.s6 *  b.s6), (a.s7 *  b.s7), (a.s8 *  b.s8), (a.s9 *  b.s9), (a.sa *  b.sa), (a.sb *  b.sb), (a.sc *  b.sc), (a.sd *  b.sd), (a.se *  b.se), (a.sf *  b.sf));  }
+
+inline __device__ u32x operator ~  (const u32x a) { return u32x (~a.s0, ~a.s1, ~a.s2, ~a.s3, ~a.s4, ~a.s5, ~a.s6, ~a.s7, ~a.s8, ~a.s9, ~a.sa, ~a.sb, ~a.sc, ~a.sd, ~a.se, ~a.sf); }
+
+inline __device__ bool operator != (const u64x a, const u64  b) { return ((a.s0 != b)    && (a.s1 != b)    && (a.s2 != b)    && (a.s3 != b)    && (a.s4 != b)    && (a.s5 != b)    && (a.s6 != b)    && (a.s7 != b)    && (a.s8 != b)    && (a.s9 != b)    && (a.sa != b)    && (a.sb != b)    && (a.sc != b)    && (a.sd != b)    && (a.se != b)    && (a.sf != b)   ); }
+inline __device__ bool operator != (const u64x a, const u64x b) { return ((a.s0 != b.s0) && (a.s1 != b.s1) && (a.s2 != b.s2) && (a.s3 != b.s3) && (a.s4 != b.s4) && (a.s5 != b.s5) && (a.s6 != b.s6) && (a.s7 != b.s7) && (a.s8 != b.s8) && (a.s9 != b.s9) && (a.sa != b.sa) && (a.sb != b.sb) && (a.sc != b.sc) && (a.sd != b.sd) && (a.se != b.se) && (a.sf != b.sf)); }
+
+inline __device__ void operator ^= (u64x &a, const u64  b) { a.s0 ^= b;    a.s1 ^= b;    a.s2 ^= b;    a.s3 ^= b;    a.s4 ^= b;    a.s5 ^= b;    a.s6 ^= b;    a.s7 ^= b;    a.s8 ^= b;    a.s9 ^= b;    a.sa ^= b;    a.sb ^= b;    a.sc ^= b;    a.sd ^= b;    a.se ^= b;    a.sf ^= b;    }
+inline __device__ void operator ^= (u64x &a, const u64x b) { a.s0 ^= b.s0; a.s1 ^= b.s1; a.s2 ^= b.s2; a.s3 ^= b.s3; a.s4 ^= b.s4; a.s5 ^= b.s5; a.s6 ^= b.s6; a.s7 ^= b.s7; a.s8 ^= b.s8; a.s9 ^= b.s9; a.sa ^= b.sa; a.sb ^= b.sb; a.sc ^= b.sc; a.sd ^= b.sd; a.se ^= b.se; a.sf ^= b.sf; }
+
+inline __device__ void operator |= (u64x &a, const u64  b) { a.s0 |= b;    a.s1 |= b;    a.s2 |= b;    a.s3 |= b;    a.s4 |= b;    a.s5 |= b;    a.s6 |= b;    a.s7 |= b;    a.s8 |= b;    a.s9 |= b;    a.sa |= b;    a.sb |= b;    a.sc |= b;    a.sd |= b;    a.se |= b;    a.sf |= b;    }
+inline __device__ void operator |= (u64x &a, const u64x b) { a.s0 |= b.s0; a.s1 |= b.s1; a.s2 |= b.s2; a.s3 |= b.s3; a.s4 |= b.s4; a.s5 |= b.s5; a.s6 |= b.s6; a.s7 |= b.s7; a.s8 |= b.s8; a.s9 |= b.s9; a.sa |= b.sa; a.sb |= b.sb; a.sc |= b.sc; a.sd |= b.sd; a.se |= b.se; a.sf |= b.sf; }
+
+inline __device__ void operator &= (u64x &a, const u64  b) { a.s0 &= b;    a.s1 &= b;    a.s2 &= b;    a.s3 &= b;    a.s4 &= b;    a.s5 &= b;    a.s6 &= b;    a.s7 &= b;    a.s8 &= b;    a.s9 &= b;    a.sa &= b;    a.sb &= b;    a.sc &= b;    a.sd &= b;    a.se &= b;    a.sf &= b;    }
+inline __device__ void operator &= (u64x &a, const u64x b) { a.s0 &= b.s0; a.s1 &= b.s1; a.s2 &= b.s2; a.s3 &= b.s3; a.s4 &= b.s4; a.s5 &= b.s5; a.s6 &= b.s6; a.s7 &= b.s7; a.s8 &= b.s8; a.s9 &= b.s9; a.sa &= b.sa; a.sb &= b.sb; a.sc &= b.sc; a.sd &= b.sd; a.se &= b.se; a.sf &= b.sf; }
+
+inline __device__ void operator += (u64x &a, const u64  b) { a.s0 += b;    a.s1 += b;    a.s2 += b;    a.s3 += b;    a.s4 += b;    a.s5 += b;    a.s6 += b;    a.s7 += b;    a.s8 += b;    a.s9 += b;    a.sa += b;    a.sb += b;    a.sc += b;    a.sd += b;    a.se += b;    a.sf += b;    }
+inline __device__ void operator += (u64x &a, const u64x b) { a.s0 += b.s0; a.s1 += b.s1; a.s2 += b.s2; a.s3 += b.s3; a.s4 += b.s4; a.s5 += b.s5; a.s6 += b.s6; a.s7 += b.s7; a.s8 += b.s8; a.s9 += b.s9; a.sa += b.sa; a.sb += b.sb; a.sc += b.sc; a.sd += b.sd; a.se += b.se; a.sf += b.sf; }
+
+inline __device__ void operator -= (u64x &a, const u64  b) { a.s0 -= b;    a.s1 -= b;    a.s2 -= b;    a.s3 -= b;    a.s4 -= b;    a.s5 -= b;    a.s6 -= b;    a.s7 -= b;    a.s8 -= b;    a.s9 -= b;    a.sa -= b;    a.sb -= b;    a.sc -= b;    a.sd -= b;    a.se -= b;    a.sf -= b;    }
+inline __device__ void operator -= (u64x &a, const u64x b) { a.s0 -= b.s0; a.s1 -= b.s1; a.s2 -= b.s2; a.s3 -= b.s3; a.s4 -= b.s4; a.s5 -= b.s5; a.s6 -= b.s6; a.s7 -= b.s7; a.s8 -= b.s8; a.s9 -= b.s9; a.sa -= b.sa; a.sb -= b.sb; a.sc -= b.sc; a.sd -= b.sd; a.se -= b.se; a.sf -= b.sf; }
+
+inline __device__ void operator *= (u64x &a, const u64  b) { a.s0 *= b;    a.s1 *= b;    a.s2 *= b;    a.s3 *= b;    a.s4 *= b;    a.s5 *= b;    a.s6 *= b;    a.s7 *= b;    a.s8 *= b;    a.s9 *= b;    a.sa *= b;    a.sb *= b;    a.sc *= b;    a.sd *= b;    a.se *= b;    a.sf *= b;    }
+inline __device__ void operator *= (u64x &a, const u64x b) { a.s0 *= b.s0; a.s1 *= b.s1; a.s2 *= b.s2; a.s3 *= b.s3; a.s4 *= b.s4; a.s5 *= b.s5; a.s6 *= b.s6; a.s7 *= b.s7; a.s8 *= b.s8; a.s9 *= b.s9; a.sa *= b.sa; a.sb *= b.sb; a.sc *= b.sc; a.sd *= b.sd; a.se *= b.se; a.sf *= b.sf; }
+
+inline __device__ void operator >>= (u64x &a, const u64  b) { a.s0 >>= b;    a.s1 >>= b;    a.s2 >>= b;    a.s3 >>= b;    a.s4 >>= b;    a.s5 >>= b;    a.s6 >>= b;    a.s7 >>= b;    a.s8 >>= b;    a.s9 >>= b;    a.sa >>= b;    a.sb >>= b;    a.sc >>= b;    a.sd >>= b;    a.se >>= b;    a.sf >>= b;    }
+inline __device__ void operator >>= (u64x &a, const u64x b) { a.s0 >>= b.s0; a.s1 >>= b.s1; a.s2 >>= b.s2; a.s3 >>= b.s3; a.s4 >>= b.s4; a.s5 >>= b.s5; a.s6 >>= b.s6; a.s7 >>= b.s7; a.s8 >>= b.s8; a.s9 >>= b.s9; a.sa >>= b.sa; a.sb >>= b.sb; a.sc >>= b.sc; a.sd >>= b.sd; a.se >>= b.se; a.sf >>= b.sf; }
+
+inline __device__ void operator <<= (u64x &a, const u64  b) { a.s0 <<= b;    a.s1 <<= b;    a.s2 <<= b;    a.s3 <<= b;    a.s4 <<= b;    a.s5 <<= b;    a.s6 <<= b;    a.s7 <<= b;    a.s8 <<= b;    a.s9 <<= b;    a.sa <<= b;    a.sb <<= b;    a.sc <<= b;    a.sd <<= b;    a.se <<= b;    a.sf <<= b;    }
+inline __device__ void operator <<= (u64x &a, const u64x b) { a.s0 <<= b.s0; a.s1 <<= b.s1; a.s2 <<= b.s2; a.s3 <<= b.s3; a.s4 <<= b.s4; a.s5 <<= b.s5; a.s6 <<= b.s6; a.s7 <<= b.s7; a.s8 <<= b.s8; a.s9 <<= b.s9; a.sa <<= b.sa; a.sb <<= b.sb; a.sc <<= b.sc; a.sd <<= b.sd; a.se <<= b.se; a.sf <<= b.sf; }
+
+inline __device__ u64x operator << (const u64x a, const u64  b) { return u64x ((a.s0 << b),    (a.s1 << b)   , (a.s2 << b),    (a.s3 << b)   , (a.s4 << b),    (a.s5 << b)   , (a.s6 << b),    (a.s7 << b),    (a.s8 << b),    (a.s9 << b)   , (a.sa << b),    (a.sb << b)   , (a.sc << b),    (a.sd << b)   , (a.se << b),    (a.sf << b)   );  }
+inline __device__ u64x operator << (const u64x a, const u64x b) { return u64x ((a.s0 << b.s0), (a.s1 << b.s1), (a.s2 << b.s2), (a.s3 << b.s3), (a.s4 << b.s4), (a.s5 << b.s5), (a.s6 << b.s6), (a.s7 << b.s7), (a.s8 << b.s8), (a.s9 << b.s9), (a.sa << b.sa), (a.sb << b.sb), (a.sc << b.sc), (a.sd << b.sd), (a.se << b.se), (a.sf << b.sf));  }
+
+inline __device__ u64x operator >> (const u64x a, const u64  b) { return u64x ((a.s0 >> b),    (a.s1 >> b)   , (a.s2 >> b),    (a.s3 >> b)   , (a.s4 >> b),    (a.s5 >> b)   , (a.s6 >> b),    (a.s7 >> b),    (a.s8 >> b),    (a.s9 >> b)   , (a.sa >> b),    (a.sb >> b)   , (a.sc >> b),    (a.sd >> b)   , (a.se >> b),    (a.sf >> b)   );  }
+inline __device__ u64x operator >> (const u64x a, const u64x b) { return u64x ((a.s0 >> b.s0), (a.s1 >> b.s1), (a.s2 >> b.s2), (a.s3 >> b.s3), (a.s4 >> b.s4), (a.s5 >> b.s5), (a.s6 >> b.s6), (a.s7 >> b.s7), (a.s8 >> b.s8), (a.s9 >> b.s9), (a.sa >> b.sa), (a.sb >> b.sb), (a.sc >> b.sc), (a.sd >> b.sd), (a.se >> b.se), (a.sf >> b.sf));  }
+
+inline __device__ u64x operator ^  (const u64x a, const u64  b) { return u64x ((a.s0 ^  b),    (a.s1 ^  b)   , (a.s2 ^  b),    (a.s3 ^  b)   , (a.s4 ^  b),    (a.s5 ^  b)   , (a.s6 ^  b),    (a.s7 ^  b),    (a.s8 ^  b),    (a.s9 ^  b)   , (a.sa ^  b),    (a.sb ^  b)   , (a.sc ^  b),    (a.sd ^  b)   , (a.se ^  b),    (a.sf ^  b)   );  }
+inline __device__ u64x operator ^  (const u64x a, const u64x b) { return u64x ((a.s0 ^  b.s0), (a.s1 ^  b.s1), (a.s2 ^  b.s2), (a.s3 ^  b.s3), (a.s4 ^  b.s4), (a.s5 ^  b.s5), (a.s6 ^  b.s6), (a.s7 ^  b.s7), (a.s8 ^  b.s8), (a.s9 ^  b.s9), (a.sa ^  b.sa), (a.sb ^  b.sb), (a.sc ^  b.sc), (a.sd ^  b.sd), (a.se ^  b.se), (a.sf ^  b.sf));  }
+
+inline __device__ u64x operator |  (const u64x a, const u64  b) { return u64x ((a.s0 |  b),    (a.s1 |  b)   , (a.s2 |  b),    (a.s3 |  b)   , (a.s4 |  b),    (a.s5 |  b)   , (a.s6 |  b),    (a.s7 |  b),    (a.s8 |  b),    (a.s9 |  b)   , (a.sa |  b),    (a.sb |  b)   , (a.sc |  b),    (a.sd |  b)   , (a.se |  b),    (a.sf |  b)   );  }
+inline __device__ u64x operator |  (const u64x a, const u64x b) { return u64x ((a.s0 |  b.s0), (a.s1 |  b.s1), (a.s2 |  b.s2), (a.s3 |  b.s3), (a.s4 |  b.s4), (a.s5 |  b.s5), (a.s6 |  b.s6), (a.s7 |  b.s7), (a.s8 |  b.s8), (a.s9 |  b.s9), (a.sa |  b.sa), (a.sb |  b.sb), (a.sc |  b.sc), (a.sd |  b.sd), (a.se |  b.se), (a.sf |  b.sf));  }
+
+inline __device__ u64x operator &  (const u64x a, const u64  b) { return u64x ((a.s0 &  b),    (a.s1 &  b)   , (a.s2 &  b),    (a.s3 &  b)   , (a.s4 &  b),    (a.s5 &  b)   , (a.s6 &  b),    (a.s7 &  b),    (a.s8 &  b),    (a.s9 &  b)   , (a.sa &  b),    (a.sb &  b)   , (a.sc &  b),    (a.sd &  b)   , (a.se &  b),    (a.sf &  b)   );  }
+inline __device__ u64x operator &  (const u64x a, const u64x b) { return u64x ((a.s0 &  b.s0), (a.s1 &  b.s1), (a.s2 &  b.s2), (a.s3 &  b.s3), (a.s4 &  b.s4), (a.s5 &  b.s5), (a.s6 &  b.s6), (a.s7 &  b.s7), (a.s8 &  b.s8), (a.s9 &  b.s9), (a.sa &  b.sa), (a.sb &  b.sb), (a.sc &  b.sc), (a.sd &  b.sd), (a.se &  b.se), (a.sf &  b.sf));  }
+
+inline __device__ u64x operator +  (const u64x a, const u64  b) { return u64x ((a.s0 +  b),    (a.s1 +  b)   , (a.s2 +  b),    (a.s3 +  b)   , (a.s4 +  b),    (a.s5 +  b)   , (a.s6 +  b),    (a.s7 +  b),    (a.s8 +  b),    (a.s9 +  b)   , (a.sa +  b),    (a.sb +  b)   , (a.sc +  b),    (a.sd +  b)   , (a.se +  b),    (a.sf +  b)   );  }
+inline __device__ u64x operator +  (const u64x a, const u64x b) { return u64x ((a.s0 +  b.s0), (a.s1 +  b.s1), (a.s2 +  b.s2), (a.s3 +  b.s3), (a.s4 +  b.s4), (a.s5 +  b.s5), (a.s6 +  b.s6), (a.s7 +  b.s7), (a.s8 +  b.s8), (a.s9 +  b.s9), (a.sa +  b.sa), (a.sb +  b.sb), (a.sc +  b.sc), (a.sd +  b.sd), (a.se +  b.se), (a.sf +  b.sf));  }
+
+inline __device__ u64x operator -  (const u64x a, const u64  b) { return u64x ((a.s0 -  b),    (a.s1 -  b)   , (a.s2 -  b),    (a.s3 -  b)   , (a.s4 -  b),    (a.s5 -  b)   , (a.s6 -  b),    (a.s7 -  b),    (a.s8 -  b),    (a.s9 -  b)   , (a.sa -  b),    (a.sb -  b)   , (a.sc -  b),    (a.sd -  b)   , (a.se -  b),    (a.sf -  b)   );  }
+inline __device__ u64x operator -  (const u64x a, const u64x b) { return u64x ((a.s0 -  b.s0), (a.s1 -  b.s1), (a.s2 -  b.s2), (a.s3 -  b.s3), (a.s4 -  b.s4), (a.s5 -  b.s5), (a.s6 -  b.s6), (a.s7 -  b.s7), (a.s8 -  b.s8), (a.s9 -  b.s9), (a.sa -  b.sa), (a.sb -  b.sb), (a.sc -  b.sc), (a.sd -  b.sd), (a.se -  b.se), (a.sf -  b.sf));  }
+
+inline __device__ u64x operator *  (const u64x a, const u64  b) { return u64x ((a.s0 *  b),    (a.s1 *  b)   , (a.s2 *  b),    (a.s3 *  b)   , (a.s4 *  b),    (a.s5 *  b)   , (a.s6 *  b),    (a.s7 *  b),    (a.s8 *  b),    (a.s9 *  b)   , (a.sa *  b),    (a.sb *  b)   , (a.sc *  b),    (a.sd *  b)   , (a.se *  b),    (a.sf *  b)   );  }
+inline __device__ u64x operator *  (const u64x a, const u64x b) { return u64x ((a.s0 *  b.s0), (a.s1 *  b.s1), (a.s2 *  b.s2), (a.s3 *  b.s3), (a.s4 *  b.s4), (a.s5 *  b.s5), (a.s6 *  b.s6), (a.s7 *  b.s7), (a.s8 *  b.s8), (a.s9 *  b.s9), (a.sa *  b.sa), (a.sb *  b.sb), (a.sc *  b.sc), (a.sd *  b.sd), (a.se *  b.se), (a.sf *  b.sf));  }
+
+inline __device__ u64x operator ~  (const u64x a) { return u64x (~a.s0, ~a.s1, ~a.s2, ~a.s3, ~a.s4, ~a.s5, ~a.s6, ~a.s7, ~a.s8, ~a.s9, ~a.sa, ~a.sb, ~a.sc, ~a.sd, ~a.se, ~a.sf); }
+
+#endif
+
+typedef __device_builtin__ struct u8x  u8x;
+typedef __device_builtin__ struct u16x u16x;
+typedef __device_builtin__ struct u32x u32x;
+typedef __device_builtin__ struct u64x u64x;
+
+#define make_u8x  u8x
+#define make_u16x u16x
+#define make_u32x u32x
+#define make_u64x u64x
+
 #else
 typedef VTYPE(uchar,  VECT_SIZE)  u8x;
 typedef VTYPE(ushort, VECT_SIZE) u16x;
 typedef VTYPE(uint,   VECT_SIZE) u32x;
 typedef VTYPE(ulong,  VECT_SIZE) u64x;
+
+#define make_u8x  (u8x)
+#define make_u16x (u16x)
+#define make_u32x (u32x)
+#define make_u64x (u64x)
+
+#endif
 #endif
 
 // unions
@@ -58,17 +812,19 @@ typedef union vconv32
 
   struct
   {
-    u16 v16a;
-    u16 v16b;
-  };
+    u16 a;
+    u16 b;
+
+  } v16;
 
   struct
   {
-    u8 v8a;
-    u8 v8b;
-    u8 v8c;
-    u8 v8d;
-  };
+    u8 a;
+    u8 b;
+    u8 c;
+    u8 d;
+
+  } v8;
 
 } vconv32_t;
 
@@ -78,29 +834,32 @@ typedef union vconv64
 
   struct
   {
-    u32 v32a;
-    u32 v32b;
-  };
+    u32 a;
+    u32 b;
+
+  } v32;
 
   struct
   {
-    u16 v16a;
-    u16 v16b;
-    u16 v16c;
-    u16 v16d;
-  };
+    u16 a;
+    u16 b;
+    u16 c;
+    u16 d;
+
+  } v16;
 
   struct
   {
-    u8 v8a;
-    u8 v8b;
-    u8 v8c;
-    u8 v8d;
-    u8 v8e;
-    u8 v8f;
-    u8 v8g;
-    u8 v8h;
-  };
+    u8 a;
+    u8 b;
+    u8 c;
+    u8 d;
+    u8 e;
+    u8 f;
+    u8 g;
+    u8 h;
+
+  } v8;
 
 } vconv64_t;
 
diff --git a/OpenCL/inc_vendor.h b/OpenCL/inc_vendor.h
index ba85cbc74..f2f201e19 100644
--- a/OpenCL/inc_vendor.h
+++ b/OpenCL/inc_vendor.h
@@ -6,14 +6,33 @@
 #ifndef _INC_VENDOR_H
 #define _INC_VENDOR_H
 
-#ifdef _CPU_OPENCL_EMU_H
+#if defined _CPU_OPENCL_EMU_H
+#define IS_NATIVE
+#elif defined __CUDACC__
+#define IS_CUDA
+#else
+#define IS_OPENCL
+#endif
+
+#if defined IS_NATIVE
+#define CONSTANT_VK
 #define CONSTANT_AS
 #define GLOBAL_AS
+#define LOCAL_VK
 #define LOCAL_AS
 #define KERNEL_FQ
-#else
+#elif defined IS_CUDA
+#define CONSTANT_VK __constant__
+#define CONSTANT_AS
+#define GLOBAL_AS
+#define LOCAL_VK    __shared__
+#define LOCAL_AS
+#define KERNEL_FQ   extern "C" __global__
+#elif defined IS_OPENCL
+#define CONSTANT_VK __constant
 #define CONSTANT_AS __constant
 #define GLOBAL_AS   __global
+#define LOCAL_VK    __local
 #define LOCAL_AS    __local
 #define KERNEL_FQ   __kernel
 #endif
diff --git a/OpenCL/inc_veracrypt_xts.cl b/OpenCL/inc_veracrypt_xts.cl
index 8709db662..ba3b7eef5 100644
--- a/OpenCL/inc_veracrypt_xts.cl
+++ b/OpenCL/inc_veracrypt_xts.cl
@@ -5,6 +5,7 @@
 
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.h"
 #include "inc_common.h"
 #include "inc_cipher_aes.h"
 #include "inc_cipher_serpent.h"
diff --git a/OpenCL/m00000_a0-optimized.cl b/OpenCL/m00000_a0-optimized.cl
index c7c4fec28..da224b637 100644
--- a/OpenCL/m00000_a0-optimized.cl
+++ b/OpenCL/m00000_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
diff --git a/OpenCL/m00000_a0-pure.cl b/OpenCL/m00000_a0-pure.cl
index 2ed90f5c1..ea2699153 100644
--- a/OpenCL/m00000_a0-pure.cl
+++ b/OpenCL/m00000_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
diff --git a/OpenCL/m00000_a1-optimized.cl b/OpenCL/m00000_a1-optimized.cl
index 50fa4a568..4b49148a2 100644
--- a/OpenCL/m00000_a1-optimized.cl
+++ b/OpenCL/m00000_a1-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_simd.cl"
diff --git a/OpenCL/m00000_a1-pure.cl b/OpenCL/m00000_a1-pure.cl
index eacb3910d..5bbb281c8 100644
--- a/OpenCL/m00000_a1-pure.cl
+++ b/OpenCL/m00000_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_md5.cl"
diff --git a/OpenCL/m00000_a3-optimized.cl b/OpenCL/m00000_a3-optimized.cl
index 72403afb8..1a7fb4f9a 100644
--- a/OpenCL/m00000_a3-optimized.cl
+++ b/OpenCL/m00000_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
@@ -16,7 +17,7 @@
 #define MD5_STEP_REV(f,a,b,c,d,x,t,s)   \
 {                                       \
   a -= b;                               \
-  a  = hc_rotr32_S (a, s);                 \
+  a  = hc_rotr32_S (a, s);              \
   a -= f (b, c, d);                     \
   a -= x;                               \
   a -= t;                               \
@@ -25,7 +26,7 @@
 #define MD5_STEP_REV1(f,a,b,c,d,x,t,s)  \
 {                                       \
   a -= b;                               \
-  a  = hc_rotr32_S (a, s);                 \
+  a  = hc_rotr32_S (a, s);              \
   a -= x;                               \
   a -= t;                               \
 }
diff --git a/OpenCL/m00000_a3-pure.cl b/OpenCL/m00000_a3-pure.cl
index ee6072389..9ff74dcc3 100644
--- a/OpenCL/m00000_a3-pure.cl
+++ b/OpenCL/m00000_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
diff --git a/OpenCL/m00010_a0-optimized.cl b/OpenCL/m00010_a0-optimized.cl
index 7859648fb..e735558df 100644
--- a/OpenCL/m00010_a0-optimized.cl
+++ b/OpenCL/m00010_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
diff --git a/OpenCL/m00010_a0-pure.cl b/OpenCL/m00010_a0-pure.cl
index 1b9765de2..047a01c9a 100644
--- a/OpenCL/m00010_a0-pure.cl
+++ b/OpenCL/m00010_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
diff --git a/OpenCL/m00010_a1-optimized.cl b/OpenCL/m00010_a1-optimized.cl
index 14e5ab986..ece0ac663 100644
--- a/OpenCL/m00010_a1-optimized.cl
+++ b/OpenCL/m00010_a1-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
diff --git a/OpenCL/m00010_a1-pure.cl b/OpenCL/m00010_a1-pure.cl
index fbc397d7d..14f0c9271 100644
--- a/OpenCL/m00010_a1-pure.cl
+++ b/OpenCL/m00010_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_md5.cl"
diff --git a/OpenCL/m00010_a3-optimized.cl b/OpenCL/m00010_a3-optimized.cl
index 0367d7cfe..a101428b1 100644
--- a/OpenCL/m00010_a3-optimized.cl
+++ b/OpenCL/m00010_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
diff --git a/OpenCL/m00010_a3-pure.cl b/OpenCL/m00010_a3-pure.cl
index 362207256..caa63b0b5 100644
--- a/OpenCL/m00010_a3-pure.cl
+++ b/OpenCL/m00010_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
diff --git a/OpenCL/m00020_a0-optimized.cl b/OpenCL/m00020_a0-optimized.cl
index 9e1445335..7becd0173 100644
--- a/OpenCL/m00020_a0-optimized.cl
+++ b/OpenCL/m00020_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
diff --git a/OpenCL/m00020_a0-pure.cl b/OpenCL/m00020_a0-pure.cl
index ee18c5f6b..43eb1158b 100644
--- a/OpenCL/m00020_a0-pure.cl
+++ b/OpenCL/m00020_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
diff --git a/OpenCL/m00020_a1-optimized.cl b/OpenCL/m00020_a1-optimized.cl
index dc37febfd..536b9ba16 100644
--- a/OpenCL/m00020_a1-optimized.cl
+++ b/OpenCL/m00020_a1-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
diff --git a/OpenCL/m00020_a1-pure.cl b/OpenCL/m00020_a1-pure.cl
index 10b96807d..818d352aa 100644
--- a/OpenCL/m00020_a1-pure.cl
+++ b/OpenCL/m00020_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_md5.cl"
diff --git a/OpenCL/m00020_a3-optimized.cl b/OpenCL/m00020_a3-optimized.cl
index 1dc9be105..939ce5710 100644
--- a/OpenCL/m00020_a3-optimized.cl
+++ b/OpenCL/m00020_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
diff --git a/OpenCL/m00020_a3-pure.cl b/OpenCL/m00020_a3-pure.cl
index d937f3f82..5cc9805f4 100644
--- a/OpenCL/m00020_a3-pure.cl
+++ b/OpenCL/m00020_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
diff --git a/OpenCL/m00030_a0-optimized.cl b/OpenCL/m00030_a0-optimized.cl
index 7b9cd9516..f6ba857a3 100644
--- a/OpenCL/m00030_a0-optimized.cl
+++ b/OpenCL/m00030_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
diff --git a/OpenCL/m00030_a0-pure.cl b/OpenCL/m00030_a0-pure.cl
index ac7ebbd6d..22f4ae6a7 100644
--- a/OpenCL/m00030_a0-pure.cl
+++ b/OpenCL/m00030_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
diff --git a/OpenCL/m00030_a1-optimized.cl b/OpenCL/m00030_a1-optimized.cl
index f67c54811..e40e653db 100644
--- a/OpenCL/m00030_a1-optimized.cl
+++ b/OpenCL/m00030_a1-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
diff --git a/OpenCL/m00030_a1-pure.cl b/OpenCL/m00030_a1-pure.cl
index ac7582c66..fb974b0a2 100644
--- a/OpenCL/m00030_a1-pure.cl
+++ b/OpenCL/m00030_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_md5.cl"
diff --git a/OpenCL/m00030_a3-optimized.cl b/OpenCL/m00030_a3-optimized.cl
index a1505af17..3c5a808f9 100644
--- a/OpenCL/m00030_a3-optimized.cl
+++ b/OpenCL/m00030_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
diff --git a/OpenCL/m00030_a3-pure.cl b/OpenCL/m00030_a3-pure.cl
index dedf84f53..053896651 100644
--- a/OpenCL/m00030_a3-pure.cl
+++ b/OpenCL/m00030_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
diff --git a/OpenCL/m00040_a0-optimized.cl b/OpenCL/m00040_a0-optimized.cl
index 87a03f7de..d04315627 100644
--- a/OpenCL/m00040_a0-optimized.cl
+++ b/OpenCL/m00040_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
diff --git a/OpenCL/m00040_a0-pure.cl b/OpenCL/m00040_a0-pure.cl
index d13e699e0..ef4f774f0 100644
--- a/OpenCL/m00040_a0-pure.cl
+++ b/OpenCL/m00040_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
diff --git a/OpenCL/m00040_a1-optimized.cl b/OpenCL/m00040_a1-optimized.cl
index 10031fee8..c5e5d3002 100644
--- a/OpenCL/m00040_a1-optimized.cl
+++ b/OpenCL/m00040_a1-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
diff --git a/OpenCL/m00040_a1-pure.cl b/OpenCL/m00040_a1-pure.cl
index e32823a40..842ac0fd8 100644
--- a/OpenCL/m00040_a1-pure.cl
+++ b/OpenCL/m00040_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_md5.cl"
diff --git a/OpenCL/m00040_a3-optimized.cl b/OpenCL/m00040_a3-optimized.cl
index bdfa5f16c..bb9bc38a6 100644
--- a/OpenCL/m00040_a3-optimized.cl
+++ b/OpenCL/m00040_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
diff --git a/OpenCL/m00040_a3-pure.cl b/OpenCL/m00040_a3-pure.cl
index 4da1b7438..ca7d7b843 100644
--- a/OpenCL/m00040_a3-pure.cl
+++ b/OpenCL/m00040_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
diff --git a/OpenCL/m00050_a0-optimized.cl b/OpenCL/m00050_a0-optimized.cl
index 944006caa..f94854ed8 100644
--- a/OpenCL/m00050_a0-optimized.cl
+++ b/OpenCL/m00050_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
diff --git a/OpenCL/m00050_a0-pure.cl b/OpenCL/m00050_a0-pure.cl
index 38fba8341..8ad06fa3b 100644
--- a/OpenCL/m00050_a0-pure.cl
+++ b/OpenCL/m00050_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
diff --git a/OpenCL/m00050_a1-optimized.cl b/OpenCL/m00050_a1-optimized.cl
index 6cebd87a4..650aec269 100644
--- a/OpenCL/m00050_a1-optimized.cl
+++ b/OpenCL/m00050_a1-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
diff --git a/OpenCL/m00050_a1-pure.cl b/OpenCL/m00050_a1-pure.cl
index 927d48fcb..0cd0bd0f0 100644
--- a/OpenCL/m00050_a1-pure.cl
+++ b/OpenCL/m00050_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_md5.cl"
diff --git a/OpenCL/m00050_a3-optimized.cl b/OpenCL/m00050_a3-optimized.cl
index a19259f1f..f3d733ceb 100644
--- a/OpenCL/m00050_a3-optimized.cl
+++ b/OpenCL/m00050_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
diff --git a/OpenCL/m00050_a3-pure.cl b/OpenCL/m00050_a3-pure.cl
index 3e0adaee3..693d3862f 100644
--- a/OpenCL/m00050_a3-pure.cl
+++ b/OpenCL/m00050_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
diff --git a/OpenCL/m00060_a0-optimized.cl b/OpenCL/m00060_a0-optimized.cl
index e9656ced9..7546e66d2 100644
--- a/OpenCL/m00060_a0-optimized.cl
+++ b/OpenCL/m00060_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
diff --git a/OpenCL/m00060_a0-pure.cl b/OpenCL/m00060_a0-pure.cl
index 9efc3ea1e..dfa8c6205 100644
--- a/OpenCL/m00060_a0-pure.cl
+++ b/OpenCL/m00060_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
diff --git a/OpenCL/m00060_a1-optimized.cl b/OpenCL/m00060_a1-optimized.cl
index 25159ee25..2bc59801c 100644
--- a/OpenCL/m00060_a1-optimized.cl
+++ b/OpenCL/m00060_a1-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
diff --git a/OpenCL/m00060_a1-pure.cl b/OpenCL/m00060_a1-pure.cl
index fa247bfc8..db7376a83 100644
--- a/OpenCL/m00060_a1-pure.cl
+++ b/OpenCL/m00060_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_md5.cl"
diff --git a/OpenCL/m00060_a3-optimized.cl b/OpenCL/m00060_a3-optimized.cl
index b46767385..1e051b965 100644
--- a/OpenCL/m00060_a3-optimized.cl
+++ b/OpenCL/m00060_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
diff --git a/OpenCL/m00060_a3-pure.cl b/OpenCL/m00060_a3-pure.cl
index c44f0ea10..73f2302a9 100644
--- a/OpenCL/m00060_a3-pure.cl
+++ b/OpenCL/m00060_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
diff --git a/OpenCL/m00100_a0-optimized.cl b/OpenCL/m00100_a0-optimized.cl
index 1f91fc49a..70363b391 100644
--- a/OpenCL/m00100_a0-optimized.cl
+++ b/OpenCL/m00100_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
diff --git a/OpenCL/m00100_a0-pure.cl b/OpenCL/m00100_a0-pure.cl
index 3b6068af9..82e4d7b62 100644
--- a/OpenCL/m00100_a0-pure.cl
+++ b/OpenCL/m00100_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
diff --git a/OpenCL/m00100_a1-optimized.cl b/OpenCL/m00100_a1-optimized.cl
index 2dade3f9a..ab46a7c10 100644
--- a/OpenCL/m00100_a1-optimized.cl
+++ b/OpenCL/m00100_a1-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m00100_a1-pure.cl b/OpenCL/m00100_a1-pure.cl
index 2f0114d61..807ddc5ce 100644
--- a/OpenCL/m00100_a1-pure.cl
+++ b/OpenCL/m00100_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m00100_a3-optimized.cl b/OpenCL/m00100_a3-optimized.cl
index 7584036a7..6e01fc490 100644
--- a/OpenCL/m00100_a3-optimized.cl
+++ b/OpenCL/m00100_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m00100_a3-pure.cl b/OpenCL/m00100_a3-pure.cl
index 881f45ee4..32189c46f 100644
--- a/OpenCL/m00100_a3-pure.cl
+++ b/OpenCL/m00100_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m00110_a0-optimized.cl b/OpenCL/m00110_a0-optimized.cl
index 475409428..d210c5f11 100644
--- a/OpenCL/m00110_a0-optimized.cl
+++ b/OpenCL/m00110_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
diff --git a/OpenCL/m00110_a0-pure.cl b/OpenCL/m00110_a0-pure.cl
index 5103f8a56..1e3cc3c58 100644
--- a/OpenCL/m00110_a0-pure.cl
+++ b/OpenCL/m00110_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
diff --git a/OpenCL/m00110_a1-optimized.cl b/OpenCL/m00110_a1-optimized.cl
index ec9810df8..57a87affd 100644
--- a/OpenCL/m00110_a1-optimized.cl
+++ b/OpenCL/m00110_a1-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m00110_a1-pure.cl b/OpenCL/m00110_a1-pure.cl
index 294def602..ca1e2285d 100644
--- a/OpenCL/m00110_a1-pure.cl
+++ b/OpenCL/m00110_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m00110_a3-optimized.cl b/OpenCL/m00110_a3-optimized.cl
index 9dcc6624c..187f5b658 100644
--- a/OpenCL/m00110_a3-optimized.cl
+++ b/OpenCL/m00110_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m00110_a3-pure.cl b/OpenCL/m00110_a3-pure.cl
index c17b03694..8f3662640 100644
--- a/OpenCL/m00110_a3-pure.cl
+++ b/OpenCL/m00110_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m00120_a0-optimized.cl b/OpenCL/m00120_a0-optimized.cl
index 75527d42f..18c94f7be 100644
--- a/OpenCL/m00120_a0-optimized.cl
+++ b/OpenCL/m00120_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
diff --git a/OpenCL/m00120_a0-pure.cl b/OpenCL/m00120_a0-pure.cl
index 653e2d0f8..3af320dfa 100644
--- a/OpenCL/m00120_a0-pure.cl
+++ b/OpenCL/m00120_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
diff --git a/OpenCL/m00120_a1-optimized.cl b/OpenCL/m00120_a1-optimized.cl
index 486297953..f5c28c971 100644
--- a/OpenCL/m00120_a1-optimized.cl
+++ b/OpenCL/m00120_a1-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m00120_a1-pure.cl b/OpenCL/m00120_a1-pure.cl
index c2c5d1f59..ef1256f90 100644
--- a/OpenCL/m00120_a1-pure.cl
+++ b/OpenCL/m00120_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m00120_a3-optimized.cl b/OpenCL/m00120_a3-optimized.cl
index 2864927bd..4e57defe7 100644
--- a/OpenCL/m00120_a3-optimized.cl
+++ b/OpenCL/m00120_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m00120_a3-pure.cl b/OpenCL/m00120_a3-pure.cl
index 6d5cf8c7c..d5bc699d3 100644
--- a/OpenCL/m00120_a3-pure.cl
+++ b/OpenCL/m00120_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m00130_a0-optimized.cl b/OpenCL/m00130_a0-optimized.cl
index becbbbc52..2712d0b23 100644
--- a/OpenCL/m00130_a0-optimized.cl
+++ b/OpenCL/m00130_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
diff --git a/OpenCL/m00130_a0-pure.cl b/OpenCL/m00130_a0-pure.cl
index 9a8493ffa..b9214191f 100644
--- a/OpenCL/m00130_a0-pure.cl
+++ b/OpenCL/m00130_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
diff --git a/OpenCL/m00130_a1-optimized.cl b/OpenCL/m00130_a1-optimized.cl
index 64b2505f7..24d30df82 100644
--- a/OpenCL/m00130_a1-optimized.cl
+++ b/OpenCL/m00130_a1-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m00130_a1-pure.cl b/OpenCL/m00130_a1-pure.cl
index 69f9b7de2..a042b6a68 100644
--- a/OpenCL/m00130_a1-pure.cl
+++ b/OpenCL/m00130_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m00130_a3-optimized.cl b/OpenCL/m00130_a3-optimized.cl
index c8dd293a8..28bbbb5e2 100644
--- a/OpenCL/m00130_a3-optimized.cl
+++ b/OpenCL/m00130_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m00130_a3-pure.cl b/OpenCL/m00130_a3-pure.cl
index 00e20212c..d86d1b541 100644
--- a/OpenCL/m00130_a3-pure.cl
+++ b/OpenCL/m00130_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m00140_a0-optimized.cl b/OpenCL/m00140_a0-optimized.cl
index 32eeea008..52da035e7 100644
--- a/OpenCL/m00140_a0-optimized.cl
+++ b/OpenCL/m00140_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
diff --git a/OpenCL/m00140_a0-pure.cl b/OpenCL/m00140_a0-pure.cl
index d1a14354f..54a7f92c8 100644
--- a/OpenCL/m00140_a0-pure.cl
+++ b/OpenCL/m00140_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
diff --git a/OpenCL/m00140_a1-optimized.cl b/OpenCL/m00140_a1-optimized.cl
index 1019236d7..bbe818983 100644
--- a/OpenCL/m00140_a1-optimized.cl
+++ b/OpenCL/m00140_a1-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m00140_a1-pure.cl b/OpenCL/m00140_a1-pure.cl
index 902636b85..e1a820a26 100644
--- a/OpenCL/m00140_a1-pure.cl
+++ b/OpenCL/m00140_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m00140_a3-optimized.cl b/OpenCL/m00140_a3-optimized.cl
index 8db77efc5..3693569a1 100644
--- a/OpenCL/m00140_a3-optimized.cl
+++ b/OpenCL/m00140_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m00140_a3-pure.cl b/OpenCL/m00140_a3-pure.cl
index f68773c30..a6c7a12fd 100644
--- a/OpenCL/m00140_a3-pure.cl
+++ b/OpenCL/m00140_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m00150_a0-optimized.cl b/OpenCL/m00150_a0-optimized.cl
index 65e8267bd..4e4d4c822 100644
--- a/OpenCL/m00150_a0-optimized.cl
+++ b/OpenCL/m00150_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
diff --git a/OpenCL/m00150_a0-pure.cl b/OpenCL/m00150_a0-pure.cl
index 47d679903..1e9bf4e27 100644
--- a/OpenCL/m00150_a0-pure.cl
+++ b/OpenCL/m00150_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
diff --git a/OpenCL/m00150_a1-optimized.cl b/OpenCL/m00150_a1-optimized.cl
index 3c2274d21..7ced69a81 100644
--- a/OpenCL/m00150_a1-optimized.cl
+++ b/OpenCL/m00150_a1-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m00150_a1-pure.cl b/OpenCL/m00150_a1-pure.cl
index 24153ff0f..9278eba7d 100644
--- a/OpenCL/m00150_a1-pure.cl
+++ b/OpenCL/m00150_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m00150_a3-optimized.cl b/OpenCL/m00150_a3-optimized.cl
index 45d124de5..914ea49a1 100644
--- a/OpenCL/m00150_a3-optimized.cl
+++ b/OpenCL/m00150_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m00150_a3-pure.cl b/OpenCL/m00150_a3-pure.cl
index f4703db48..a77d52177 100644
--- a/OpenCL/m00150_a3-pure.cl
+++ b/OpenCL/m00150_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m00160_a0-optimized.cl b/OpenCL/m00160_a0-optimized.cl
index 7a1be5004..eabf353d8 100644
--- a/OpenCL/m00160_a0-optimized.cl
+++ b/OpenCL/m00160_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
diff --git a/OpenCL/m00160_a0-pure.cl b/OpenCL/m00160_a0-pure.cl
index 5ae8c69d1..700766036 100644
--- a/OpenCL/m00160_a0-pure.cl
+++ b/OpenCL/m00160_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
diff --git a/OpenCL/m00160_a1-optimized.cl b/OpenCL/m00160_a1-optimized.cl
index 894c31530..bbf9c1a4e 100644
--- a/OpenCL/m00160_a1-optimized.cl
+++ b/OpenCL/m00160_a1-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m00160_a1-pure.cl b/OpenCL/m00160_a1-pure.cl
index 39a1a1c10..0b7b9a97d 100644
--- a/OpenCL/m00160_a1-pure.cl
+++ b/OpenCL/m00160_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m00160_a3-optimized.cl b/OpenCL/m00160_a3-optimized.cl
index ff0e7823a..e228783a4 100644
--- a/OpenCL/m00160_a3-optimized.cl
+++ b/OpenCL/m00160_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m00160_a3-pure.cl b/OpenCL/m00160_a3-pure.cl
index 24fb6e0ef..3d1e1e650 100644
--- a/OpenCL/m00160_a3-pure.cl
+++ b/OpenCL/m00160_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m00200_a0-optimized.cl b/OpenCL/m00200_a0-optimized.cl
index 88144ff0a..c5d334f36 100644
--- a/OpenCL/m00200_a0-optimized.cl
+++ b/OpenCL/m00200_a0-optimized.cl
@@ -9,6 +9,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
diff --git a/OpenCL/m00200_a1-optimized.cl b/OpenCL/m00200_a1-optimized.cl
index 0d907e343..5589a386a 100644
--- a/OpenCL/m00200_a1-optimized.cl
+++ b/OpenCL/m00200_a1-optimized.cl
@@ -9,6 +9,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #endif
diff --git a/OpenCL/m00200_a3-optimized.cl b/OpenCL/m00200_a3-optimized.cl
index 61481aa69..74a1c3234 100644
--- a/OpenCL/m00200_a3-optimized.cl
+++ b/OpenCL/m00200_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #endif
diff --git a/OpenCL/m00300_a0-optimized.cl b/OpenCL/m00300_a0-optimized.cl
index df387712d..22294dd64 100644
--- a/OpenCL/m00300_a0-optimized.cl
+++ b/OpenCL/m00300_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
diff --git a/OpenCL/m00300_a0-pure.cl b/OpenCL/m00300_a0-pure.cl
index 65209c666..4dcff77d8 100644
--- a/OpenCL/m00300_a0-pure.cl
+++ b/OpenCL/m00300_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
diff --git a/OpenCL/m00300_a1-optimized.cl b/OpenCL/m00300_a1-optimized.cl
index 465d60418..03dfeb1c5 100644
--- a/OpenCL/m00300_a1-optimized.cl
+++ b/OpenCL/m00300_a1-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m00300_a1-pure.cl b/OpenCL/m00300_a1-pure.cl
index 0d0614358..63728805b 100644
--- a/OpenCL/m00300_a1-pure.cl
+++ b/OpenCL/m00300_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m00300_a3-optimized.cl b/OpenCL/m00300_a3-optimized.cl
index 4fc9de868..2ae4ec0e9 100644
--- a/OpenCL/m00300_a3-optimized.cl
+++ b/OpenCL/m00300_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m00300_a3-pure.cl b/OpenCL/m00300_a3-pure.cl
index 4d4865da2..333354daf 100644
--- a/OpenCL/m00300_a3-pure.cl
+++ b/OpenCL/m00300_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m00400-optimized.cl b/OpenCL/m00400-optimized.cl
index 56c4c3956..a66c7b02d 100644
--- a/OpenCL/m00400-optimized.cl
+++ b/OpenCL/m00400-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
diff --git a/OpenCL/m00400-pure.cl b/OpenCL/m00400-pure.cl
index c251a0380..5eeb0b36d 100644
--- a/OpenCL/m00400-pure.cl
+++ b/OpenCL/m00400-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
diff --git a/OpenCL/m00500-optimized.cl b/OpenCL/m00500-optimized.cl
index ba7b6f83f..b530a23d4 100644
--- a/OpenCL/m00500-optimized.cl
+++ b/OpenCL/m00500-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_hash_md5.cl"
 #endif
diff --git a/OpenCL/m00500-pure.cl b/OpenCL/m00500-pure.cl
index d30b7d1ae..cbee878dd 100644
--- a/OpenCL/m00500-pure.cl
+++ b/OpenCL/m00500-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_hash_md5.cl"
 #endif
diff --git a/OpenCL/m00600_a0-optimized.cl b/OpenCL/m00600_a0-optimized.cl
index 024dae362..e72920690 100644
--- a/OpenCL/m00600_a0-optimized.cl
+++ b/OpenCL/m00600_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
diff --git a/OpenCL/m00600_a1-optimized.cl b/OpenCL/m00600_a1-optimized.cl
index 9af3a5fbe..9547523ac 100644
--- a/OpenCL/m00600_a1-optimized.cl
+++ b/OpenCL/m00600_a1-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #endif
diff --git a/OpenCL/m00600_a3-optimized.cl b/OpenCL/m00600_a3-optimized.cl
index 56cb67eb7..2545bdca5 100644
--- a/OpenCL/m00600_a3-optimized.cl
+++ b/OpenCL/m00600_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #endif
diff --git a/OpenCL/m00900_a0-optimized.cl b/OpenCL/m00900_a0-optimized.cl
index d7019c463..5f6dff580 100644
--- a/OpenCL/m00900_a0-optimized.cl
+++ b/OpenCL/m00900_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
diff --git a/OpenCL/m00900_a0-pure.cl b/OpenCL/m00900_a0-pure.cl
index a3d36f1f0..e8590bfd5 100644
--- a/OpenCL/m00900_a0-pure.cl
+++ b/OpenCL/m00900_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
diff --git a/OpenCL/m00900_a1-optimized.cl b/OpenCL/m00900_a1-optimized.cl
index 759fe7d42..b7df87f0a 100644
--- a/OpenCL/m00900_a1-optimized.cl
+++ b/OpenCL/m00900_a1-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md4.cl"
diff --git a/OpenCL/m00900_a1-pure.cl b/OpenCL/m00900_a1-pure.cl
index b7c4d13c7..518656787 100644
--- a/OpenCL/m00900_a1-pure.cl
+++ b/OpenCL/m00900_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_md4.cl"
diff --git a/OpenCL/m00900_a3-optimized.cl b/OpenCL/m00900_a3-optimized.cl
index a2c3b978b..11e21a483 100644
--- a/OpenCL/m00900_a3-optimized.cl
+++ b/OpenCL/m00900_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md4.cl"
diff --git a/OpenCL/m00900_a3-pure.cl b/OpenCL/m00900_a3-pure.cl
index 40f3e297c..49899c0c2 100644
--- a/OpenCL/m00900_a3-pure.cl
+++ b/OpenCL/m00900_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md4.cl"
diff --git a/OpenCL/m01000_a0-optimized.cl b/OpenCL/m01000_a0-optimized.cl
index 70127a82a..802ab947b 100644
--- a/OpenCL/m01000_a0-optimized.cl
+++ b/OpenCL/m01000_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
diff --git a/OpenCL/m01000_a0-pure.cl b/OpenCL/m01000_a0-pure.cl
index 1dbdab60d..dee609c2c 100644
--- a/OpenCL/m01000_a0-pure.cl
+++ b/OpenCL/m01000_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
diff --git a/OpenCL/m01000_a1-optimized.cl b/OpenCL/m01000_a1-optimized.cl
index eef1852e8..e3e51132d 100644
--- a/OpenCL/m01000_a1-optimized.cl
+++ b/OpenCL/m01000_a1-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md4.cl"
diff --git a/OpenCL/m01000_a1-pure.cl b/OpenCL/m01000_a1-pure.cl
index 71b2ab2e4..a08e6fedc 100644
--- a/OpenCL/m01000_a1-pure.cl
+++ b/OpenCL/m01000_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_md4.cl"
diff --git a/OpenCL/m01000_a3-optimized.cl b/OpenCL/m01000_a3-optimized.cl
index 9ede59c48..2953dca1e 100644
--- a/OpenCL/m01000_a3-optimized.cl
+++ b/OpenCL/m01000_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md4.cl"
diff --git a/OpenCL/m01000_a3-pure.cl b/OpenCL/m01000_a3-pure.cl
index 91c34a4de..ff3f6a2da 100644
--- a/OpenCL/m01000_a3-pure.cl
+++ b/OpenCL/m01000_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md4.cl"
diff --git a/OpenCL/m01100_a0-optimized.cl b/OpenCL/m01100_a0-optimized.cl
index bd0b99fe9..ba7a08b55 100644
--- a/OpenCL/m01100_a0-optimized.cl
+++ b/OpenCL/m01100_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
@@ -42,7 +43,7 @@ KERNEL_FQ void m01100_m04 (KERN_ATTR_RULES ())
    * salt
    */
 
-  LOCAL_AS salt_t s_salt_buf[1];
+  LOCAL_VK salt_t s_salt_buf[1];
 
   if (lid == 0)
   {
@@ -51,7 +52,7 @@ KERNEL_FQ void m01100_m04 (KERN_ATTR_RULES ())
     s_salt_buf[0].salt_buf[10] = (16 + s_salt_buf[0].salt_len) * 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -261,7 +262,7 @@ KERNEL_FQ void m01100_s04 (KERN_ATTR_RULES ())
    * salt
    */
 
-  LOCAL_AS salt_t s_salt_buf[1];
+  LOCAL_VK salt_t s_salt_buf[1];
 
   if (lid == 0)
   {
@@ -270,7 +271,7 @@ KERNEL_FQ void m01100_s04 (KERN_ATTR_RULES ())
     s_salt_buf[0].salt_buf[10] = (16 + s_salt_buf[0].salt_len) * 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m01100_a0-pure.cl b/OpenCL/m01100_a0-pure.cl
index 311bf0a12..f7465d7f3 100644
--- a/OpenCL/m01100_a0-pure.cl
+++ b/OpenCL/m01100_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
diff --git a/OpenCL/m01100_a1-optimized.cl b/OpenCL/m01100_a1-optimized.cl
index 64a69d4b1..466e3ff30 100644
--- a/OpenCL/m01100_a1-optimized.cl
+++ b/OpenCL/m01100_a1-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md4.cl"
@@ -40,7 +41,7 @@ KERNEL_FQ void m01100_m04 (KERN_ATTR_BASIC ())
    * salt
    */
 
-  LOCAL_AS salt_t s_salt_buf[1];
+  LOCAL_VK salt_t s_salt_buf[1];
 
   if (lid == 0)
   {
@@ -49,7 +50,7 @@ KERNEL_FQ void m01100_m04 (KERN_ATTR_BASIC ())
     s_salt_buf[0].salt_buf[10] = (16 + s_salt_buf[0].salt_len) * 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -321,7 +322,7 @@ KERNEL_FQ void m01100_s04 (KERN_ATTR_BASIC ())
    * salt
    */
 
-  LOCAL_AS salt_t s_salt_buf[1];
+  LOCAL_VK salt_t s_salt_buf[1];
 
   if (lid == 0)
   {
@@ -330,7 +331,7 @@ KERNEL_FQ void m01100_s04 (KERN_ATTR_BASIC ())
     s_salt_buf[0].salt_buf[10] = (16 + s_salt_buf[0].salt_len) * 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m01100_a1-pure.cl b/OpenCL/m01100_a1-pure.cl
index d7d218ad6..ca4fad103 100644
--- a/OpenCL/m01100_a1-pure.cl
+++ b/OpenCL/m01100_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_md4.cl"
diff --git a/OpenCL/m01100_a3-optimized.cl b/OpenCL/m01100_a3-optimized.cl
index 73d1b826d..a7da9f741 100644
--- a/OpenCL/m01100_a3-optimized.cl
+++ b/OpenCL/m01100_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md4.cl"
@@ -532,7 +533,7 @@ KERNEL_FQ void m01100_m04 (KERN_ATTR_VECTOR ())
    * salt
    */
 
-  LOCAL_AS salt_t s_salt_buf[1];
+  LOCAL_VK salt_t s_salt_buf[1];
 
   if (lid == 0)
   {
@@ -541,7 +542,7 @@ KERNEL_FQ void m01100_m04 (KERN_ATTR_VECTOR ())
     s_salt_buf[0].salt_buf[10] = (16 + s_salt_buf[0].salt_len) * 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -586,7 +587,7 @@ KERNEL_FQ void m01100_m08 (KERN_ATTR_VECTOR ())
    * salt
    */
 
-  LOCAL_AS salt_t s_salt_buf[1];
+  LOCAL_VK salt_t s_salt_buf[1];
 
   if (lid == 0)
   {
@@ -595,7 +596,7 @@ KERNEL_FQ void m01100_m08 (KERN_ATTR_VECTOR ())
     s_salt_buf[0].salt_buf[10] = (16 + s_salt_buf[0].salt_len) * 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -640,7 +641,7 @@ KERNEL_FQ void m01100_m16 (KERN_ATTR_VECTOR ())
    * salt
    */
 
-  LOCAL_AS salt_t s_salt_buf[1];
+  LOCAL_VK salt_t s_salt_buf[1];
 
   if (lid == 0)
   {
@@ -649,7 +650,7 @@ KERNEL_FQ void m01100_m16 (KERN_ATTR_VECTOR ())
     s_salt_buf[0].salt_buf[10] = (16 + s_salt_buf[0].salt_len) * 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -694,7 +695,7 @@ KERNEL_FQ void m01100_s04 (KERN_ATTR_VECTOR ())
    * salt
    */
 
-  LOCAL_AS salt_t s_salt_buf[1];
+  LOCAL_VK salt_t s_salt_buf[1];
 
   if (lid == 0)
   {
@@ -703,7 +704,7 @@ KERNEL_FQ void m01100_s04 (KERN_ATTR_VECTOR ())
     s_salt_buf[0].salt_buf[10] = (16 + s_salt_buf[0].salt_len) * 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -748,7 +749,7 @@ KERNEL_FQ void m01100_s08 (KERN_ATTR_VECTOR ())
    * salt
    */
 
-  LOCAL_AS salt_t s_salt_buf[1];
+  LOCAL_VK salt_t s_salt_buf[1];
 
   if (lid == 0)
   {
@@ -757,7 +758,7 @@ KERNEL_FQ void m01100_s08 (KERN_ATTR_VECTOR ())
     s_salt_buf[0].salt_buf[10] = (16 + s_salt_buf[0].salt_len) * 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -802,7 +803,7 @@ KERNEL_FQ void m01100_s16 (KERN_ATTR_VECTOR ())
    * salt
    */
 
-  LOCAL_AS salt_t s_salt_buf[1];
+  LOCAL_VK salt_t s_salt_buf[1];
 
   if (lid == 0)
   {
@@ -811,7 +812,7 @@ KERNEL_FQ void m01100_s16 (KERN_ATTR_VECTOR ())
     s_salt_buf[0].salt_buf[10] = (16 + s_salt_buf[0].salt_len) * 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m01100_a3-pure.cl b/OpenCL/m01100_a3-pure.cl
index 155828ab5..f55770378 100644
--- a/OpenCL/m01100_a3-pure.cl
+++ b/OpenCL/m01100_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md4.cl"
diff --git a/OpenCL/m01300_a0-optimized.cl b/OpenCL/m01300_a0-optimized.cl
index 9b19415d1..90fb84b27 100644
--- a/OpenCL/m01300_a0-optimized.cl
+++ b/OpenCL/m01300_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
diff --git a/OpenCL/m01300_a0-pure.cl b/OpenCL/m01300_a0-pure.cl
index c837946b9..54ef39265 100644
--- a/OpenCL/m01300_a0-pure.cl
+++ b/OpenCL/m01300_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
diff --git a/OpenCL/m01300_a1-optimized.cl b/OpenCL/m01300_a1-optimized.cl
index 090c07fe8..fedc61b6d 100644
--- a/OpenCL/m01300_a1-optimized.cl
+++ b/OpenCL/m01300_a1-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha224.cl"
diff --git a/OpenCL/m01300_a1-pure.cl b/OpenCL/m01300_a1-pure.cl
index 54f9ecbf2..3a22e5f8a 100644
--- a/OpenCL/m01300_a1-pure.cl
+++ b/OpenCL/m01300_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_sha224.cl"
diff --git a/OpenCL/m01300_a3-optimized.cl b/OpenCL/m01300_a3-optimized.cl
index 94e7dadf1..23b8385f5 100644
--- a/OpenCL/m01300_a3-optimized.cl
+++ b/OpenCL/m01300_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha224.cl"
diff --git a/OpenCL/m01300_a3-pure.cl b/OpenCL/m01300_a3-pure.cl
index c5c54972a..1fdc37fdd 100644
--- a/OpenCL/m01300_a3-pure.cl
+++ b/OpenCL/m01300_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha224.cl"
diff --git a/OpenCL/m01400_a0-optimized.cl b/OpenCL/m01400_a0-optimized.cl
index 047b4765f..6b1dc826b 100644
--- a/OpenCL/m01400_a0-optimized.cl
+++ b/OpenCL/m01400_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
diff --git a/OpenCL/m01400_a0-pure.cl b/OpenCL/m01400_a0-pure.cl
index f591ac0a6..367430536 100644
--- a/OpenCL/m01400_a0-pure.cl
+++ b/OpenCL/m01400_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
diff --git a/OpenCL/m01400_a1-optimized.cl b/OpenCL/m01400_a1-optimized.cl
index 15cfde5c5..ca0f0d639 100644
--- a/OpenCL/m01400_a1-optimized.cl
+++ b/OpenCL/m01400_a1-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha256.cl"
diff --git a/OpenCL/m01400_a1-pure.cl b/OpenCL/m01400_a1-pure.cl
index 2f29bfeaa..4489e8a8b 100644
--- a/OpenCL/m01400_a1-pure.cl
+++ b/OpenCL/m01400_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_sha256.cl"
diff --git a/OpenCL/m01400_a3-optimized.cl b/OpenCL/m01400_a3-optimized.cl
index e2a3694f4..48aa24f8a 100644
--- a/OpenCL/m01400_a3-optimized.cl
+++ b/OpenCL/m01400_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha256.cl"
diff --git a/OpenCL/m01400_a3-pure.cl b/OpenCL/m01400_a3-pure.cl
index 50ae4b91e..86d97ba44 100644
--- a/OpenCL/m01400_a3-pure.cl
+++ b/OpenCL/m01400_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha256.cl"
diff --git a/OpenCL/m01410_a0-optimized.cl b/OpenCL/m01410_a0-optimized.cl
index 6fae8211a..00e551dd5 100644
--- a/OpenCL/m01410_a0-optimized.cl
+++ b/OpenCL/m01410_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
diff --git a/OpenCL/m01410_a0-pure.cl b/OpenCL/m01410_a0-pure.cl
index 852574eec..bd91db24a 100644
--- a/OpenCL/m01410_a0-pure.cl
+++ b/OpenCL/m01410_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
diff --git a/OpenCL/m01410_a1-optimized.cl b/OpenCL/m01410_a1-optimized.cl
index 757bc6cb0..165e953b0 100644
--- a/OpenCL/m01410_a1-optimized.cl
+++ b/OpenCL/m01410_a1-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha256.cl"
diff --git a/OpenCL/m01410_a1-pure.cl b/OpenCL/m01410_a1-pure.cl
index 4ca539c91..89772c07b 100644
--- a/OpenCL/m01410_a1-pure.cl
+++ b/OpenCL/m01410_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_sha256.cl"
diff --git a/OpenCL/m01410_a3-optimized.cl b/OpenCL/m01410_a3-optimized.cl
index 69b680bd9..65752388c 100644
--- a/OpenCL/m01410_a3-optimized.cl
+++ b/OpenCL/m01410_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha256.cl"
diff --git a/OpenCL/m01410_a3-pure.cl b/OpenCL/m01410_a3-pure.cl
index ec5c0fd00..a362f0f06 100644
--- a/OpenCL/m01410_a3-pure.cl
+++ b/OpenCL/m01410_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha256.cl"
diff --git a/OpenCL/m01420_a0-optimized.cl b/OpenCL/m01420_a0-optimized.cl
index 154263591..a506c1d8f 100644
--- a/OpenCL/m01420_a0-optimized.cl
+++ b/OpenCL/m01420_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
diff --git a/OpenCL/m01420_a0-pure.cl b/OpenCL/m01420_a0-pure.cl
index 975487ae7..eac257dde 100644
--- a/OpenCL/m01420_a0-pure.cl
+++ b/OpenCL/m01420_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
diff --git a/OpenCL/m01420_a1-optimized.cl b/OpenCL/m01420_a1-optimized.cl
index 43030b471..91fa89196 100644
--- a/OpenCL/m01420_a1-optimized.cl
+++ b/OpenCL/m01420_a1-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha256.cl"
diff --git a/OpenCL/m01420_a1-pure.cl b/OpenCL/m01420_a1-pure.cl
index 6a0b7747b..bd50b619c 100644
--- a/OpenCL/m01420_a1-pure.cl
+++ b/OpenCL/m01420_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_sha256.cl"
diff --git a/OpenCL/m01420_a3-optimized.cl b/OpenCL/m01420_a3-optimized.cl
index 3915e0d7f..b19838862 100644
--- a/OpenCL/m01420_a3-optimized.cl
+++ b/OpenCL/m01420_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha256.cl"
diff --git a/OpenCL/m01420_a3-pure.cl b/OpenCL/m01420_a3-pure.cl
index 197729d3a..b8b7d21c0 100644
--- a/OpenCL/m01420_a3-pure.cl
+++ b/OpenCL/m01420_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha256.cl"
diff --git a/OpenCL/m01430_a0-optimized.cl b/OpenCL/m01430_a0-optimized.cl
index 0839a7e9e..61077c22e 100644
--- a/OpenCL/m01430_a0-optimized.cl
+++ b/OpenCL/m01430_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
diff --git a/OpenCL/m01430_a0-pure.cl b/OpenCL/m01430_a0-pure.cl
index 4ed0a82e5..af950d387 100644
--- a/OpenCL/m01430_a0-pure.cl
+++ b/OpenCL/m01430_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
diff --git a/OpenCL/m01430_a1-optimized.cl b/OpenCL/m01430_a1-optimized.cl
index 70e11482f..4b1c38642 100644
--- a/OpenCL/m01430_a1-optimized.cl
+++ b/OpenCL/m01430_a1-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha256.cl"
diff --git a/OpenCL/m01430_a1-pure.cl b/OpenCL/m01430_a1-pure.cl
index 3b5db7ce1..b4597fd62 100644
--- a/OpenCL/m01430_a1-pure.cl
+++ b/OpenCL/m01430_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_sha256.cl"
diff --git a/OpenCL/m01430_a3-optimized.cl b/OpenCL/m01430_a3-optimized.cl
index 7122b44b2..8476d60d2 100644
--- a/OpenCL/m01430_a3-optimized.cl
+++ b/OpenCL/m01430_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha256.cl"
diff --git a/OpenCL/m01430_a3-pure.cl b/OpenCL/m01430_a3-pure.cl
index 3827a7ca7..7ed56551b 100644
--- a/OpenCL/m01430_a3-pure.cl
+++ b/OpenCL/m01430_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha256.cl"
diff --git a/OpenCL/m01440_a0-optimized.cl b/OpenCL/m01440_a0-optimized.cl
index 33383622b..23e81cee4 100644
--- a/OpenCL/m01440_a0-optimized.cl
+++ b/OpenCL/m01440_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
diff --git a/OpenCL/m01440_a0-pure.cl b/OpenCL/m01440_a0-pure.cl
index 1140dd0b3..d602fb4ba 100644
--- a/OpenCL/m01440_a0-pure.cl
+++ b/OpenCL/m01440_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
diff --git a/OpenCL/m01440_a1-optimized.cl b/OpenCL/m01440_a1-optimized.cl
index d82c59e7f..90b1f369d 100644
--- a/OpenCL/m01440_a1-optimized.cl
+++ b/OpenCL/m01440_a1-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha256.cl"
diff --git a/OpenCL/m01440_a1-pure.cl b/OpenCL/m01440_a1-pure.cl
index 31b96ac18..7277bfa91 100644
--- a/OpenCL/m01440_a1-pure.cl
+++ b/OpenCL/m01440_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_sha256.cl"
diff --git a/OpenCL/m01440_a3-optimized.cl b/OpenCL/m01440_a3-optimized.cl
index f41486ab4..ce9719a32 100644
--- a/OpenCL/m01440_a3-optimized.cl
+++ b/OpenCL/m01440_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha256.cl"
diff --git a/OpenCL/m01440_a3-pure.cl b/OpenCL/m01440_a3-pure.cl
index 91208330e..3aad0e874 100644
--- a/OpenCL/m01440_a3-pure.cl
+++ b/OpenCL/m01440_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha256.cl"
diff --git a/OpenCL/m01450_a0-optimized.cl b/OpenCL/m01450_a0-optimized.cl
index fb0a46c6d..c3716f329 100644
--- a/OpenCL/m01450_a0-optimized.cl
+++ b/OpenCL/m01450_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
diff --git a/OpenCL/m01450_a0-pure.cl b/OpenCL/m01450_a0-pure.cl
index 6febabf09..036062aa0 100644
--- a/OpenCL/m01450_a0-pure.cl
+++ b/OpenCL/m01450_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
diff --git a/OpenCL/m01450_a1-optimized.cl b/OpenCL/m01450_a1-optimized.cl
index 32c27e05a..1ff4e577f 100644
--- a/OpenCL/m01450_a1-optimized.cl
+++ b/OpenCL/m01450_a1-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha256.cl"
diff --git a/OpenCL/m01450_a1-pure.cl b/OpenCL/m01450_a1-pure.cl
index 2dc862f8e..5519e2ce7 100644
--- a/OpenCL/m01450_a1-pure.cl
+++ b/OpenCL/m01450_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_sha256.cl"
diff --git a/OpenCL/m01450_a3-optimized.cl b/OpenCL/m01450_a3-optimized.cl
index 20e2730c3..039ea3f5d 100644
--- a/OpenCL/m01450_a3-optimized.cl
+++ b/OpenCL/m01450_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha256.cl"
diff --git a/OpenCL/m01450_a3-pure.cl b/OpenCL/m01450_a3-pure.cl
index 7a73dfacd..2c60c55d2 100644
--- a/OpenCL/m01450_a3-pure.cl
+++ b/OpenCL/m01450_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha256.cl"
diff --git a/OpenCL/m01460_a0-optimized.cl b/OpenCL/m01460_a0-optimized.cl
index 709688c82..aa4552081 100644
--- a/OpenCL/m01460_a0-optimized.cl
+++ b/OpenCL/m01460_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
diff --git a/OpenCL/m01460_a0-pure.cl b/OpenCL/m01460_a0-pure.cl
index c6e2abad6..431d999ad 100644
--- a/OpenCL/m01460_a0-pure.cl
+++ b/OpenCL/m01460_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
diff --git a/OpenCL/m01460_a1-optimized.cl b/OpenCL/m01460_a1-optimized.cl
index 3edbecca7..6ab70a48a 100644
--- a/OpenCL/m01460_a1-optimized.cl
+++ b/OpenCL/m01460_a1-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha256.cl"
diff --git a/OpenCL/m01460_a1-pure.cl b/OpenCL/m01460_a1-pure.cl
index b40516132..970130141 100644
--- a/OpenCL/m01460_a1-pure.cl
+++ b/OpenCL/m01460_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_sha256.cl"
diff --git a/OpenCL/m01460_a3-optimized.cl b/OpenCL/m01460_a3-optimized.cl
index 67121ae53..07cf8a797 100644
--- a/OpenCL/m01460_a3-optimized.cl
+++ b/OpenCL/m01460_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha256.cl"
diff --git a/OpenCL/m01460_a3-pure.cl b/OpenCL/m01460_a3-pure.cl
index 01deffbae..70b497159 100644
--- a/OpenCL/m01460_a3-pure.cl
+++ b/OpenCL/m01460_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha256.cl"
diff --git a/OpenCL/m01500_a0-pure.cl b/OpenCL/m01500_a0-pure.cl
index f3a4d5913..2b7b234a7 100644
--- a/OpenCL/m01500_a0-pure.cl
+++ b/OpenCL/m01500_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
@@ -34,7 +35,7 @@
   a  = a ^ tt;              \
 }
 
-CONSTANT_AS u32a c_SPtrans[8][64] =
+CONSTANT_VK u32a c_SPtrans[8][64] =
 {
   {
     0x00820200, 0x00020000, 0x80800000, 0x80820200,
@@ -182,7 +183,7 @@ CONSTANT_AS u32a c_SPtrans[8][64] =
   },
 };
 
-CONSTANT_AS u32a c_skb[8][64] =
+CONSTANT_VK u32a c_skb[8][64] =
 {
   {
     0x00000000, 0x00000010, 0x20000000, 0x20000010,
@@ -333,13 +334,13 @@ CONSTANT_AS u32a c_skb[8][64] =
 #if   VECT_SIZE == 1
 #define BOX(i,n,S) (S)[(n)][(i)]
 #elif VECT_SIZE == 2
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1])
 #elif VECT_SIZE == 4
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3])
 #elif VECT_SIZE == 8
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7])
 #elif VECT_SIZE == 16
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf])
 #endif
 
 DECLSPEC void _des_crypt_keysetup (u32 c, u32x d, u32x *Kc, u32x *Kd, LOCAL_AS u32 (*s_skb)[64])
@@ -495,8 +496,8 @@ KERNEL_FQ void m01500_mxx (KERN_ATTR_RULES ())
    * sbox, kbox
    */
 
-  LOCAL_AS u32 s_SPtrans[8][64];
-  LOCAL_AS u32 s_skb[8][64];
+  LOCAL_VK u32 s_SPtrans[8][64];
+  LOCAL_VK u32 s_skb[8][64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
@@ -519,7 +520,7 @@ KERNEL_FQ void m01500_mxx (KERN_ATTR_RULES ())
     s_skb[7][i] = c_skb[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -579,8 +580,8 @@ KERNEL_FQ void m01500_sxx (KERN_ATTR_RULES ())
    * sbox, kbox
    */
 
-  LOCAL_AS u32 s_SPtrans[8][64];
-  LOCAL_AS u32 s_skb[8][64];
+  LOCAL_VK u32 s_SPtrans[8][64];
+  LOCAL_VK u32 s_skb[8][64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
@@ -603,7 +604,7 @@ KERNEL_FQ void m01500_sxx (KERN_ATTR_RULES ())
     s_skb[7][i] = c_skb[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m01500_a1-pure.cl b/OpenCL/m01500_a1-pure.cl
index 88a06b749..e7ee552b8 100644
--- a/OpenCL/m01500_a1-pure.cl
+++ b/OpenCL/m01500_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #endif
@@ -32,7 +33,7 @@
   a  = a ^ tt;              \
 }
 
-CONSTANT_AS u32a c_SPtrans[8][64] =
+CONSTANT_VK u32a c_SPtrans[8][64] =
 {
   {
     0x00820200, 0x00020000, 0x80800000, 0x80820200,
@@ -180,7 +181,7 @@ CONSTANT_AS u32a c_SPtrans[8][64] =
   },
 };
 
-CONSTANT_AS u32a c_skb[8][64] =
+CONSTANT_VK u32a c_skb[8][64] =
 {
   {
     0x00000000, 0x00000010, 0x20000000, 0x20000010,
@@ -331,13 +332,13 @@ CONSTANT_AS u32a c_skb[8][64] =
 #if   VECT_SIZE == 1
 #define BOX(i,n,S) (S)[(n)][(i)]
 #elif VECT_SIZE == 2
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1])
 #elif VECT_SIZE == 4
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3])
 #elif VECT_SIZE == 8
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7])
 #elif VECT_SIZE == 16
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf])
 #endif
 
 DECLSPEC void _des_crypt_keysetup (u32 c, u32x d, u32x *Kc, u32x *Kd, LOCAL_AS u32 (*s_skb)[64])
@@ -493,8 +494,8 @@ KERNEL_FQ void m01500_mxx (KERN_ATTR_BASIC ())
    * sbox, kbox
    */
 
-  LOCAL_AS u32 s_SPtrans[8][64];
-  LOCAL_AS u32 s_skb[8][64];
+  LOCAL_VK u32 s_SPtrans[8][64];
+  LOCAL_VK u32 s_skb[8][64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
@@ -517,7 +518,7 @@ KERNEL_FQ void m01500_mxx (KERN_ATTR_BASIC ())
     s_skb[7][i] = c_skb[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -656,8 +657,8 @@ KERNEL_FQ void m01500_sxx (KERN_ATTR_BASIC ())
    * sbox, kbox
    */
 
-  LOCAL_AS u32 s_SPtrans[8][64];
-  LOCAL_AS u32 s_skb[8][64];
+  LOCAL_VK u32 s_SPtrans[8][64];
+  LOCAL_VK u32 s_skb[8][64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
@@ -680,7 +681,7 @@ KERNEL_FQ void m01500_sxx (KERN_ATTR_BASIC ())
     s_skb[7][i] = c_skb[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m01500_a3-pure.cl b/OpenCL/m01500_a3-pure.cl
index ce3cebf22..ca612828d 100644
--- a/OpenCL/m01500_a3-pure.cl
+++ b/OpenCL/m01500_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #endif
 
@@ -1885,7 +1886,7 @@ DECLSPEC void transpose32c (u32 *data)
 // transpose bitslice mod : attention race conditions, need different buffers for *in and *out
 //
 
-KERNEL_FQ void m01500_tm (GLOBAL_AS u32 *mod, GLOBAL_AS bs_word_t *words_buf_r)
+KERNEL_FQ void m01500_tm (GLOBAL_AS u32 *mod, GLOBAL_AS bs_word_t *words_buf_b)
 {
   const u64 gid = get_global_id (0);
 
@@ -1901,13 +1902,13 @@ KERNEL_FQ void m01500_tm (GLOBAL_AS u32 *mod, GLOBAL_AS bs_word_t *words_buf_r)
   #endif
   for (int i = 0, j = 0; i < 32; i += 8, j += 7)
   {
-    atomic_or (&words_buf_r[block].b[j + 0], (((w0s >> (i + 7)) & 1) << slice));
-    atomic_or (&words_buf_r[block].b[j + 1], (((w0s >> (i + 6)) & 1) << slice));
-    atomic_or (&words_buf_r[block].b[j + 2], (((w0s >> (i + 5)) & 1) << slice));
-    atomic_or (&words_buf_r[block].b[j + 3], (((w0s >> (i + 4)) & 1) << slice));
-    atomic_or (&words_buf_r[block].b[j + 4], (((w0s >> (i + 3)) & 1) << slice));
-    atomic_or (&words_buf_r[block].b[j + 5], (((w0s >> (i + 2)) & 1) << slice));
-    atomic_or (&words_buf_r[block].b[j + 6], (((w0s >> (i + 1)) & 1) << slice));
+    atomic_or (&words_buf_b[block].b[j + 0], (((w0s >> (i + 7)) & 1) << slice));
+    atomic_or (&words_buf_b[block].b[j + 1], (((w0s >> (i + 6)) & 1) << slice));
+    atomic_or (&words_buf_b[block].b[j + 2], (((w0s >> (i + 5)) & 1) << slice));
+    atomic_or (&words_buf_b[block].b[j + 3], (((w0s >> (i + 4)) & 1) << slice));
+    atomic_or (&words_buf_b[block].b[j + 4], (((w0s >> (i + 3)) & 1) << slice));
+    atomic_or (&words_buf_b[block].b[j + 5], (((w0s >> (i + 2)) & 1) << slice));
+    atomic_or (&words_buf_b[block].b[j + 6], (((w0s >> (i + 1)) & 1) << slice));
   }
 }
 
@@ -1997,7 +1998,11 @@ KERNEL_FQ void m01500_mxx (KERN_ATTR_BITSLICE ())
    * inner loop
    */
 
+  #ifdef IS_CUDA
+  const u32 pc_pos = (blockIdx.y * blockDim.y) + threadIdx.y;
+  #else
   const u32 pc_pos = get_global_id (1);
+  #endif
 
   const u32 il_pos = pc_pos * 32;
 
@@ -2030,34 +2035,34 @@ KERNEL_FQ void m01500_mxx (KERN_ATTR_BITSLICE ())
   u32 k26 = K26;
   u32 k27 = K27;
 
-  k00 |= words_buf_r[pc_pos].b[ 0];
-  k01 |= words_buf_r[pc_pos].b[ 1];
-  k02 |= words_buf_r[pc_pos].b[ 2];
-  k03 |= words_buf_r[pc_pos].b[ 3];
-  k04 |= words_buf_r[pc_pos].b[ 4];
-  k05 |= words_buf_r[pc_pos].b[ 5];
-  k06 |= words_buf_r[pc_pos].b[ 6];
-  k07 |= words_buf_r[pc_pos].b[ 7];
-  k08 |= words_buf_r[pc_pos].b[ 8];
-  k09 |= words_buf_r[pc_pos].b[ 9];
-  k10 |= words_buf_r[pc_pos].b[10];
-  k11 |= words_buf_r[pc_pos].b[11];
-  k12 |= words_buf_r[pc_pos].b[12];
-  k13 |= words_buf_r[pc_pos].b[13];
-  k14 |= words_buf_r[pc_pos].b[14];
-  k15 |= words_buf_r[pc_pos].b[15];
-  k16 |= words_buf_r[pc_pos].b[16];
-  k17 |= words_buf_r[pc_pos].b[17];
-  k18 |= words_buf_r[pc_pos].b[18];
-  k19 |= words_buf_r[pc_pos].b[19];
-  k20 |= words_buf_r[pc_pos].b[20];
-  k21 |= words_buf_r[pc_pos].b[21];
-  k22 |= words_buf_r[pc_pos].b[22];
-  k23 |= words_buf_r[pc_pos].b[23];
-  k24 |= words_buf_r[pc_pos].b[24];
-  k25 |= words_buf_r[pc_pos].b[25];
-  k26 |= words_buf_r[pc_pos].b[26];
-  k27 |= words_buf_r[pc_pos].b[27];
+  k00 |= words_buf_s[pc_pos].b[ 0];
+  k01 |= words_buf_s[pc_pos].b[ 1];
+  k02 |= words_buf_s[pc_pos].b[ 2];
+  k03 |= words_buf_s[pc_pos].b[ 3];
+  k04 |= words_buf_s[pc_pos].b[ 4];
+  k05 |= words_buf_s[pc_pos].b[ 5];
+  k06 |= words_buf_s[pc_pos].b[ 6];
+  k07 |= words_buf_s[pc_pos].b[ 7];
+  k08 |= words_buf_s[pc_pos].b[ 8];
+  k09 |= words_buf_s[pc_pos].b[ 9];
+  k10 |= words_buf_s[pc_pos].b[10];
+  k11 |= words_buf_s[pc_pos].b[11];
+  k12 |= words_buf_s[pc_pos].b[12];
+  k13 |= words_buf_s[pc_pos].b[13];
+  k14 |= words_buf_s[pc_pos].b[14];
+  k15 |= words_buf_s[pc_pos].b[15];
+  k16 |= words_buf_s[pc_pos].b[16];
+  k17 |= words_buf_s[pc_pos].b[17];
+  k18 |= words_buf_s[pc_pos].b[18];
+  k19 |= words_buf_s[pc_pos].b[19];
+  k20 |= words_buf_s[pc_pos].b[20];
+  k21 |= words_buf_s[pc_pos].b[21];
+  k22 |= words_buf_s[pc_pos].b[22];
+  k23 |= words_buf_s[pc_pos].b[23];
+  k24 |= words_buf_s[pc_pos].b[24];
+  k25 |= words_buf_s[pc_pos].b[25];
+  k26 |= words_buf_s[pc_pos].b[26];
+  k27 |= words_buf_s[pc_pos].b[27];
 
   u32 D00 = 0;
   u32 D01 = 0;
@@ -2445,7 +2450,11 @@ KERNEL_FQ void m01500_sxx (KERN_ATTR_BITSLICE ())
    * inner loop
    */
 
+  #ifdef IS_CUDA
+  const u32 pc_pos = (blockIdx.y * blockDim.y) + threadIdx.y;
+  #else
   const u32 pc_pos = get_global_id (1);
+  #endif
 
   const u32 il_pos = pc_pos * 32;
 
@@ -2478,34 +2487,34 @@ KERNEL_FQ void m01500_sxx (KERN_ATTR_BITSLICE ())
   u32 k26 = K26;
   u32 k27 = K27;
 
-  k00 |= words_buf_r[pc_pos].b[ 0];
-  k01 |= words_buf_r[pc_pos].b[ 1];
-  k02 |= words_buf_r[pc_pos].b[ 2];
-  k03 |= words_buf_r[pc_pos].b[ 3];
-  k04 |= words_buf_r[pc_pos].b[ 4];
-  k05 |= words_buf_r[pc_pos].b[ 5];
-  k06 |= words_buf_r[pc_pos].b[ 6];
-  k07 |= words_buf_r[pc_pos].b[ 7];
-  k08 |= words_buf_r[pc_pos].b[ 8];
-  k09 |= words_buf_r[pc_pos].b[ 9];
-  k10 |= words_buf_r[pc_pos].b[10];
-  k11 |= words_buf_r[pc_pos].b[11];
-  k12 |= words_buf_r[pc_pos].b[12];
-  k13 |= words_buf_r[pc_pos].b[13];
-  k14 |= words_buf_r[pc_pos].b[14];
-  k15 |= words_buf_r[pc_pos].b[15];
-  k16 |= words_buf_r[pc_pos].b[16];
-  k17 |= words_buf_r[pc_pos].b[17];
-  k18 |= words_buf_r[pc_pos].b[18];
-  k19 |= words_buf_r[pc_pos].b[19];
-  k20 |= words_buf_r[pc_pos].b[20];
-  k21 |= words_buf_r[pc_pos].b[21];
-  k22 |= words_buf_r[pc_pos].b[22];
-  k23 |= words_buf_r[pc_pos].b[23];
-  k24 |= words_buf_r[pc_pos].b[24];
-  k25 |= words_buf_r[pc_pos].b[25];
-  k26 |= words_buf_r[pc_pos].b[26];
-  k27 |= words_buf_r[pc_pos].b[27];
+  k00 |= words_buf_s[pc_pos].b[ 0];
+  k01 |= words_buf_s[pc_pos].b[ 1];
+  k02 |= words_buf_s[pc_pos].b[ 2];
+  k03 |= words_buf_s[pc_pos].b[ 3];
+  k04 |= words_buf_s[pc_pos].b[ 4];
+  k05 |= words_buf_s[pc_pos].b[ 5];
+  k06 |= words_buf_s[pc_pos].b[ 6];
+  k07 |= words_buf_s[pc_pos].b[ 7];
+  k08 |= words_buf_s[pc_pos].b[ 8];
+  k09 |= words_buf_s[pc_pos].b[ 9];
+  k10 |= words_buf_s[pc_pos].b[10];
+  k11 |= words_buf_s[pc_pos].b[11];
+  k12 |= words_buf_s[pc_pos].b[12];
+  k13 |= words_buf_s[pc_pos].b[13];
+  k14 |= words_buf_s[pc_pos].b[14];
+  k15 |= words_buf_s[pc_pos].b[15];
+  k16 |= words_buf_s[pc_pos].b[16];
+  k17 |= words_buf_s[pc_pos].b[17];
+  k18 |= words_buf_s[pc_pos].b[18];
+  k19 |= words_buf_s[pc_pos].b[19];
+  k20 |= words_buf_s[pc_pos].b[20];
+  k21 |= words_buf_s[pc_pos].b[21];
+  k22 |= words_buf_s[pc_pos].b[22];
+  k23 |= words_buf_s[pc_pos].b[23];
+  k24 |= words_buf_s[pc_pos].b[24];
+  k25 |= words_buf_s[pc_pos].b[25];
+  k26 |= words_buf_s[pc_pos].b[26];
+  k27 |= words_buf_s[pc_pos].b[27];
 
   u32 D00 = 0;
   u32 D01 = 0;
diff --git a/OpenCL/m01600-optimized.cl b/OpenCL/m01600-optimized.cl
index 66dbd88b7..e1110dc00 100644
--- a/OpenCL/m01600-optimized.cl
+++ b/OpenCL/m01600-optimized.cl
@@ -6,6 +6,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_hash_md5.cl"
 #endif
diff --git a/OpenCL/m01600-pure.cl b/OpenCL/m01600-pure.cl
index 129ff6ab2..af5972c7a 100644
--- a/OpenCL/m01600-pure.cl
+++ b/OpenCL/m01600-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_hash_md5.cl"
 #endif
diff --git a/OpenCL/m01700_a0-optimized.cl b/OpenCL/m01700_a0-optimized.cl
index 695c97f08..0d6ddb337 100644
--- a/OpenCL/m01700_a0-optimized.cl
+++ b/OpenCL/m01700_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
@@ -85,6 +86,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
 
   ROUND_STEP (0);
 
+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  ROUND_EXPAND (); ROUND_STEP (64);
+  #else
   #ifdef _unroll
   #pragma unroll
   #endif
@@ -92,6 +99,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
   {
     ROUND_EXPAND (); ROUND_STEP (i);
   }
+  #endif
 
   /* rev
   digest[0] += a;
diff --git a/OpenCL/m01700_a0-pure.cl b/OpenCL/m01700_a0-pure.cl
index d6a1f5323..0e4aba9b7 100644
--- a/OpenCL/m01700_a0-pure.cl
+++ b/OpenCL/m01700_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
diff --git a/OpenCL/m01700_a1-optimized.cl b/OpenCL/m01700_a1-optimized.cl
index 3b42628f8..abee4dfcb 100644
--- a/OpenCL/m01700_a1-optimized.cl
+++ b/OpenCL/m01700_a1-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha512.cl"
@@ -83,6 +84,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
 
   ROUND_STEP (0);
 
+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  ROUND_EXPAND (); ROUND_STEP (64);
+  #else
   #ifdef _unroll
   #pragma unroll
   #endif
@@ -90,6 +97,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
   {
     ROUND_EXPAND (); ROUND_STEP (i);
   }
+  #endif
 
   /* rev
   digest[0] += a;
diff --git a/OpenCL/m01700_a1-pure.cl b/OpenCL/m01700_a1-pure.cl
index 0a5b46d6d..e3286cce8 100644
--- a/OpenCL/m01700_a1-pure.cl
+++ b/OpenCL/m01700_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_sha512.cl"
diff --git a/OpenCL/m01700_a3-optimized.cl b/OpenCL/m01700_a3-optimized.cl
index e56c00969..c4d8ee016 100644
--- a/OpenCL/m01700_a3-optimized.cl
+++ b/OpenCL/m01700_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha512.cl"
@@ -83,6 +84,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
 
   ROUND_STEP (0);
 
+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  ROUND_EXPAND (); ROUND_STEP (64);
+  #else
   #ifdef _unroll
   #pragma unroll
   #endif
@@ -90,6 +97,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
   {
     ROUND_EXPAND (); ROUND_STEP (i);
   }
+  #endif
 
   /* rev
   digest[0] += a;
diff --git a/OpenCL/m01700_a3-pure.cl b/OpenCL/m01700_a3-pure.cl
index 19a23b77c..258aa8765 100644
--- a/OpenCL/m01700_a3-pure.cl
+++ b/OpenCL/m01700_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha512.cl"
diff --git a/OpenCL/m01710_a0-optimized.cl b/OpenCL/m01710_a0-optimized.cl
index 8bc710215..a5a53e831 100644
--- a/OpenCL/m01710_a0-optimized.cl
+++ b/OpenCL/m01710_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
@@ -85,6 +86,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
 
   ROUND_STEP (0);
 
+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  ROUND_EXPAND (); ROUND_STEP (64);
+  #else
   #ifdef _unroll
   #pragma unroll
   #endif
@@ -92,6 +99,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
   {
     ROUND_EXPAND (); ROUND_STEP (i);
   }
+  #endif
 
   /* rev
   digest[0] += a;
diff --git a/OpenCL/m01710_a0-pure.cl b/OpenCL/m01710_a0-pure.cl
index 6d40f022d..1a4f90a2b 100644
--- a/OpenCL/m01710_a0-pure.cl
+++ b/OpenCL/m01710_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
diff --git a/OpenCL/m01710_a1-optimized.cl b/OpenCL/m01710_a1-optimized.cl
index 97997deaf..931142cae 100644
--- a/OpenCL/m01710_a1-optimized.cl
+++ b/OpenCL/m01710_a1-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha512.cl"
@@ -83,6 +84,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
 
   ROUND_STEP (0);
 
+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  ROUND_EXPAND (); ROUND_STEP (64);
+  #else
   #ifdef _unroll
   #pragma unroll
   #endif
@@ -90,6 +97,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
   {
     ROUND_EXPAND (); ROUND_STEP (i);
   }
+  #endif
 
   /* rev
   digest[0] += a;
diff --git a/OpenCL/m01710_a1-pure.cl b/OpenCL/m01710_a1-pure.cl
index aa558a08f..ac14c0b1f 100644
--- a/OpenCL/m01710_a1-pure.cl
+++ b/OpenCL/m01710_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_sha512.cl"
diff --git a/OpenCL/m01710_a3-optimized.cl b/OpenCL/m01710_a3-optimized.cl
index fc88c97b9..a82f949ac 100644
--- a/OpenCL/m01710_a3-optimized.cl
+++ b/OpenCL/m01710_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha512.cl"
@@ -83,6 +84,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
 
   ROUND_STEP (0);
 
+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  ROUND_EXPAND (); ROUND_STEP (64);
+  #else
   #ifdef _unroll
   #pragma unroll
   #endif
@@ -90,6 +97,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
   {
     ROUND_EXPAND (); ROUND_STEP (i);
   }
+  #endif
 
   /* rev
   digest[0] += a;
diff --git a/OpenCL/m01710_a3-pure.cl b/OpenCL/m01710_a3-pure.cl
index cd381f8b1..f275de2fc 100644
--- a/OpenCL/m01710_a3-pure.cl
+++ b/OpenCL/m01710_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha512.cl"
diff --git a/OpenCL/m01720_a0-optimized.cl b/OpenCL/m01720_a0-optimized.cl
index d2b6624a2..c331365f1 100644
--- a/OpenCL/m01720_a0-optimized.cl
+++ b/OpenCL/m01720_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
@@ -85,6 +86,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
 
   ROUND_STEP (0);
 
+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  ROUND_EXPAND (); ROUND_STEP (64);
+  #else
   #ifdef _unroll
   #pragma unroll
   #endif
@@ -92,6 +99,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
   {
     ROUND_EXPAND (); ROUND_STEP (i);
   }
+  #endif
 
   /* rev
   digest[0] += a;
diff --git a/OpenCL/m01720_a0-pure.cl b/OpenCL/m01720_a0-pure.cl
index 8744ac81a..3397c31b6 100644
--- a/OpenCL/m01720_a0-pure.cl
+++ b/OpenCL/m01720_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
diff --git a/OpenCL/m01720_a1-optimized.cl b/OpenCL/m01720_a1-optimized.cl
index db43b2d6f..aa93dc2c9 100644
--- a/OpenCL/m01720_a1-optimized.cl
+++ b/OpenCL/m01720_a1-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha512.cl"
@@ -83,6 +84,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
 
   ROUND_STEP (0);
 
+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  ROUND_EXPAND (); ROUND_STEP (64);
+  #else
   #ifdef _unroll
   #pragma unroll
   #endif
@@ -90,6 +97,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
   {
     ROUND_EXPAND (); ROUND_STEP (i);
   }
+  #endif
 
   /* rev
   digest[0] += a;
diff --git a/OpenCL/m01720_a1-pure.cl b/OpenCL/m01720_a1-pure.cl
index 895635e2d..7ee48823b 100644
--- a/OpenCL/m01720_a1-pure.cl
+++ b/OpenCL/m01720_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_sha512.cl"
diff --git a/OpenCL/m01720_a3-optimized.cl b/OpenCL/m01720_a3-optimized.cl
index 205d6a567..891634dd4 100644
--- a/OpenCL/m01720_a3-optimized.cl
+++ b/OpenCL/m01720_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha512.cl"
@@ -83,6 +84,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
 
   ROUND_STEP (0);
 
+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  ROUND_EXPAND (); ROUND_STEP (64);
+  #else
   #ifdef _unroll
   #pragma unroll
   #endif
@@ -90,6 +97,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
   {
     ROUND_EXPAND (); ROUND_STEP (i);
   }
+  #endif
 
   /* rev
   digest[0] += a;
diff --git a/OpenCL/m01720_a3-pure.cl b/OpenCL/m01720_a3-pure.cl
index b4faaaed2..c12c16a75 100644
--- a/OpenCL/m01720_a3-pure.cl
+++ b/OpenCL/m01720_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha512.cl"
diff --git a/OpenCL/m01730_a0-optimized.cl b/OpenCL/m01730_a0-optimized.cl
index 990dc6712..f5da15e7f 100644
--- a/OpenCL/m01730_a0-optimized.cl
+++ b/OpenCL/m01730_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
@@ -85,6 +86,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
 
   ROUND_STEP (0);
 
+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  ROUND_EXPAND (); ROUND_STEP (64);
+  #else
   #ifdef _unroll
   #pragma unroll
   #endif
@@ -92,6 +99,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
   {
     ROUND_EXPAND (); ROUND_STEP (i);
   }
+  #endif
 
   /* rev
   digest[0] += a;
diff --git a/OpenCL/m01730_a0-pure.cl b/OpenCL/m01730_a0-pure.cl
index 1f2f9946f..2e6ee476a 100644
--- a/OpenCL/m01730_a0-pure.cl
+++ b/OpenCL/m01730_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
diff --git a/OpenCL/m01730_a1-optimized.cl b/OpenCL/m01730_a1-optimized.cl
index 51e673e37..f3cd8d89a 100644
--- a/OpenCL/m01730_a1-optimized.cl
+++ b/OpenCL/m01730_a1-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha512.cl"
@@ -83,6 +84,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
 
   ROUND_STEP (0);
 
+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  ROUND_EXPAND (); ROUND_STEP (64);
+  #else
   #ifdef _unroll
   #pragma unroll
   #endif
@@ -90,6 +97,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
   {
     ROUND_EXPAND (); ROUND_STEP (i);
   }
+  #endif
 
   /* rev
   digest[0] += a;
diff --git a/OpenCL/m01730_a1-pure.cl b/OpenCL/m01730_a1-pure.cl
index e1dd5629d..105807da5 100644
--- a/OpenCL/m01730_a1-pure.cl
+++ b/OpenCL/m01730_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_sha512.cl"
diff --git a/OpenCL/m01730_a3-optimized.cl b/OpenCL/m01730_a3-optimized.cl
index cc9a73baa..e00e5f4ae 100644
--- a/OpenCL/m01730_a3-optimized.cl
+++ b/OpenCL/m01730_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha512.cl"
@@ -83,6 +84,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
 
   ROUND_STEP (0);
 
+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  ROUND_EXPAND (); ROUND_STEP (64);
+  #else
   #ifdef _unroll
   #pragma unroll
   #endif
@@ -90,6 +97,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
   {
     ROUND_EXPAND (); ROUND_STEP (i);
   }
+  #endif
 
   /* rev
   digest[0] += a;
diff --git a/OpenCL/m01730_a3-pure.cl b/OpenCL/m01730_a3-pure.cl
index 119a19f31..f319d97b4 100644
--- a/OpenCL/m01730_a3-pure.cl
+++ b/OpenCL/m01730_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha512.cl"
diff --git a/OpenCL/m01740_a0-optimized.cl b/OpenCL/m01740_a0-optimized.cl
index 4e046b948..ee38662e8 100644
--- a/OpenCL/m01740_a0-optimized.cl
+++ b/OpenCL/m01740_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
@@ -85,6 +86,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
 
   ROUND_STEP (0);
 
+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  ROUND_EXPAND (); ROUND_STEP (64);
+  #else
   #ifdef _unroll
   #pragma unroll
   #endif
@@ -92,6 +99,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
   {
     ROUND_EXPAND (); ROUND_STEP (i);
   }
+  #endif
 
   /* rev
   digest[0] += a;
diff --git a/OpenCL/m01740_a0-pure.cl b/OpenCL/m01740_a0-pure.cl
index 90626bed5..526a0d021 100644
--- a/OpenCL/m01740_a0-pure.cl
+++ b/OpenCL/m01740_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
diff --git a/OpenCL/m01740_a1-optimized.cl b/OpenCL/m01740_a1-optimized.cl
index 6c9306d76..0ae6984e0 100644
--- a/OpenCL/m01740_a1-optimized.cl
+++ b/OpenCL/m01740_a1-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha512.cl"
@@ -83,6 +84,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
 
   ROUND_STEP (0);
 
+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  ROUND_EXPAND (); ROUND_STEP (64);
+  #else
   #ifdef _unroll
   #pragma unroll
   #endif
@@ -90,6 +97,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
   {
     ROUND_EXPAND (); ROUND_STEP (i);
   }
+  #endif
 
   /* rev
   digest[0] += a;
diff --git a/OpenCL/m01740_a1-pure.cl b/OpenCL/m01740_a1-pure.cl
index 2c1f2fd91..dca49789a 100644
--- a/OpenCL/m01740_a1-pure.cl
+++ b/OpenCL/m01740_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_sha512.cl"
diff --git a/OpenCL/m01740_a3-optimized.cl b/OpenCL/m01740_a3-optimized.cl
index 077830ed1..4b7b1d3df 100644
--- a/OpenCL/m01740_a3-optimized.cl
+++ b/OpenCL/m01740_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha512.cl"
@@ -83,6 +84,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
 
   ROUND_STEP (0);
 
+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  ROUND_EXPAND (); ROUND_STEP (64);
+  #else
   #ifdef _unroll
   #pragma unroll
   #endif
@@ -90,6 +97,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
   {
     ROUND_EXPAND (); ROUND_STEP (i);
   }
+  #endif
 
   /* rev
   digest[0] += a;
diff --git a/OpenCL/m01740_a3-pure.cl b/OpenCL/m01740_a3-pure.cl
index 298ff8eb7..f188a6108 100644
--- a/OpenCL/m01740_a3-pure.cl
+++ b/OpenCL/m01740_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha512.cl"
diff --git a/OpenCL/m01750_a0-optimized.cl b/OpenCL/m01750_a0-optimized.cl
index b8cdee6e8..de916ff90 100644
--- a/OpenCL/m01750_a0-optimized.cl
+++ b/OpenCL/m01750_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
@@ -69,22 +70,22 @@ DECLSPEC void hmac_sha512_pad (u32x *w0, u32x *w1, u32x *w2, u32x *w3, u64x *ipa
   u64x w2_t[4];
   u64x w3_t[4];
 
-  w0_t[0] = hl32_to_64 (w0[0], w0[1]) ^ (u64x) 0x3636363636363636;
-  w0_t[1] = hl32_to_64 (w0[2], w0[3]) ^ (u64x) 0x3636363636363636;
-  w0_t[2] = hl32_to_64 (w1[0], w1[1]) ^ (u64x) 0x3636363636363636;
-  w0_t[3] = hl32_to_64 (w1[2], w1[3]) ^ (u64x) 0x3636363636363636;
-  w1_t[0] = hl32_to_64 (w2[0], w2[1]) ^ (u64x) 0x3636363636363636;
-  w1_t[1] = hl32_to_64 (w2[2], w2[3]) ^ (u64x) 0x3636363636363636;
-  w1_t[2] = hl32_to_64 (w3[0], w3[1]) ^ (u64x) 0x3636363636363636;
-  w1_t[3] = hl32_to_64 (w3[2], w3[3]) ^ (u64x) 0x3636363636363636;
-  w2_t[0] =                             (u64x) 0x3636363636363636;
-  w2_t[1] =                             (u64x) 0x3636363636363636;
-  w2_t[2] =                             (u64x) 0x3636363636363636;
-  w2_t[3] =                             (u64x) 0x3636363636363636;
-  w3_t[0] =                             (u64x) 0x3636363636363636;
-  w3_t[1] =                             (u64x) 0x3636363636363636;
-  w3_t[2] =                             (u64x) 0x3636363636363636;
-  w3_t[3] =                             (u64x) 0x3636363636363636;
+  w0_t[0] = hl32_to_64 (w0[0], w0[1]) ^ make_u64x (0x3636363636363636);
+  w0_t[1] = hl32_to_64 (w0[2], w0[3]) ^ make_u64x (0x3636363636363636);
+  w0_t[2] = hl32_to_64 (w1[0], w1[1]) ^ make_u64x (0x3636363636363636);
+  w0_t[3] = hl32_to_64 (w1[2], w1[3]) ^ make_u64x (0x3636363636363636);
+  w1_t[0] = hl32_to_64 (w2[0], w2[1]) ^ make_u64x (0x3636363636363636);
+  w1_t[1] = hl32_to_64 (w2[2], w2[3]) ^ make_u64x (0x3636363636363636);
+  w1_t[2] = hl32_to_64 (w3[0], w3[1]) ^ make_u64x (0x3636363636363636);
+  w1_t[3] = hl32_to_64 (w3[2], w3[3]) ^ make_u64x (0x3636363636363636);
+  w2_t[0] =                             make_u64x (0x3636363636363636);
+  w2_t[1] =                             make_u64x (0x3636363636363636);
+  w2_t[2] =                             make_u64x (0x3636363636363636);
+  w2_t[3] =                             make_u64x (0x3636363636363636);
+  w3_t[0] =                             make_u64x (0x3636363636363636);
+  w3_t[1] =                             make_u64x (0x3636363636363636);
+  w3_t[2] =                             make_u64x (0x3636363636363636);
+  w3_t[3] =                             make_u64x (0x3636363636363636);
 
   ipad[0] = SHA512M_A;
   ipad[1] = SHA512M_B;
@@ -97,22 +98,22 @@ DECLSPEC void hmac_sha512_pad (u32x *w0, u32x *w1, u32x *w2, u32x *w3, u64x *ipa
 
   sha512_transform_transport_vector (w0_t, w1_t, w2_t, w3_t, ipad);
 
-  w0_t[0] = hl32_to_64 (w0[0], w0[1]) ^ (u64x) 0x5c5c5c5c5c5c5c5c;
-  w0_t[1] = hl32_to_64 (w0[2], w0[3]) ^ (u64x) 0x5c5c5c5c5c5c5c5c;
-  w0_t[2] = hl32_to_64 (w1[0], w1[1]) ^ (u64x) 0x5c5c5c5c5c5c5c5c;
-  w0_t[3] = hl32_to_64 (w1[2], w1[3]) ^ (u64x) 0x5c5c5c5c5c5c5c5c;
-  w1_t[0] = hl32_to_64 (w2[0], w2[1]) ^ (u64x) 0x5c5c5c5c5c5c5c5c;
-  w1_t[1] = hl32_to_64 (w2[2], w2[3]) ^ (u64x) 0x5c5c5c5c5c5c5c5c;
-  w1_t[2] = hl32_to_64 (w3[0], w3[1]) ^ (u64x) 0x5c5c5c5c5c5c5c5c;
-  w1_t[3] = hl32_to_64 (w3[2], w3[3]) ^ (u64x) 0x5c5c5c5c5c5c5c5c;
-  w2_t[0] =                             (u64x) 0x5c5c5c5c5c5c5c5c;
-  w2_t[1] =                             (u64x) 0x5c5c5c5c5c5c5c5c;
-  w2_t[2] =                             (u64x) 0x5c5c5c5c5c5c5c5c;
-  w2_t[3] =                             (u64x) 0x5c5c5c5c5c5c5c5c;
-  w3_t[0] =                             (u64x) 0x5c5c5c5c5c5c5c5c;
-  w3_t[1] =                             (u64x) 0x5c5c5c5c5c5c5c5c;
-  w3_t[2] =                             (u64x) 0x5c5c5c5c5c5c5c5c;
-  w3_t[3] =                             (u64x) 0x5c5c5c5c5c5c5c5c;
+  w0_t[0] = hl32_to_64 (w0[0], w0[1]) ^ make_u64x (0x5c5c5c5c5c5c5c5c);
+  w0_t[1] = hl32_to_64 (w0[2], w0[3]) ^ make_u64x (0x5c5c5c5c5c5c5c5c);
+  w0_t[2] = hl32_to_64 (w1[0], w1[1]) ^ make_u64x (0x5c5c5c5c5c5c5c5c);
+  w0_t[3] = hl32_to_64 (w1[2], w1[3]) ^ make_u64x (0x5c5c5c5c5c5c5c5c);
+  w1_t[0] = hl32_to_64 (w2[0], w2[1]) ^ make_u64x (0x5c5c5c5c5c5c5c5c);
+  w1_t[1] = hl32_to_64 (w2[2], w2[3]) ^ make_u64x (0x5c5c5c5c5c5c5c5c);
+  w1_t[2] = hl32_to_64 (w3[0], w3[1]) ^ make_u64x (0x5c5c5c5c5c5c5c5c);
+  w1_t[3] = hl32_to_64 (w3[2], w3[3]) ^ make_u64x (0x5c5c5c5c5c5c5c5c);
+  w2_t[0] =                             make_u64x (0x5c5c5c5c5c5c5c5c);
+  w2_t[1] =                             make_u64x (0x5c5c5c5c5c5c5c5c);
+  w2_t[2] =                             make_u64x (0x5c5c5c5c5c5c5c5c);
+  w2_t[3] =                             make_u64x (0x5c5c5c5c5c5c5c5c);
+  w3_t[0] =                             make_u64x (0x5c5c5c5c5c5c5c5c);
+  w3_t[1] =                             make_u64x (0x5c5c5c5c5c5c5c5c);
+  w3_t[2] =                             make_u64x (0x5c5c5c5c5c5c5c5c);
+  w3_t[3] =                             make_u64x (0x5c5c5c5c5c5c5c5c);
 
   opad[0] = SHA512M_A;
   opad[1] = SHA512M_B;
diff --git a/OpenCL/m01750_a0-pure.cl b/OpenCL/m01750_a0-pure.cl
index adf938244..f41391d60 100644
--- a/OpenCL/m01750_a0-pure.cl
+++ b/OpenCL/m01750_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
diff --git a/OpenCL/m01750_a1-optimized.cl b/OpenCL/m01750_a1-optimized.cl
index 56693782b..6d3287176 100644
--- a/OpenCL/m01750_a1-optimized.cl
+++ b/OpenCL/m01750_a1-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha512.cl"
@@ -67,22 +68,22 @@ DECLSPEC void hmac_sha512_pad (u32x *w0, u32x *w1, u32x *w2, u32x *w3, u64x *ipa
   u64x w2_t[4];
   u64x w3_t[4];
 
-  w0_t[0] = hl32_to_64 (w0[0], w0[1]) ^ (u64x) 0x3636363636363636;
-  w0_t[1] = hl32_to_64 (w0[2], w0[3]) ^ (u64x) 0x3636363636363636;
-  w0_t[2] = hl32_to_64 (w1[0], w1[1]) ^ (u64x) 0x3636363636363636;
-  w0_t[3] = hl32_to_64 (w1[2], w1[3]) ^ (u64x) 0x3636363636363636;
-  w1_t[0] = hl32_to_64 (w2[0], w2[1]) ^ (u64x) 0x3636363636363636;
-  w1_t[1] = hl32_to_64 (w2[2], w2[3]) ^ (u64x) 0x3636363636363636;
-  w1_t[2] = hl32_to_64 (w3[0], w3[1]) ^ (u64x) 0x3636363636363636;
-  w1_t[3] = hl32_to_64 (w3[2], w3[3]) ^ (u64x) 0x3636363636363636;
-  w2_t[0] =                             (u64x) 0x3636363636363636;
-  w2_t[1] =                             (u64x) 0x3636363636363636;
-  w2_t[2] =                             (u64x) 0x3636363636363636;
-  w2_t[3] =                             (u64x) 0x3636363636363636;
-  w3_t[0] =                             (u64x) 0x3636363636363636;
-  w3_t[1] =                             (u64x) 0x3636363636363636;
-  w3_t[2] =                             (u64x) 0x3636363636363636;
-  w3_t[3] =                             (u64x) 0x3636363636363636;
+  w0_t[0] = hl32_to_64 (w0[0], w0[1]) ^ make_u64x (0x3636363636363636);
+  w0_t[1] = hl32_to_64 (w0[2], w0[3]) ^ make_u64x (0x3636363636363636);
+  w0_t[2] = hl32_to_64 (w1[0], w1[1]) ^ make_u64x (0x3636363636363636);
+  w0_t[3] = hl32_to_64 (w1[2], w1[3]) ^ make_u64x (0x3636363636363636);
+  w1_t[0] = hl32_to_64 (w2[0], w2[1]) ^ make_u64x (0x3636363636363636);
+  w1_t[1] = hl32_to_64 (w2[2], w2[3]) ^ make_u64x (0x3636363636363636);
+  w1_t[2] = hl32_to_64 (w3[0], w3[1]) ^ make_u64x (0x3636363636363636);
+  w1_t[3] = hl32_to_64 (w3[2], w3[3]) ^ make_u64x (0x3636363636363636);
+  w2_t[0] =                             make_u64x (0x3636363636363636);
+  w2_t[1] =                             make_u64x (0x3636363636363636);
+  w2_t[2] =                             make_u64x (0x3636363636363636);
+  w2_t[3] =                             make_u64x (0x3636363636363636);
+  w3_t[0] =                             make_u64x (0x3636363636363636);
+  w3_t[1] =                             make_u64x (0x3636363636363636);
+  w3_t[2] =                             make_u64x (0x3636363636363636);
+  w3_t[3] =                             make_u64x (0x3636363636363636);
 
   ipad[0] = SHA512M_A;
   ipad[1] = SHA512M_B;
@@ -95,22 +96,22 @@ DECLSPEC void hmac_sha512_pad (u32x *w0, u32x *w1, u32x *w2, u32x *w3, u64x *ipa
 
   sha512_transform_transport_vector (w0_t, w1_t, w2_t, w3_t, ipad);
 
-  w0_t[0] = hl32_to_64 (w0[0], w0[1]) ^ (u64x) 0x5c5c5c5c5c5c5c5c;
-  w0_t[1] = hl32_to_64 (w0[2], w0[3]) ^ (u64x) 0x5c5c5c5c5c5c5c5c;
-  w0_t[2] = hl32_to_64 (w1[0], w1[1]) ^ (u64x) 0x5c5c5c5c5c5c5c5c;
-  w0_t[3] = hl32_to_64 (w1[2], w1[3]) ^ (u64x) 0x5c5c5c5c5c5c5c5c;
-  w1_t[0] = hl32_to_64 (w2[0], w2[1]) ^ (u64x) 0x5c5c5c5c5c5c5c5c;
-  w1_t[1] = hl32_to_64 (w2[2], w2[3]) ^ (u64x) 0x5c5c5c5c5c5c5c5c;
-  w1_t[2] = hl32_to_64 (w3[0], w3[1]) ^ (u64x) 0x5c5c5c5c5c5c5c5c;
-  w1_t[3] = hl32_to_64 (w3[2], w3[3]) ^ (u64x) 0x5c5c5c5c5c5c5c5c;
-  w2_t[0] =                             (u64x) 0x5c5c5c5c5c5c5c5c;
-  w2_t[1] =                             (u64x) 0x5c5c5c5c5c5c5c5c;
-  w2_t[2] =                             (u64x) 0x5c5c5c5c5c5c5c5c;
-  w2_t[3] =                             (u64x) 0x5c5c5c5c5c5c5c5c;
-  w3_t[0] =                             (u64x) 0x5c5c5c5c5c5c5c5c;
-  w3_t[1] =                             (u64x) 0x5c5c5c5c5c5c5c5c;
-  w3_t[2] =                             (u64x) 0x5c5c5c5c5c5c5c5c;
-  w3_t[3] =                             (u64x) 0x5c5c5c5c5c5c5c5c;
+  w0_t[0] = hl32_to_64 (w0[0], w0[1]) ^ make_u64x (0x5c5c5c5c5c5c5c5c);
+  w0_t[1] = hl32_to_64 (w0[2], w0[3]) ^ make_u64x (0x5c5c5c5c5c5c5c5c);
+  w0_t[2] = hl32_to_64 (w1[0], w1[1]) ^ make_u64x (0x5c5c5c5c5c5c5c5c);
+  w0_t[3] = hl32_to_64 (w1[2], w1[3]) ^ make_u64x (0x5c5c5c5c5c5c5c5c);
+  w1_t[0] = hl32_to_64 (w2[0], w2[1]) ^ make_u64x (0x5c5c5c5c5c5c5c5c);
+  w1_t[1] = hl32_to_64 (w2[2], w2[3]) ^ make_u64x (0x5c5c5c5c5c5c5c5c);
+  w1_t[2] = hl32_to_64 (w3[0], w3[1]) ^ make_u64x (0x5c5c5c5c5c5c5c5c);
+  w1_t[3] = hl32_to_64 (w3[2], w3[3]) ^ make_u64x (0x5c5c5c5c5c5c5c5c);
+  w2_t[0] =                             make_u64x (0x5c5c5c5c5c5c5c5c);
+  w2_t[1] =                             make_u64x (0x5c5c5c5c5c5c5c5c);
+  w2_t[2] =                             make_u64x (0x5c5c5c5c5c5c5c5c);
+  w2_t[3] =                             make_u64x (0x5c5c5c5c5c5c5c5c);
+  w3_t[0] =                             make_u64x (0x5c5c5c5c5c5c5c5c);
+  w3_t[1] =                             make_u64x (0x5c5c5c5c5c5c5c5c);
+  w3_t[2] =                             make_u64x (0x5c5c5c5c5c5c5c5c);
+  w3_t[3] =                             make_u64x (0x5c5c5c5c5c5c5c5c);
 
   opad[0] = SHA512M_A;
   opad[1] = SHA512M_B;
diff --git a/OpenCL/m01750_a1-pure.cl b/OpenCL/m01750_a1-pure.cl
index ebc65973f..fad3ad3aa 100644
--- a/OpenCL/m01750_a1-pure.cl
+++ b/OpenCL/m01750_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_sha512.cl"
diff --git a/OpenCL/m01750_a3-optimized.cl b/OpenCL/m01750_a3-optimized.cl
index 05f3a62f7..99301acde 100644
--- a/OpenCL/m01750_a3-optimized.cl
+++ b/OpenCL/m01750_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha512.cl"
@@ -67,22 +68,22 @@ DECLSPEC void hmac_sha512_pad (u32x *w0, u32x *w1, u32x *w2, u32x *w3, u64x *ipa
   u64x w2_t[4];
   u64x w3_t[4];
 
-  w0_t[0] = hl32_to_64 (w0[0], w0[1]) ^ 0x3636363636363636;
-  w0_t[1] = hl32_to_64 (w0[2], w0[3]) ^ 0x3636363636363636;
-  w0_t[2] = hl32_to_64 (w1[0], w1[1]) ^ 0x3636363636363636;
-  w0_t[3] = hl32_to_64 (w1[2], w1[3]) ^ 0x3636363636363636;
-  w1_t[0] = hl32_to_64 (w2[0], w2[1]) ^ 0x3636363636363636;
-  w1_t[1] = hl32_to_64 (w2[2], w2[3]) ^ 0x3636363636363636;
-  w1_t[2] = hl32_to_64 (w3[0], w3[1]) ^ 0x3636363636363636;
-  w1_t[3] = hl32_to_64 (w3[2], w3[3]) ^ 0x3636363636363636;
-  w2_t[0] =                             0x3636363636363636;
-  w2_t[1] =                             0x3636363636363636;
-  w2_t[2] =                             0x3636363636363636;
-  w2_t[3] =                             0x3636363636363636;
-  w3_t[0] =                             0x3636363636363636;
-  w3_t[1] =                             0x3636363636363636;
-  w3_t[2] =                             0x3636363636363636;
-  w3_t[3] =                             0x3636363636363636;
+  w0_t[0] = hl32_to_64 (w0[0], w0[1]) ^ make_u64x (0x3636363636363636);
+  w0_t[1] = hl32_to_64 (w0[2], w0[3]) ^ make_u64x (0x3636363636363636);
+  w0_t[2] = hl32_to_64 (w1[0], w1[1]) ^ make_u64x (0x3636363636363636);
+  w0_t[3] = hl32_to_64 (w1[2], w1[3]) ^ make_u64x (0x3636363636363636);
+  w1_t[0] = hl32_to_64 (w2[0], w2[1]) ^ make_u64x (0x3636363636363636);
+  w1_t[1] = hl32_to_64 (w2[2], w2[3]) ^ make_u64x (0x3636363636363636);
+  w1_t[2] = hl32_to_64 (w3[0], w3[1]) ^ make_u64x (0x3636363636363636);
+  w1_t[3] = hl32_to_64 (w3[2], w3[3]) ^ make_u64x (0x3636363636363636);
+  w2_t[0] =                             make_u64x (0x3636363636363636);
+  w2_t[1] =                             make_u64x (0x3636363636363636);
+  w2_t[2] =                             make_u64x (0x3636363636363636);
+  w2_t[3] =                             make_u64x (0x3636363636363636);
+  w3_t[0] =                             make_u64x (0x3636363636363636);
+  w3_t[1] =                             make_u64x (0x3636363636363636);
+  w3_t[2] =                             make_u64x (0x3636363636363636);
+  w3_t[3] =                             make_u64x (0x3636363636363636);
 
   ipad[0] = SHA512M_A;
   ipad[1] = SHA512M_B;
@@ -95,22 +96,22 @@ DECLSPEC void hmac_sha512_pad (u32x *w0, u32x *w1, u32x *w2, u32x *w3, u64x *ipa
 
   sha512_transform_transport_vector (w0_t, w1_t, w2_t, w3_t, ipad);
 
-  w0_t[0] = hl32_to_64 (w0[0], w0[1]) ^ 0x5c5c5c5c5c5c5c5c;
-  w0_t[1] = hl32_to_64 (w0[2], w0[3]) ^ 0x5c5c5c5c5c5c5c5c;
-  w0_t[2] = hl32_to_64 (w1[0], w1[1]) ^ 0x5c5c5c5c5c5c5c5c;
-  w0_t[3] = hl32_to_64 (w1[2], w1[3]) ^ 0x5c5c5c5c5c5c5c5c;
-  w1_t[0] = hl32_to_64 (w2[0], w2[1]) ^ 0x5c5c5c5c5c5c5c5c;
-  w1_t[1] = hl32_to_64 (w2[2], w2[3]) ^ 0x5c5c5c5c5c5c5c5c;
-  w1_t[2] = hl32_to_64 (w3[0], w3[1]) ^ 0x5c5c5c5c5c5c5c5c;
-  w1_t[3] = hl32_to_64 (w3[2], w3[3]) ^ 0x5c5c5c5c5c5c5c5c;
-  w2_t[0] =                             0x5c5c5c5c5c5c5c5c;
-  w2_t[1] =                             0x5c5c5c5c5c5c5c5c;
-  w2_t[2] =                             0x5c5c5c5c5c5c5c5c;
-  w2_t[3] =                             0x5c5c5c5c5c5c5c5c;
-  w3_t[0] =                             0x5c5c5c5c5c5c5c5c;
-  w3_t[1] =                             0x5c5c5c5c5c5c5c5c;
-  w3_t[2] =                             0x5c5c5c5c5c5c5c5c;
-  w3_t[3] =                             0x5c5c5c5c5c5c5c5c;
+  w0_t[0] = hl32_to_64 (w0[0], w0[1]) ^ make_u64x (0x5c5c5c5c5c5c5c5c);
+  w0_t[1] = hl32_to_64 (w0[2], w0[3]) ^ make_u64x (0x5c5c5c5c5c5c5c5c);
+  w0_t[2] = hl32_to_64 (w1[0], w1[1]) ^ make_u64x (0x5c5c5c5c5c5c5c5c);
+  w0_t[3] = hl32_to_64 (w1[2], w1[3]) ^ make_u64x (0x5c5c5c5c5c5c5c5c);
+  w1_t[0] = hl32_to_64 (w2[0], w2[1]) ^ make_u64x (0x5c5c5c5c5c5c5c5c);
+  w1_t[1] = hl32_to_64 (w2[2], w2[3]) ^ make_u64x (0x5c5c5c5c5c5c5c5c);
+  w1_t[2] = hl32_to_64 (w3[0], w3[1]) ^ make_u64x (0x5c5c5c5c5c5c5c5c);
+  w1_t[3] = hl32_to_64 (w3[2], w3[3]) ^ make_u64x (0x5c5c5c5c5c5c5c5c);
+  w2_t[0] =                             make_u64x (0x5c5c5c5c5c5c5c5c);
+  w2_t[1] =                             make_u64x (0x5c5c5c5c5c5c5c5c);
+  w2_t[2] =                             make_u64x (0x5c5c5c5c5c5c5c5c);
+  w2_t[3] =                             make_u64x (0x5c5c5c5c5c5c5c5c);
+  w3_t[0] =                             make_u64x (0x5c5c5c5c5c5c5c5c);
+  w3_t[1] =                             make_u64x (0x5c5c5c5c5c5c5c5c);
+  w3_t[2] =                             make_u64x (0x5c5c5c5c5c5c5c5c);
+  w3_t[3] =                             make_u64x (0x5c5c5c5c5c5c5c5c);
 
   opad[0] = SHA512M_A;
   opad[1] = SHA512M_B;
diff --git a/OpenCL/m01750_a3-pure.cl b/OpenCL/m01750_a3-pure.cl
index 1a2c82e24..3c5c79c1f 100644
--- a/OpenCL/m01750_a3-pure.cl
+++ b/OpenCL/m01750_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha512.cl"
diff --git a/OpenCL/m01760_a0-optimized.cl b/OpenCL/m01760_a0-optimized.cl
index 2a8fbca09..ec036ab8c 100644
--- a/OpenCL/m01760_a0-optimized.cl
+++ b/OpenCL/m01760_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
@@ -69,22 +70,22 @@ DECLSPEC void hmac_sha512_pad (u32x *w0, u32x *w1, u32x *w2, u32x *w3, u64x *ipa
   u64x w2_t[4];
   u64x w3_t[4];
 
-  w0_t[0] = hl32_to_64 (w0[0], w0[1]) ^ (u64x) 0x3636363636363636;
-  w0_t[1] = hl32_to_64 (w0[2], w0[3]) ^ (u64x) 0x3636363636363636;
-  w0_t[2] = hl32_to_64 (w1[0], w1[1]) ^ (u64x) 0x3636363636363636;
-  w0_t[3] = hl32_to_64 (w1[2], w1[3]) ^ (u64x) 0x3636363636363636;
-  w1_t[0] = hl32_to_64 (w2[0], w2[1]) ^ (u64x) 0x3636363636363636;
-  w1_t[1] = hl32_to_64 (w2[2], w2[3]) ^ (u64x) 0x3636363636363636;
-  w1_t[2] = hl32_to_64 (w3[0], w3[1]) ^ (u64x) 0x3636363636363636;
-  w1_t[3] = hl32_to_64 (w3[2], w3[3]) ^ (u64x) 0x3636363636363636;
-  w2_t[0] =                             (u64x) 0x3636363636363636;
-  w2_t[1] =                             (u64x) 0x3636363636363636;
-  w2_t[2] =                             (u64x) 0x3636363636363636;
-  w2_t[3] =                             (u64x) 0x3636363636363636;
-  w3_t[0] =                             (u64x) 0x3636363636363636;
-  w3_t[1] =                             (u64x) 0x3636363636363636;
-  w3_t[2] =                             (u64x) 0x3636363636363636;
-  w3_t[3] =                             (u64x) 0x3636363636363636;
+  w0_t[0] = hl32_to_64 (w0[0], w0[1]) ^ make_u64x (0x3636363636363636);
+  w0_t[1] = hl32_to_64 (w0[2], w0[3]) ^ make_u64x (0x3636363636363636);
+  w0_t[2] = hl32_to_64 (w1[0], w1[1]) ^ make_u64x (0x3636363636363636);
+  w0_t[3] = hl32_to_64 (w1[2], w1[3]) ^ make_u64x (0x3636363636363636);
+  w1_t[0] = hl32_to_64 (w2[0], w2[1]) ^ make_u64x (0x3636363636363636);
+  w1_t[1] = hl32_to_64 (w2[2], w2[3]) ^ make_u64x (0x3636363636363636);
+  w1_t[2] = hl32_to_64 (w3[0], w3[1]) ^ make_u64x (0x3636363636363636);
+  w1_t[3] = hl32_to_64 (w3[2], w3[3]) ^ make_u64x (0x3636363636363636);
+  w2_t[0] =                             make_u64x (0x3636363636363636);
+  w2_t[1] =                             make_u64x (0x3636363636363636);
+  w2_t[2] =                             make_u64x (0x3636363636363636);
+  w2_t[3] =                             make_u64x (0x3636363636363636);
+  w3_t[0] =                             make_u64x (0x3636363636363636);
+  w3_t[1] =                             make_u64x (0x3636363636363636);
+  w3_t[2] =                             make_u64x (0x3636363636363636);
+  w3_t[3] =                             make_u64x (0x3636363636363636);
 
   ipad[0] = SHA512M_A;
   ipad[1] = SHA512M_B;
@@ -97,22 +98,22 @@ DECLSPEC void hmac_sha512_pad (u32x *w0, u32x *w1, u32x *w2, u32x *w3, u64x *ipa
 
   sha512_transform_transport_vector (w0_t, w1_t, w2_t, w3_t, ipad);
 
-  w0_t[0] = hl32_to_64 (w0[0], w0[1]) ^ (u64x) 0x5c5c5c5c5c5c5c5c;
-  w0_t[1] = hl32_to_64 (w0[2], w0[3]) ^ (u64x) 0x5c5c5c5c5c5c5c5c;
-  w0_t[2] = hl32_to_64 (w1[0], w1[1]) ^ (u64x) 0x5c5c5c5c5c5c5c5c;
-  w0_t[3] = hl32_to_64 (w1[2], w1[3]) ^ (u64x) 0x5c5c5c5c5c5c5c5c;
-  w1_t[0] = hl32_to_64 (w2[0], w2[1]) ^ (u64x) 0x5c5c5c5c5c5c5c5c;
-  w1_t[1] = hl32_to_64 (w2[2], w2[3]) ^ (u64x) 0x5c5c5c5c5c5c5c5c;
-  w1_t[2] = hl32_to_64 (w3[0], w3[1]) ^ (u64x) 0x5c5c5c5c5c5c5c5c;
-  w1_t[3] = hl32_to_64 (w3[2], w3[3]) ^ (u64x) 0x5c5c5c5c5c5c5c5c;
-  w2_t[0] =                             (u64x) 0x5c5c5c5c5c5c5c5c;
-  w2_t[1] =                             (u64x) 0x5c5c5c5c5c5c5c5c;
-  w2_t[2] =                             (u64x) 0x5c5c5c5c5c5c5c5c;
-  w2_t[3] =                             (u64x) 0x5c5c5c5c5c5c5c5c;
-  w3_t[0] =                             (u64x) 0x5c5c5c5c5c5c5c5c;
-  w3_t[1] =                             (u64x) 0x5c5c5c5c5c5c5c5c;
-  w3_t[2] =                             (u64x) 0x5c5c5c5c5c5c5c5c;
-  w3_t[3] =                             (u64x) 0x5c5c5c5c5c5c5c5c;
+  w0_t[0] = hl32_to_64 (w0[0], w0[1]) ^ make_u64x (0x5c5c5c5c5c5c5c5c);
+  w0_t[1] = hl32_to_64 (w0[2], w0[3]) ^ make_u64x (0x5c5c5c5c5c5c5c5c);
+  w0_t[2] = hl32_to_64 (w1[0], w1[1]) ^ make_u64x (0x5c5c5c5c5c5c5c5c);
+  w0_t[3] = hl32_to_64 (w1[2], w1[3]) ^ make_u64x (0x5c5c5c5c5c5c5c5c);
+  w1_t[0] = hl32_to_64 (w2[0], w2[1]) ^ make_u64x (0x5c5c5c5c5c5c5c5c);
+  w1_t[1] = hl32_to_64 (w2[2], w2[3]) ^ make_u64x (0x5c5c5c5c5c5c5c5c);
+  w1_t[2] = hl32_to_64 (w3[0], w3[1]) ^ make_u64x (0x5c5c5c5c5c5c5c5c);
+  w1_t[3] = hl32_to_64 (w3[2], w3[3]) ^ make_u64x (0x5c5c5c5c5c5c5c5c);
+  w2_t[0] =                             make_u64x (0x5c5c5c5c5c5c5c5c);
+  w2_t[1] =                             make_u64x (0x5c5c5c5c5c5c5c5c);
+  w2_t[2] =                             make_u64x (0x5c5c5c5c5c5c5c5c);
+  w2_t[3] =                             make_u64x (0x5c5c5c5c5c5c5c5c);
+  w3_t[0] =                             make_u64x (0x5c5c5c5c5c5c5c5c);
+  w3_t[1] =                             make_u64x (0x5c5c5c5c5c5c5c5c);
+  w3_t[2] =                             make_u64x (0x5c5c5c5c5c5c5c5c);
+  w3_t[3] =                             make_u64x (0x5c5c5c5c5c5c5c5c);
 
   opad[0] = SHA512M_A;
   opad[1] = SHA512M_B;
diff --git a/OpenCL/m01760_a0-pure.cl b/OpenCL/m01760_a0-pure.cl
index 149526465..b340a1ec9 100644
--- a/OpenCL/m01760_a0-pure.cl
+++ b/OpenCL/m01760_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
diff --git a/OpenCL/m01760_a1-optimized.cl b/OpenCL/m01760_a1-optimized.cl
index 9285fd6d4..016483936 100644
--- a/OpenCL/m01760_a1-optimized.cl
+++ b/OpenCL/m01760_a1-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha512.cl"
@@ -67,22 +68,22 @@ DECLSPEC void hmac_sha512_pad (u32x *w0, u32x *w1, u32x *w2, u32x *w3, u64x *ipa
   u64x w2_t[4];
   u64x w3_t[4];
 
-  w0_t[0] = hl32_to_64 (w0[0], w0[1]) ^ (u64x) 0x3636363636363636;
-  w0_t[1] = hl32_to_64 (w0[2], w0[3]) ^ (u64x) 0x3636363636363636;
-  w0_t[2] = hl32_to_64 (w1[0], w1[1]) ^ (u64x) 0x3636363636363636;
-  w0_t[3] = hl32_to_64 (w1[2], w1[3]) ^ (u64x) 0x3636363636363636;
-  w1_t[0] = hl32_to_64 (w2[0], w2[1]) ^ (u64x) 0x3636363636363636;
-  w1_t[1] = hl32_to_64 (w2[2], w2[3]) ^ (u64x) 0x3636363636363636;
-  w1_t[2] = hl32_to_64 (w3[0], w3[1]) ^ (u64x) 0x3636363636363636;
-  w1_t[3] = hl32_to_64 (w3[2], w3[3]) ^ (u64x) 0x3636363636363636;
-  w2_t[0] =                             (u64x) 0x3636363636363636;
-  w2_t[1] =                             (u64x) 0x3636363636363636;
-  w2_t[2] =                             (u64x) 0x3636363636363636;
-  w2_t[3] =                             (u64x) 0x3636363636363636;
-  w3_t[0] =                             (u64x) 0x3636363636363636;
-  w3_t[1] =                             (u64x) 0x3636363636363636;
-  w3_t[2] =                             (u64x) 0x3636363636363636;
-  w3_t[3] =                             (u64x) 0x3636363636363636;
+  w0_t[0] = hl32_to_64 (w0[0], w0[1]) ^ make_u64x (0x3636363636363636);
+  w0_t[1] = hl32_to_64 (w0[2], w0[3]) ^ make_u64x (0x3636363636363636);
+  w0_t[2] = hl32_to_64 (w1[0], w1[1]) ^ make_u64x (0x3636363636363636);
+  w0_t[3] = hl32_to_64 (w1[2], w1[3]) ^ make_u64x (0x3636363636363636);
+  w1_t[0] = hl32_to_64 (w2[0], w2[1]) ^ make_u64x (0x3636363636363636);
+  w1_t[1] = hl32_to_64 (w2[2], w2[3]) ^ make_u64x (0x3636363636363636);
+  w1_t[2] = hl32_to_64 (w3[0], w3[1]) ^ make_u64x (0x3636363636363636);
+  w1_t[3] = hl32_to_64 (w3[2], w3[3]) ^ make_u64x (0x3636363636363636);
+  w2_t[0] =                             make_u64x (0x3636363636363636);
+  w2_t[1] =                             make_u64x (0x3636363636363636);
+  w2_t[2] =                             make_u64x (0x3636363636363636);
+  w2_t[3] =                             make_u64x (0x3636363636363636);
+  w3_t[0] =                             make_u64x (0x3636363636363636);
+  w3_t[1] =                             make_u64x (0x3636363636363636);
+  w3_t[2] =                             make_u64x (0x3636363636363636);
+  w3_t[3] =                             make_u64x (0x3636363636363636);
 
   ipad[0] = SHA512M_A;
   ipad[1] = SHA512M_B;
@@ -95,22 +96,22 @@ DECLSPEC void hmac_sha512_pad (u32x *w0, u32x *w1, u32x *w2, u32x *w3, u64x *ipa
 
   sha512_transform_transport_vector (w0_t, w1_t, w2_t, w3_t, ipad);
 
-  w0_t[0] = hl32_to_64 (w0[0], w0[1]) ^ (u64x) 0x5c5c5c5c5c5c5c5c;
-  w0_t[1] = hl32_to_64 (w0[2], w0[3]) ^ (u64x) 0x5c5c5c5c5c5c5c5c;
-  w0_t[2] = hl32_to_64 (w1[0], w1[1]) ^ (u64x) 0x5c5c5c5c5c5c5c5c;
-  w0_t[3] = hl32_to_64 (w1[2], w1[3]) ^ (u64x) 0x5c5c5c5c5c5c5c5c;
-  w1_t[0] = hl32_to_64 (w2[0], w2[1]) ^ (u64x) 0x5c5c5c5c5c5c5c5c;
-  w1_t[1] = hl32_to_64 (w2[2], w2[3]) ^ (u64x) 0x5c5c5c5c5c5c5c5c;
-  w1_t[2] = hl32_to_64 (w3[0], w3[1]) ^ (u64x) 0x5c5c5c5c5c5c5c5c;
-  w1_t[3] = hl32_to_64 (w3[2], w3[3]) ^ (u64x) 0x5c5c5c5c5c5c5c5c;
-  w2_t[0] =                             (u64x) 0x5c5c5c5c5c5c5c5c;
-  w2_t[1] =                             (u64x) 0x5c5c5c5c5c5c5c5c;
-  w2_t[2] =                             (u64x) 0x5c5c5c5c5c5c5c5c;
-  w2_t[3] =                             (u64x) 0x5c5c5c5c5c5c5c5c;
-  w3_t[0] =                             (u64x) 0x5c5c5c5c5c5c5c5c;
-  w3_t[1] =                             (u64x) 0x5c5c5c5c5c5c5c5c;
-  w3_t[2] =                             (u64x) 0x5c5c5c5c5c5c5c5c;
-  w3_t[3] =                             (u64x) 0x5c5c5c5c5c5c5c5c;
+  w0_t[0] = hl32_to_64 (w0[0], w0[1]) ^ make_u64x (0x5c5c5c5c5c5c5c5c);
+  w0_t[1] = hl32_to_64 (w0[2], w0[3]) ^ make_u64x (0x5c5c5c5c5c5c5c5c);
+  w0_t[2] = hl32_to_64 (w1[0], w1[1]) ^ make_u64x (0x5c5c5c5c5c5c5c5c);
+  w0_t[3] = hl32_to_64 (w1[2], w1[3]) ^ make_u64x (0x5c5c5c5c5c5c5c5c);
+  w1_t[0] = hl32_to_64 (w2[0], w2[1]) ^ make_u64x (0x5c5c5c5c5c5c5c5c);
+  w1_t[1] = hl32_to_64 (w2[2], w2[3]) ^ make_u64x (0x5c5c5c5c5c5c5c5c);
+  w1_t[2] = hl32_to_64 (w3[0], w3[1]) ^ make_u64x (0x5c5c5c5c5c5c5c5c);
+  w1_t[3] = hl32_to_64 (w3[2], w3[3]) ^ make_u64x (0x5c5c5c5c5c5c5c5c);
+  w2_t[0] =                             make_u64x (0x5c5c5c5c5c5c5c5c);
+  w2_t[1] =                             make_u64x (0x5c5c5c5c5c5c5c5c);
+  w2_t[2] =                             make_u64x (0x5c5c5c5c5c5c5c5c);
+  w2_t[3] =                             make_u64x (0x5c5c5c5c5c5c5c5c);
+  w3_t[0] =                             make_u64x (0x5c5c5c5c5c5c5c5c);
+  w3_t[1] =                             make_u64x (0x5c5c5c5c5c5c5c5c);
+  w3_t[2] =                             make_u64x (0x5c5c5c5c5c5c5c5c);
+  w3_t[3] =                             make_u64x (0x5c5c5c5c5c5c5c5c);
 
   opad[0] = SHA512M_A;
   opad[1] = SHA512M_B;
diff --git a/OpenCL/m01760_a1-pure.cl b/OpenCL/m01760_a1-pure.cl
index 81fe1aa1a..485606097 100644
--- a/OpenCL/m01760_a1-pure.cl
+++ b/OpenCL/m01760_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_sha512.cl"
diff --git a/OpenCL/m01760_a3-optimized.cl b/OpenCL/m01760_a3-optimized.cl
index ca5ecc77c..3b5a5ca13 100644
--- a/OpenCL/m01760_a3-optimized.cl
+++ b/OpenCL/m01760_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha512.cl"
@@ -67,22 +68,22 @@ DECLSPEC void hmac_sha512_pad (u32x *w0, u32x *w1, u32x *w2, u32x *w3, u64x *ipa
   u64x w2_t[4];
   u64x w3_t[4];
 
-  w0_t[0] = hl32_to_64 (w0[0], w0[1]) ^ (u64x) 0x3636363636363636;
-  w0_t[1] = hl32_to_64 (w0[2], w0[3]) ^ (u64x) 0x3636363636363636;
-  w0_t[2] = hl32_to_64 (w1[0], w1[1]) ^ (u64x) 0x3636363636363636;
-  w0_t[3] = hl32_to_64 (w1[2], w1[3]) ^ (u64x) 0x3636363636363636;
-  w1_t[0] = hl32_to_64 (w2[0], w2[1]) ^ (u64x) 0x3636363636363636;
-  w1_t[1] = hl32_to_64 (w2[2], w2[3]) ^ (u64x) 0x3636363636363636;
-  w1_t[2] = hl32_to_64 (w3[0], w3[1]) ^ (u64x) 0x3636363636363636;
-  w1_t[3] = hl32_to_64 (w3[2], w3[3]) ^ (u64x) 0x3636363636363636;
-  w2_t[0] =                             (u64x) 0x3636363636363636;
-  w2_t[1] =                             (u64x) 0x3636363636363636;
-  w2_t[2] =                             (u64x) 0x3636363636363636;
-  w2_t[3] =                             (u64x) 0x3636363636363636;
-  w3_t[0] =                             (u64x) 0x3636363636363636;
-  w3_t[1] =                             (u64x) 0x3636363636363636;
-  w3_t[2] =                             (u64x) 0x3636363636363636;
-  w3_t[3] =                             (u64x) 0x3636363636363636;
+  w0_t[0] = hl32_to_64 (w0[0], w0[1]) ^ make_u64x (0x3636363636363636);
+  w0_t[1] = hl32_to_64 (w0[2], w0[3]) ^ make_u64x (0x3636363636363636);
+  w0_t[2] = hl32_to_64 (w1[0], w1[1]) ^ make_u64x (0x3636363636363636);
+  w0_t[3] = hl32_to_64 (w1[2], w1[3]) ^ make_u64x (0x3636363636363636);
+  w1_t[0] = hl32_to_64 (w2[0], w2[1]) ^ make_u64x (0x3636363636363636);
+  w1_t[1] = hl32_to_64 (w2[2], w2[3]) ^ make_u64x (0x3636363636363636);
+  w1_t[2] = hl32_to_64 (w3[0], w3[1]) ^ make_u64x (0x3636363636363636);
+  w1_t[3] = hl32_to_64 (w3[2], w3[3]) ^ make_u64x (0x3636363636363636);
+  w2_t[0] =                             make_u64x (0x3636363636363636);
+  w2_t[1] =                             make_u64x (0x3636363636363636);
+  w2_t[2] =                             make_u64x (0x3636363636363636);
+  w2_t[3] =                             make_u64x (0x3636363636363636);
+  w3_t[0] =                             make_u64x (0x3636363636363636);
+  w3_t[1] =                             make_u64x (0x3636363636363636);
+  w3_t[2] =                             make_u64x (0x3636363636363636);
+  w3_t[3] =                             make_u64x (0x3636363636363636);
 
   ipad[0] = SHA512M_A;
   ipad[1] = SHA512M_B;
@@ -95,22 +96,22 @@ DECLSPEC void hmac_sha512_pad (u32x *w0, u32x *w1, u32x *w2, u32x *w3, u64x *ipa
 
   sha512_transform_transport_vector (w0_t, w1_t, w2_t, w3_t, ipad);
 
-  w0_t[0] = hl32_to_64 (w0[0], w0[1]) ^ (u64x) 0x5c5c5c5c5c5c5c5c;
-  w0_t[1] = hl32_to_64 (w0[2], w0[3]) ^ (u64x) 0x5c5c5c5c5c5c5c5c;
-  w0_t[2] = hl32_to_64 (w1[0], w1[1]) ^ (u64x) 0x5c5c5c5c5c5c5c5c;
-  w0_t[3] = hl32_to_64 (w1[2], w1[3]) ^ (u64x) 0x5c5c5c5c5c5c5c5c;
-  w1_t[0] = hl32_to_64 (w2[0], w2[1]) ^ (u64x) 0x5c5c5c5c5c5c5c5c;
-  w1_t[1] = hl32_to_64 (w2[2], w2[3]) ^ (u64x) 0x5c5c5c5c5c5c5c5c;
-  w1_t[2] = hl32_to_64 (w3[0], w3[1]) ^ (u64x) 0x5c5c5c5c5c5c5c5c;
-  w1_t[3] = hl32_to_64 (w3[2], w3[3]) ^ (u64x) 0x5c5c5c5c5c5c5c5c;
-  w2_t[0] =                             (u64x) 0x5c5c5c5c5c5c5c5c;
-  w2_t[1] =                             (u64x) 0x5c5c5c5c5c5c5c5c;
-  w2_t[2] =                             (u64x) 0x5c5c5c5c5c5c5c5c;
-  w2_t[3] =                             (u64x) 0x5c5c5c5c5c5c5c5c;
-  w3_t[0] =                             (u64x) 0x5c5c5c5c5c5c5c5c;
-  w3_t[1] =                             (u64x) 0x5c5c5c5c5c5c5c5c;
-  w3_t[2] =                             (u64x) 0x5c5c5c5c5c5c5c5c;
-  w3_t[3] =                             (u64x) 0x5c5c5c5c5c5c5c5c;
+  w0_t[0] = hl32_to_64 (w0[0], w0[1]) ^ make_u64x (0x5c5c5c5c5c5c5c5c);
+  w0_t[1] = hl32_to_64 (w0[2], w0[3]) ^ make_u64x (0x5c5c5c5c5c5c5c5c);
+  w0_t[2] = hl32_to_64 (w1[0], w1[1]) ^ make_u64x (0x5c5c5c5c5c5c5c5c);
+  w0_t[3] = hl32_to_64 (w1[2], w1[3]) ^ make_u64x (0x5c5c5c5c5c5c5c5c);
+  w1_t[0] = hl32_to_64 (w2[0], w2[1]) ^ make_u64x (0x5c5c5c5c5c5c5c5c);
+  w1_t[1] = hl32_to_64 (w2[2], w2[3]) ^ make_u64x (0x5c5c5c5c5c5c5c5c);
+  w1_t[2] = hl32_to_64 (w3[0], w3[1]) ^ make_u64x (0x5c5c5c5c5c5c5c5c);
+  w1_t[3] = hl32_to_64 (w3[2], w3[3]) ^ make_u64x (0x5c5c5c5c5c5c5c5c);
+  w2_t[0] =                             make_u64x (0x5c5c5c5c5c5c5c5c);
+  w2_t[1] =                             make_u64x (0x5c5c5c5c5c5c5c5c);
+  w2_t[2] =                             make_u64x (0x5c5c5c5c5c5c5c5c);
+  w2_t[3] =                             make_u64x (0x5c5c5c5c5c5c5c5c);
+  w3_t[0] =                             make_u64x (0x5c5c5c5c5c5c5c5c);
+  w3_t[1] =                             make_u64x (0x5c5c5c5c5c5c5c5c);
+  w3_t[2] =                             make_u64x (0x5c5c5c5c5c5c5c5c);
+  w3_t[3] =                             make_u64x (0x5c5c5c5c5c5c5c5c);
 
   opad[0] = SHA512M_A;
   opad[1] = SHA512M_B;
diff --git a/OpenCL/m01760_a3-pure.cl b/OpenCL/m01760_a3-pure.cl
index 1110e21da..3a59341b4 100644
--- a/OpenCL/m01760_a3-pure.cl
+++ b/OpenCL/m01760_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha512.cl"
diff --git a/OpenCL/m01800-optimized.cl b/OpenCL/m01800-optimized.cl
index f9ec2cc02..03c55bef6 100644
--- a/OpenCL/m01800-optimized.cl
+++ b/OpenCL/m01800-optimized.cl
@@ -6,6 +6,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_hash_sha512.cl"
 #endif
diff --git a/OpenCL/m01800-pure.cl b/OpenCL/m01800-pure.cl
index 64a705d8e..f57c22243 100644
--- a/OpenCL/m01800-pure.cl
+++ b/OpenCL/m01800-pure.cl
@@ -6,6 +6,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_hash_sha512.cl"
 #endif
diff --git a/OpenCL/m02000_a0-pure.cl b/OpenCL/m02000_a0-pure.cl
index fe5c937a3..d0d9258e5 100644
--- a/OpenCL/m02000_a0-pure.cl
+++ b/OpenCL/m02000_a0-pure.cl
@@ -6,6 +6,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #endif
 
diff --git a/OpenCL/m02000_a1-pure.cl b/OpenCL/m02000_a1-pure.cl
index fe5c937a3..d0d9258e5 100644
--- a/OpenCL/m02000_a1-pure.cl
+++ b/OpenCL/m02000_a1-pure.cl
@@ -6,6 +6,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #endif
 
diff --git a/OpenCL/m02000_a3-pure.cl b/OpenCL/m02000_a3-pure.cl
index fe5c937a3..d0d9258e5 100644
--- a/OpenCL/m02000_a3-pure.cl
+++ b/OpenCL/m02000_a3-pure.cl
@@ -6,6 +6,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #endif
 
diff --git a/OpenCL/m02100-pure.cl b/OpenCL/m02100-pure.cl
index cde6b322f..0d9e0e756 100644
--- a/OpenCL/m02100-pure.cl
+++ b/OpenCL/m02100-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md4.cl"
diff --git a/OpenCL/m02400_a0-optimized.cl b/OpenCL/m02400_a0-optimized.cl
index 471e7ada1..b030782b7 100644
--- a/OpenCL/m02400_a0-optimized.cl
+++ b/OpenCL/m02400_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
diff --git a/OpenCL/m02400_a1-optimized.cl b/OpenCL/m02400_a1-optimized.cl
index 286efdaba..013184be7 100644
--- a/OpenCL/m02400_a1-optimized.cl
+++ b/OpenCL/m02400_a1-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
diff --git a/OpenCL/m02400_a3-optimized.cl b/OpenCL/m02400_a3-optimized.cl
index 66fa42907..2db8f1e56 100644
--- a/OpenCL/m02400_a3-optimized.cl
+++ b/OpenCL/m02400_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
diff --git a/OpenCL/m02410_a0-optimized.cl b/OpenCL/m02410_a0-optimized.cl
index e5f277680..d6254dcc0 100644
--- a/OpenCL/m02410_a0-optimized.cl
+++ b/OpenCL/m02410_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
diff --git a/OpenCL/m02410_a1-optimized.cl b/OpenCL/m02410_a1-optimized.cl
index 753461bef..594aaaf8b 100644
--- a/OpenCL/m02410_a1-optimized.cl
+++ b/OpenCL/m02410_a1-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
diff --git a/OpenCL/m02410_a3-optimized.cl b/OpenCL/m02410_a3-optimized.cl
index 76b7eb086..cfa3ae2db 100644
--- a/OpenCL/m02410_a3-optimized.cl
+++ b/OpenCL/m02410_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
diff --git a/OpenCL/m02500-pure.cl b/OpenCL/m02500-pure.cl
index a4ebce4d5..95f97fb81 100644
--- a/OpenCL/m02500-pure.cl
+++ b/OpenCL/m02500-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
@@ -17,6 +18,7 @@
 #else
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.h"
 #include "inc_common.h"
 #include "inc_simd.h"
 #include "inc_hash_md5.h"
@@ -664,11 +666,11 @@ KERNEL_FQ void m02500_aux3 (KERN_ATTR_TMPS_ESALT (wpa_pbkdf2_tmp_t, wpa_eapol_t)
   const u64 lid = get_local_id (0);
   const u64 lsz = get_local_size (0);
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -679,7 +681,11 @@ KERNEL_FQ void m02500_aux3 (KERN_ATTR_TMPS_ESALT (wpa_pbkdf2_tmp_t, wpa_eapol_t)
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  #ifdef IS_CUDA
+  __syncthreads();
+  #else
+  SYNC_THREADS ();
+  #endif
 
   #else
 
diff --git a/OpenCL/m02501-pure.cl b/OpenCL/m02501-pure.cl
index 6da60615e..43b035e10 100644
--- a/OpenCL/m02501-pure.cl
+++ b/OpenCL/m02501-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
@@ -17,6 +18,7 @@
 #else
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.h"
 #include "inc_common.h"
 #include "inc_simd.h"
 #include "inc_hash_md5.h"
@@ -534,11 +536,11 @@ KERNEL_FQ void m02501_aux3 (KERN_ATTR_TMPS_ESALT (wpa_pmk_tmp_t, wpa_eapol_t))
   const u64 lid = get_local_id (0);
   const u64 lsz = get_local_size (0);
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -549,7 +551,7 @@ KERNEL_FQ void m02501_aux3 (KERN_ATTR_TMPS_ESALT (wpa_pmk_tmp_t, wpa_eapol_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m02610_a0-optimized.cl b/OpenCL/m02610_a0-optimized.cl
index df8899488..f0b9b8abb 100644
--- a/OpenCL/m02610_a0-optimized.cl
+++ b/OpenCL/m02610_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
@@ -16,15 +17,15 @@
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m02610_m04 (KERN_ATTR_RULES ())
@@ -41,7 +42,7 @@ KERNEL_FQ void m02610_m04 (KERN_ATTR_RULES ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -52,7 +53,7 @@ KERNEL_FQ void m02610_m04 (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -325,7 +326,7 @@ KERNEL_FQ void m02610_s04 (KERN_ATTR_RULES ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -336,7 +337,7 @@ KERNEL_FQ void m02610_s04 (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m02610_a0-pure.cl b/OpenCL/m02610_a0-pure.cl
index 5c209f995..e8c750166 100644
--- a/OpenCL/m02610_a0-pure.cl
+++ b/OpenCL/m02610_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
@@ -16,15 +17,15 @@
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m02610_mxx (KERN_ATTR_RULES ())
@@ -41,7 +42,7 @@ KERNEL_FQ void m02610_mxx (KERN_ATTR_RULES ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -52,7 +53,7 @@ KERNEL_FQ void m02610_mxx (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -144,7 +145,7 @@ KERNEL_FQ void m02610_sxx (KERN_ATTR_RULES ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -155,7 +156,7 @@ KERNEL_FQ void m02610_sxx (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m02610_a1-optimized.cl b/OpenCL/m02610_a1-optimized.cl
index ad323f53e..a32220ae6 100644
--- a/OpenCL/m02610_a1-optimized.cl
+++ b/OpenCL/m02610_a1-optimized.cl
@@ -8,21 +8,22 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m02610_m04 (KERN_ATTR_BASIC ())
@@ -39,7 +40,7 @@ KERNEL_FQ void m02610_m04 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -50,7 +51,7 @@ KERNEL_FQ void m02610_m04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -382,7 +383,7 @@ KERNEL_FQ void m02610_s04 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -393,7 +394,7 @@ KERNEL_FQ void m02610_s04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m02610_a1-pure.cl b/OpenCL/m02610_a1-pure.cl
index 46f3808bb..8c03417cf 100644
--- a/OpenCL/m02610_a1-pure.cl
+++ b/OpenCL/m02610_a1-pure.cl
@@ -8,21 +8,22 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_md5.cl"
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m02610_mxx (KERN_ATTR_BASIC ())
@@ -39,7 +40,7 @@ KERNEL_FQ void m02610_mxx (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -50,7 +51,7 @@ KERNEL_FQ void m02610_mxx (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -140,7 +141,7 @@ KERNEL_FQ void m02610_sxx (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -151,7 +152,7 @@ KERNEL_FQ void m02610_sxx (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m02610_a3-optimized.cl b/OpenCL/m02610_a3-optimized.cl
index 73af945f8..4a5099318 100644
--- a/OpenCL/m02610_a3-optimized.cl
+++ b/OpenCL/m02610_a3-optimized.cl
@@ -8,21 +8,22 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 DECLSPEC void m02610m (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 pw_len, KERN_ATTR_BASIC (), LOCAL_AS u32 *l_bin2asc)
@@ -605,7 +606,7 @@ KERNEL_FQ void m02610_m04 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -616,7 +617,7 @@ KERNEL_FQ void m02610_m04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -675,7 +676,7 @@ KERNEL_FQ void m02610_m08 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -686,7 +687,7 @@ KERNEL_FQ void m02610_m08 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -745,7 +746,7 @@ KERNEL_FQ void m02610_m16 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -756,7 +757,7 @@ KERNEL_FQ void m02610_m16 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -815,7 +816,7 @@ KERNEL_FQ void m02610_s04 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -826,7 +827,7 @@ KERNEL_FQ void m02610_s04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -885,7 +886,7 @@ KERNEL_FQ void m02610_s08 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -896,7 +897,7 @@ KERNEL_FQ void m02610_s08 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -955,7 +956,7 @@ KERNEL_FQ void m02610_s16 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -966,7 +967,7 @@ KERNEL_FQ void m02610_s16 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m02610_a3-pure.cl b/OpenCL/m02610_a3-pure.cl
index 842897b54..29b0d157a 100644
--- a/OpenCL/m02610_a3-pure.cl
+++ b/OpenCL/m02610_a3-pure.cl
@@ -8,21 +8,22 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m02610_mxx (KERN_ATTR_VECTOR ())
@@ -39,7 +40,7 @@ KERNEL_FQ void m02610_mxx (KERN_ATTR_VECTOR ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -50,7 +51,7 @@ KERNEL_FQ void m02610_mxx (KERN_ATTR_VECTOR ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -153,7 +154,7 @@ KERNEL_FQ void m02610_sxx (KERN_ATTR_VECTOR ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -164,7 +165,7 @@ KERNEL_FQ void m02610_sxx (KERN_ATTR_VECTOR ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m02710_a0-optimized.cl b/OpenCL/m02710_a0-optimized.cl
index cfe83ae2d..54e63878e 100644
--- a/OpenCL/m02710_a0-optimized.cl
+++ b/OpenCL/m02710_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
@@ -16,15 +17,15 @@
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m02710_m04 (KERN_ATTR_RULES ())
@@ -41,7 +42,7 @@ KERNEL_FQ void m02710_m04 (KERN_ATTR_RULES ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -52,7 +53,7 @@ KERNEL_FQ void m02710_m04 (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -410,7 +411,7 @@ KERNEL_FQ void m02710_s04 (KERN_ATTR_RULES ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -421,7 +422,7 @@ KERNEL_FQ void m02710_s04 (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m02710_a1-optimized.cl b/OpenCL/m02710_a1-optimized.cl
index 89927efc9..413d5c8e9 100644
--- a/OpenCL/m02710_a1-optimized.cl
+++ b/OpenCL/m02710_a1-optimized.cl
@@ -8,21 +8,22 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m02710_m04 (KERN_ATTR_BASIC ())
@@ -39,7 +40,7 @@ KERNEL_FQ void m02710_m04 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -50,7 +51,7 @@ KERNEL_FQ void m02710_m04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -467,7 +468,7 @@ KERNEL_FQ void m02710_s04 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -478,7 +479,7 @@ KERNEL_FQ void m02710_s04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m02710_a3-optimized.cl b/OpenCL/m02710_a3-optimized.cl
index a02f67260..8c3528c5b 100644
--- a/OpenCL/m02710_a3-optimized.cl
+++ b/OpenCL/m02710_a3-optimized.cl
@@ -8,21 +8,22 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 DECLSPEC void m02710m (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 pw_len, KERN_ATTR_BASIC (), LOCAL_AS u32 *l_bin2asc)
@@ -774,7 +775,7 @@ KERNEL_FQ void m02710_m04 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -785,7 +786,7 @@ KERNEL_FQ void m02710_m04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -844,7 +845,7 @@ KERNEL_FQ void m02710_m08 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -855,7 +856,7 @@ KERNEL_FQ void m02710_m08 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -914,7 +915,7 @@ KERNEL_FQ void m02710_m16 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -925,7 +926,7 @@ KERNEL_FQ void m02710_m16 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -984,7 +985,7 @@ KERNEL_FQ void m02710_s04 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -995,7 +996,7 @@ KERNEL_FQ void m02710_s04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -1054,7 +1055,7 @@ KERNEL_FQ void m02710_s08 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -1065,7 +1066,7 @@ KERNEL_FQ void m02710_s08 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -1124,7 +1125,7 @@ KERNEL_FQ void m02710_s16 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -1135,7 +1136,7 @@ KERNEL_FQ void m02710_s16 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m02810_a0-optimized.cl b/OpenCL/m02810_a0-optimized.cl
index 25757ed87..577d620d6 100644
--- a/OpenCL/m02810_a0-optimized.cl
+++ b/OpenCL/m02810_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
@@ -16,15 +17,15 @@
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m02810_m04 (KERN_ATTR_RULES ())
@@ -41,7 +42,7 @@ KERNEL_FQ void m02810_m04 (KERN_ATTR_RULES ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -52,7 +53,7 @@ KERNEL_FQ void m02810_m04 (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -409,7 +410,7 @@ KERNEL_FQ void m02810_s04 (KERN_ATTR_RULES ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -420,7 +421,7 @@ KERNEL_FQ void m02810_s04 (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m02810_a0-pure.cl b/OpenCL/m02810_a0-pure.cl
index 5e22e1b3b..bec93ce7f 100644
--- a/OpenCL/m02810_a0-pure.cl
+++ b/OpenCL/m02810_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
@@ -16,15 +17,15 @@
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m02810_mxx (KERN_ATTR_RULES ())
@@ -41,7 +42,7 @@ KERNEL_FQ void m02810_mxx (KERN_ATTR_RULES ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -52,7 +53,7 @@ KERNEL_FQ void m02810_mxx (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -167,7 +168,7 @@ KERNEL_FQ void m02810_sxx (KERN_ATTR_RULES ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -178,7 +179,7 @@ KERNEL_FQ void m02810_sxx (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m02810_a1-optimized.cl b/OpenCL/m02810_a1-optimized.cl
index 081e75270..ac48acf95 100644
--- a/OpenCL/m02810_a1-optimized.cl
+++ b/OpenCL/m02810_a1-optimized.cl
@@ -8,21 +8,22 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m02810_m04 (KERN_ATTR_BASIC ())
@@ -39,7 +40,7 @@ KERNEL_FQ void m02810_m04 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -50,7 +51,7 @@ KERNEL_FQ void m02810_m04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -466,7 +467,7 @@ KERNEL_FQ void m02810_s04 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -477,7 +478,7 @@ KERNEL_FQ void m02810_s04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m02810_a1-pure.cl b/OpenCL/m02810_a1-pure.cl
index 187fe0dfc..b6dbd3f1e 100644
--- a/OpenCL/m02810_a1-pure.cl
+++ b/OpenCL/m02810_a1-pure.cl
@@ -8,21 +8,22 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_md5.cl"
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m02810_mxx (KERN_ATTR_BASIC ())
@@ -39,7 +40,7 @@ KERNEL_FQ void m02810_mxx (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -50,7 +51,7 @@ KERNEL_FQ void m02810_mxx (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -163,7 +164,7 @@ KERNEL_FQ void m02810_sxx (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -174,7 +175,7 @@ KERNEL_FQ void m02810_sxx (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m02810_a3-optimized.cl b/OpenCL/m02810_a3-optimized.cl
index 1f80d06f3..94df6ea7f 100644
--- a/OpenCL/m02810_a3-optimized.cl
+++ b/OpenCL/m02810_a3-optimized.cl
@@ -8,21 +8,22 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 DECLSPEC void m02810m (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 pw_len, KERN_ATTR_BASIC (), LOCAL_AS u32 *l_bin2asc)
@@ -772,7 +773,7 @@ KERNEL_FQ void m02810_m04 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -783,7 +784,7 @@ KERNEL_FQ void m02810_m04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -842,7 +843,7 @@ KERNEL_FQ void m02810_m08 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -853,7 +854,7 @@ KERNEL_FQ void m02810_m08 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -912,7 +913,7 @@ KERNEL_FQ void m02810_m16 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -923,7 +924,7 @@ KERNEL_FQ void m02810_m16 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -982,7 +983,7 @@ KERNEL_FQ void m02810_s04 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -993,7 +994,7 @@ KERNEL_FQ void m02810_s04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -1052,7 +1053,7 @@ KERNEL_FQ void m02810_s08 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -1063,7 +1064,7 @@ KERNEL_FQ void m02810_s08 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -1122,7 +1123,7 @@ KERNEL_FQ void m02810_s16 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -1133,7 +1134,7 @@ KERNEL_FQ void m02810_s16 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m02810_a3-pure.cl b/OpenCL/m02810_a3-pure.cl
index 0af147c6b..f6e4185fd 100644
--- a/OpenCL/m02810_a3-pure.cl
+++ b/OpenCL/m02810_a3-pure.cl
@@ -8,21 +8,22 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m02810_mxx (KERN_ATTR_VECTOR ())
@@ -39,7 +40,7 @@ KERNEL_FQ void m02810_mxx (KERN_ATTR_VECTOR ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -50,7 +51,7 @@ KERNEL_FQ void m02810_mxx (KERN_ATTR_VECTOR ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -176,7 +177,7 @@ KERNEL_FQ void m02810_sxx (KERN_ATTR_VECTOR ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -187,7 +188,7 @@ KERNEL_FQ void m02810_sxx (KERN_ATTR_VECTOR ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m03000_a0-pure.cl b/OpenCL/m03000_a0-pure.cl
index 531d294e9..f1652e046 100644
--- a/OpenCL/m03000_a0-pure.cl
+++ b/OpenCL/m03000_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
@@ -34,7 +35,7 @@
   a  = a ^ tt;              \
 }
 
-CONSTANT_AS u32a c_SPtrans[8][64] =
+CONSTANT_VK u32a c_SPtrans[8][64] =
 {
   {
     0x02080800, 0x00080000, 0x02000002, 0x02080802,
@@ -182,7 +183,7 @@ CONSTANT_AS u32a c_SPtrans[8][64] =
   }
 };
 
-CONSTANT_AS u32a c_skb[8][64] =
+CONSTANT_VK u32a c_skb[8][64] =
 {
   {
     0x00000000, 0x00000010, 0x20000000, 0x20000010,
@@ -336,13 +337,13 @@ CONSTANT_AS u32a c_skb[8][64] =
 #if   VECT_SIZE == 1
 #define BOX(i,n,S) (S)[(n)][(i)]
 #elif VECT_SIZE == 2
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1])
 #elif VECT_SIZE == 4
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3])
 #elif VECT_SIZE == 8
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7])
 #elif VECT_SIZE == 16
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf])
 #endif
 
 DECLSPEC void _des_crypt_encrypt (u32 *iv, u32x *data, u32x *Kc, u32x *Kd, LOCAL_AS u32 (*s_SPtrans)[64])
@@ -505,8 +506,8 @@ KERNEL_FQ void m03000_mxx (KERN_ATTR_RULES ())
    * sbox, kbox
    */
 
-  LOCAL_AS u32 s_SPtrans[8][64];
-  LOCAL_AS u32 s_skb[8][64];
+  LOCAL_VK u32 s_SPtrans[8][64];
+  LOCAL_VK u32 s_skb[8][64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
@@ -529,7 +530,7 @@ KERNEL_FQ void m03000_mxx (KERN_ATTR_RULES ())
     s_skb[7][i] = c_skb[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -590,8 +591,8 @@ KERNEL_FQ void m03000_sxx (KERN_ATTR_RULES ())
    * sbox, kbox
    */
 
-  LOCAL_AS u32 s_SPtrans[8][64];
-  LOCAL_AS u32 s_skb[8][64];
+  LOCAL_VK u32 s_SPtrans[8][64];
+  LOCAL_VK u32 s_skb[8][64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
@@ -614,7 +615,7 @@ KERNEL_FQ void m03000_sxx (KERN_ATTR_RULES ())
     s_skb[7][i] = c_skb[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m03000_a1-pure.cl b/OpenCL/m03000_a1-pure.cl
index 0e00a37ef..9b7820579 100644
--- a/OpenCL/m03000_a1-pure.cl
+++ b/OpenCL/m03000_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #endif
@@ -32,7 +33,7 @@
   a  = a ^ tt;              \
 }
 
-CONSTANT_AS u32a c_SPtrans[8][64] =
+CONSTANT_VK u32a c_SPtrans[8][64] =
 {
   {
     0x02080800, 0x00080000, 0x02000002, 0x02080802,
@@ -180,7 +181,7 @@ CONSTANT_AS u32a c_SPtrans[8][64] =
   }
 };
 
-CONSTANT_AS u32a c_skb[8][64] =
+CONSTANT_VK u32a c_skb[8][64] =
 {
   {
     0x00000000, 0x00000010, 0x20000000, 0x20000010,
@@ -334,13 +335,13 @@ CONSTANT_AS u32a c_skb[8][64] =
 #if   VECT_SIZE == 1
 #define BOX(i,n,S) (S)[(n)][(i)]
 #elif VECT_SIZE == 2
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1])
 #elif VECT_SIZE == 4
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3])
 #elif VECT_SIZE == 8
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7])
 #elif VECT_SIZE == 16
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf])
 #endif
 
 DECLSPEC void _des_crypt_encrypt (u32 *iv, u32x *data, u32x *Kc, u32x *Kd, LOCAL_AS u32 (*s_SPtrans)[64])
@@ -503,8 +504,8 @@ KERNEL_FQ void m03000_mxx (KERN_ATTR_BASIC ())
    * sbox, kbox
    */
 
-  LOCAL_AS u32 s_SPtrans[8][64];
-  LOCAL_AS u32 s_skb[8][64];
+  LOCAL_VK u32 s_SPtrans[8][64];
+  LOCAL_VK u32 s_skb[8][64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
@@ -527,7 +528,7 @@ KERNEL_FQ void m03000_mxx (KERN_ATTR_BASIC ())
     s_skb[7][i] = c_skb[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -667,8 +668,8 @@ KERNEL_FQ void m03000_sxx (KERN_ATTR_BASIC ())
    * sbox, kbox
    */
 
-  LOCAL_AS u32 s_SPtrans[8][64];
-  LOCAL_AS u32 s_skb[8][64];
+  LOCAL_VK u32 s_SPtrans[8][64];
+  LOCAL_VK u32 s_skb[8][64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
@@ -691,7 +692,7 @@ KERNEL_FQ void m03000_sxx (KERN_ATTR_BASIC ())
     s_skb[7][i] = c_skb[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m03000_a3-pure.cl b/OpenCL/m03000_a3-pure.cl
index 81d25664a..682edabf4 100644
--- a/OpenCL/m03000_a3-pure.cl
+++ b/OpenCL/m03000_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #endif
 
@@ -1730,7 +1731,7 @@ DECLSPEC void transpose32c (u32 *data)
 // transpose bitslice mod  : attention race conditions, need different buffers for *in and *out
 //
 
-KERNEL_FQ void m03000_tm (GLOBAL_AS u32 *mod, GLOBAL_AS bs_word_t *words_buf_r)
+KERNEL_FQ void m03000_tm (GLOBAL_AS u32 *mod, GLOBAL_AS bs_word_t *words_buf_b)
 {
   const u64 gid = get_global_id (0);
 
@@ -1741,14 +1742,14 @@ KERNEL_FQ void m03000_tm (GLOBAL_AS u32 *mod, GLOBAL_AS bs_word_t *words_buf_r)
 
   for (int i = 0; i < 32; i += 8)
   {
-    atomic_or (&words_buf_r[block].b[i + 0], (((w0 >> (i + 7)) & 1) << slice));
-    atomic_or (&words_buf_r[block].b[i + 1], (((w0 >> (i + 6)) & 1) << slice));
-    atomic_or (&words_buf_r[block].b[i + 2], (((w0 >> (i + 5)) & 1) << slice));
-    atomic_or (&words_buf_r[block].b[i + 3], (((w0 >> (i + 4)) & 1) << slice));
-    atomic_or (&words_buf_r[block].b[i + 4], (((w0 >> (i + 3)) & 1) << slice));
-    atomic_or (&words_buf_r[block].b[i + 5], (((w0 >> (i + 2)) & 1) << slice));
-    atomic_or (&words_buf_r[block].b[i + 6], (((w0 >> (i + 1)) & 1) << slice));
-    atomic_or (&words_buf_r[block].b[i + 7], (((w0 >> (i + 0)) & 1) << slice));
+    atomic_or (&words_buf_b[block].b[i + 0], (((w0 >> (i + 7)) & 1) << slice));
+    atomic_or (&words_buf_b[block].b[i + 1], (((w0 >> (i + 6)) & 1) << slice));
+    atomic_or (&words_buf_b[block].b[i + 2], (((w0 >> (i + 5)) & 1) << slice));
+    atomic_or (&words_buf_b[block].b[i + 3], (((w0 >> (i + 4)) & 1) << slice));
+    atomic_or (&words_buf_b[block].b[i + 4], (((w0 >> (i + 3)) & 1) << slice));
+    atomic_or (&words_buf_b[block].b[i + 5], (((w0 >> (i + 2)) & 1) << slice));
+    atomic_or (&words_buf_b[block].b[i + 6], (((w0 >> (i + 1)) & 1) << slice));
+    atomic_or (&words_buf_b[block].b[i + 7], (((w0 >> (i + 0)) & 1) << slice));
   }
 }
 
@@ -1829,7 +1830,11 @@ KERNEL_FQ void m03000_mxx (KERN_ATTR_BITSLICE ())
    * inner loop
    */
 
+  #ifdef IS_CUDA
+  const u32 pc_pos = (blockIdx.y * blockDim.y) + threadIdx.y;
+  #else
   const u32 pc_pos = get_global_id (1);
+  #endif
 
   const u32 il_pos = pc_pos * 32;
 
@@ -1866,38 +1871,38 @@ KERNEL_FQ void m03000_mxx (KERN_ATTR_BITSLICE ())
   u32 k30 = K30;
   u32 k31 = K31;
 
-  k00 |= words_buf_r[pc_pos].b[ 0];
-  k01 |= words_buf_r[pc_pos].b[ 1];
-  k02 |= words_buf_r[pc_pos].b[ 2];
-  k03 |= words_buf_r[pc_pos].b[ 3];
-  k04 |= words_buf_r[pc_pos].b[ 4];
-  k05 |= words_buf_r[pc_pos].b[ 5];
-  k06 |= words_buf_r[pc_pos].b[ 6];
-  k07 |= words_buf_r[pc_pos].b[ 7];
-  k08 |= words_buf_r[pc_pos].b[ 8];
-  k09 |= words_buf_r[pc_pos].b[ 9];
-  k10 |= words_buf_r[pc_pos].b[10];
-  k11 |= words_buf_r[pc_pos].b[11];
-  k12 |= words_buf_r[pc_pos].b[12];
-  k13 |= words_buf_r[pc_pos].b[13];
-  k14 |= words_buf_r[pc_pos].b[14];
-  k15 |= words_buf_r[pc_pos].b[15];
-  k16 |= words_buf_r[pc_pos].b[16];
-  k17 |= words_buf_r[pc_pos].b[17];
-  k18 |= words_buf_r[pc_pos].b[18];
-  k19 |= words_buf_r[pc_pos].b[19];
-  k20 |= words_buf_r[pc_pos].b[20];
-  k21 |= words_buf_r[pc_pos].b[21];
-  k22 |= words_buf_r[pc_pos].b[22];
-  k23 |= words_buf_r[pc_pos].b[23];
-  k24 |= words_buf_r[pc_pos].b[24];
-  k25 |= words_buf_r[pc_pos].b[25];
-  k26 |= words_buf_r[pc_pos].b[26];
-  k27 |= words_buf_r[pc_pos].b[27];
-  k28 |= words_buf_r[pc_pos].b[28];
-  k29 |= words_buf_r[pc_pos].b[29];
-  k30 |= words_buf_r[pc_pos].b[30];
-  k31 |= words_buf_r[pc_pos].b[31];
+  k00 |= words_buf_s[pc_pos].b[ 0];
+  k01 |= words_buf_s[pc_pos].b[ 1];
+  k02 |= words_buf_s[pc_pos].b[ 2];
+  k03 |= words_buf_s[pc_pos].b[ 3];
+  k04 |= words_buf_s[pc_pos].b[ 4];
+  k05 |= words_buf_s[pc_pos].b[ 5];
+  k06 |= words_buf_s[pc_pos].b[ 6];
+  k07 |= words_buf_s[pc_pos].b[ 7];
+  k08 |= words_buf_s[pc_pos].b[ 8];
+  k09 |= words_buf_s[pc_pos].b[ 9];
+  k10 |= words_buf_s[pc_pos].b[10];
+  k11 |= words_buf_s[pc_pos].b[11];
+  k12 |= words_buf_s[pc_pos].b[12];
+  k13 |= words_buf_s[pc_pos].b[13];
+  k14 |= words_buf_s[pc_pos].b[14];
+  k15 |= words_buf_s[pc_pos].b[15];
+  k16 |= words_buf_s[pc_pos].b[16];
+  k17 |= words_buf_s[pc_pos].b[17];
+  k18 |= words_buf_s[pc_pos].b[18];
+  k19 |= words_buf_s[pc_pos].b[19];
+  k20 |= words_buf_s[pc_pos].b[20];
+  k21 |= words_buf_s[pc_pos].b[21];
+  k22 |= words_buf_s[pc_pos].b[22];
+  k23 |= words_buf_s[pc_pos].b[23];
+  k24 |= words_buf_s[pc_pos].b[24];
+  k25 |= words_buf_s[pc_pos].b[25];
+  k26 |= words_buf_s[pc_pos].b[26];
+  k27 |= words_buf_s[pc_pos].b[27];
+  k28 |= words_buf_s[pc_pos].b[28];
+  k29 |= words_buf_s[pc_pos].b[29];
+  k30 |= words_buf_s[pc_pos].b[30];
+  k31 |= words_buf_s[pc_pos].b[31];
 
   // KGS!@#$% including IP
 
@@ -2277,7 +2282,11 @@ KERNEL_FQ void m03000_sxx (KERN_ATTR_BITSLICE ())
    * inner loop
    */
 
+  #ifdef IS_CUDA
+  const u32 pc_pos = (blockIdx.y * blockDim.y) + threadIdx.y;
+  #else
   const u32 pc_pos = get_global_id (1);
+  #endif
 
   const u32 il_pos = pc_pos * 32;
 
@@ -2314,38 +2323,38 @@ KERNEL_FQ void m03000_sxx (KERN_ATTR_BITSLICE ())
   u32 k30 = K30;
   u32 k31 = K31;
 
-  k00 |= words_buf_r[pc_pos].b[ 0];
-  k01 |= words_buf_r[pc_pos].b[ 1];
-  k02 |= words_buf_r[pc_pos].b[ 2];
-  k03 |= words_buf_r[pc_pos].b[ 3];
-  k04 |= words_buf_r[pc_pos].b[ 4];
-  k05 |= words_buf_r[pc_pos].b[ 5];
-  k06 |= words_buf_r[pc_pos].b[ 6];
-  k07 |= words_buf_r[pc_pos].b[ 7];
-  k08 |= words_buf_r[pc_pos].b[ 8];
-  k09 |= words_buf_r[pc_pos].b[ 9];
-  k10 |= words_buf_r[pc_pos].b[10];
-  k11 |= words_buf_r[pc_pos].b[11];
-  k12 |= words_buf_r[pc_pos].b[12];
-  k13 |= words_buf_r[pc_pos].b[13];
-  k14 |= words_buf_r[pc_pos].b[14];
-  k15 |= words_buf_r[pc_pos].b[15];
-  k16 |= words_buf_r[pc_pos].b[16];
-  k17 |= words_buf_r[pc_pos].b[17];
-  k18 |= words_buf_r[pc_pos].b[18];
-  k19 |= words_buf_r[pc_pos].b[19];
-  k20 |= words_buf_r[pc_pos].b[20];
-  k21 |= words_buf_r[pc_pos].b[21];
-  k22 |= words_buf_r[pc_pos].b[22];
-  k23 |= words_buf_r[pc_pos].b[23];
-  k24 |= words_buf_r[pc_pos].b[24];
-  k25 |= words_buf_r[pc_pos].b[25];
-  k26 |= words_buf_r[pc_pos].b[26];
-  k27 |= words_buf_r[pc_pos].b[27];
-  k28 |= words_buf_r[pc_pos].b[28];
-  k29 |= words_buf_r[pc_pos].b[29];
-  k30 |= words_buf_r[pc_pos].b[30];
-  k31 |= words_buf_r[pc_pos].b[31];
+  k00 |= words_buf_s[pc_pos].b[ 0];
+  k01 |= words_buf_s[pc_pos].b[ 1];
+  k02 |= words_buf_s[pc_pos].b[ 2];
+  k03 |= words_buf_s[pc_pos].b[ 3];
+  k04 |= words_buf_s[pc_pos].b[ 4];
+  k05 |= words_buf_s[pc_pos].b[ 5];
+  k06 |= words_buf_s[pc_pos].b[ 6];
+  k07 |= words_buf_s[pc_pos].b[ 7];
+  k08 |= words_buf_s[pc_pos].b[ 8];
+  k09 |= words_buf_s[pc_pos].b[ 9];
+  k10 |= words_buf_s[pc_pos].b[10];
+  k11 |= words_buf_s[pc_pos].b[11];
+  k12 |= words_buf_s[pc_pos].b[12];
+  k13 |= words_buf_s[pc_pos].b[13];
+  k14 |= words_buf_s[pc_pos].b[14];
+  k15 |= words_buf_s[pc_pos].b[15];
+  k16 |= words_buf_s[pc_pos].b[16];
+  k17 |= words_buf_s[pc_pos].b[17];
+  k18 |= words_buf_s[pc_pos].b[18];
+  k19 |= words_buf_s[pc_pos].b[19];
+  k20 |= words_buf_s[pc_pos].b[20];
+  k21 |= words_buf_s[pc_pos].b[21];
+  k22 |= words_buf_s[pc_pos].b[22];
+  k23 |= words_buf_s[pc_pos].b[23];
+  k24 |= words_buf_s[pc_pos].b[24];
+  k25 |= words_buf_s[pc_pos].b[25];
+  k26 |= words_buf_s[pc_pos].b[26];
+  k27 |= words_buf_s[pc_pos].b[27];
+  k28 |= words_buf_s[pc_pos].b[28];
+  k29 |= words_buf_s[pc_pos].b[29];
+  k30 |= words_buf_s[pc_pos].b[30];
+  k31 |= words_buf_s[pc_pos].b[31];
 
   // KGS!@#$% including IP
 
diff --git a/OpenCL/m03100_a0-optimized.cl b/OpenCL/m03100_a0-optimized.cl
index 426594c63..524e37174 100644
--- a/OpenCL/m03100_a0-optimized.cl
+++ b/OpenCL/m03100_a0-optimized.cl
@@ -9,6 +9,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
@@ -32,8 +33,8 @@ KERNEL_FQ void m03100_m04 (KERN_ATTR_RULES ())
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_SPtrans[8][64];
-  LOCAL_AS u32 s_skb[8][64];
+  LOCAL_VK u32 s_SPtrans[8][64];
+  LOCAL_VK u32 s_skb[8][64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
@@ -56,7 +57,7 @@ KERNEL_FQ void m03100_m04 (KERN_ATTR_RULES ())
     s_skb[7][i] = c_skb[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -264,8 +265,8 @@ KERNEL_FQ void m03100_s04 (KERN_ATTR_RULES ())
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_SPtrans[8][64];
-  LOCAL_AS u32 s_skb[8][64];
+  LOCAL_VK u32 s_SPtrans[8][64];
+  LOCAL_VK u32 s_skb[8][64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
@@ -288,7 +289,7 @@ KERNEL_FQ void m03100_s04 (KERN_ATTR_RULES ())
     s_skb[7][i] = c_skb[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m03100_a1-optimized.cl b/OpenCL/m03100_a1-optimized.cl
index d565e1530..69d4162bf 100644
--- a/OpenCL/m03100_a1-optimized.cl
+++ b/OpenCL/m03100_a1-optimized.cl
@@ -9,6 +9,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_cipher_des.cl"
@@ -30,8 +31,8 @@ KERNEL_FQ void m03100_m04 (KERN_ATTR_BASIC ())
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_SPtrans[8][64];
-  LOCAL_AS u32 s_skb[8][64];
+  LOCAL_VK u32 s_SPtrans[8][64];
+  LOCAL_VK u32 s_skb[8][64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
@@ -54,7 +55,7 @@ KERNEL_FQ void m03100_m04 (KERN_ATTR_BASIC ())
     s_skb[7][i] = c_skb[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -322,8 +323,8 @@ KERNEL_FQ void m03100_s04 (KERN_ATTR_BASIC ())
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_SPtrans[8][64];
-  LOCAL_AS u32 s_skb[8][64];
+  LOCAL_VK u32 s_SPtrans[8][64];
+  LOCAL_VK u32 s_skb[8][64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
@@ -346,7 +347,7 @@ KERNEL_FQ void m03100_s04 (KERN_ATTR_BASIC ())
     s_skb[7][i] = c_skb[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m03100_a3-optimized.cl b/OpenCL/m03100_a3-optimized.cl
index 41fe2f1f5..b576e67b3 100644
--- a/OpenCL/m03100_a3-optimized.cl
+++ b/OpenCL/m03100_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_cipher_des.cl"
@@ -428,8 +429,8 @@ KERNEL_FQ void m03100_m04 (KERN_ATTR_VECTOR ())
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_SPtrans[8][64];
-  LOCAL_AS u32 s_skb[8][64];
+  LOCAL_VK u32 s_SPtrans[8][64];
+  LOCAL_VK u32 s_skb[8][64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
@@ -452,7 +453,7 @@ KERNEL_FQ void m03100_m04 (KERN_ATTR_VECTOR ())
     s_skb[7][i] = c_skb[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -511,8 +512,8 @@ KERNEL_FQ void m03100_m08 (KERN_ATTR_VECTOR ())
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_SPtrans[8][64];
-  LOCAL_AS u32 s_skb[8][64];
+  LOCAL_VK u32 s_SPtrans[8][64];
+  LOCAL_VK u32 s_skb[8][64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
@@ -535,7 +536,7 @@ KERNEL_FQ void m03100_m08 (KERN_ATTR_VECTOR ())
     s_skb[7][i] = c_skb[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -598,8 +599,8 @@ KERNEL_FQ void m03100_s04 (KERN_ATTR_VECTOR ())
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_SPtrans[8][64];
-  LOCAL_AS u32 s_skb[8][64];
+  LOCAL_VK u32 s_SPtrans[8][64];
+  LOCAL_VK u32 s_skb[8][64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
@@ -622,7 +623,7 @@ KERNEL_FQ void m03100_s04 (KERN_ATTR_VECTOR ())
     s_skb[7][i] = c_skb[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -681,8 +682,8 @@ KERNEL_FQ void m03100_s08 (KERN_ATTR_VECTOR ())
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_SPtrans[8][64];
-  LOCAL_AS u32 s_skb[8][64];
+  LOCAL_VK u32 s_SPtrans[8][64];
+  LOCAL_VK u32 s_skb[8][64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
@@ -705,7 +706,7 @@ KERNEL_FQ void m03100_s08 (KERN_ATTR_VECTOR ())
     s_skb[7][i] = c_skb[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m03200-pure.cl b/OpenCL/m03200-pure.cl
index 332cca11c..5f701666f 100644
--- a/OpenCL/m03200-pure.cl
+++ b/OpenCL/m03200-pure.cl
@@ -6,6 +6,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #endif
 
@@ -27,7 +28,7 @@ typedef struct bcrypt_tmp
 
 // http://www.schneier.com/code/constants.txt
 
-CONSTANT_AS u32a c_sbox0[256] =
+CONSTANT_VK u32a c_sbox0[256] =
 {
   0xd1310ba6, 0x98dfb5ac, 0x2ffd72db, 0xd01adfb7,
   0xb8e1afed, 0x6a267e96, 0xba7c9045, 0xf12c7f99,
@@ -95,7 +96,7 @@ CONSTANT_AS u32a c_sbox0[256] =
   0x53b02d5d, 0xa99f8fa1, 0x08ba4799, 0x6e85076a
 };
 
-CONSTANT_AS u32a c_sbox1[256] =
+CONSTANT_VK u32a c_sbox1[256] =
 {
   0x4b7a70e9, 0xb5b32944, 0xdb75092e, 0xc4192623,
   0xad6ea6b0, 0x49a7df7d, 0x9cee60b8, 0x8fedb266,
@@ -163,7 +164,7 @@ CONSTANT_AS u32a c_sbox1[256] =
   0x153e21e7, 0x8fb03d4a, 0xe6e39f2b, 0xdb83adf7
 };
 
-CONSTANT_AS u32a c_sbox2[256] =
+CONSTANT_VK u32a c_sbox2[256] =
 {
   0xe93d5a68, 0x948140f7, 0xf64c261c, 0x94692934,
   0x411520f7, 0x7602d4f7, 0xbcf46b2e, 0xd4a20068,
@@ -231,7 +232,7 @@ CONSTANT_AS u32a c_sbox2[256] =
   0xd79a3234, 0x92638212, 0x670efa8e, 0x406000e0
 };
 
-CONSTANT_AS u32a c_sbox3[256] =
+CONSTANT_VK u32a c_sbox3[256] =
 {
   0x3a39ce37, 0xd3faf5cf, 0xabc27737, 0x5ac52d1b,
   0x5cb0679e, 0x4fa33742, 0xd3822740, 0x99bc9bbe,
@@ -443,10 +444,10 @@ KERNEL_FQ void __attribute__((reqd_work_group_size(FIXED_LOCAL_SIZE, 1, 1))) m03
    * do the key setup
    */
 
-  LOCAL_AS u32 S0_all[FIXED_LOCAL_SIZE][256];
-  LOCAL_AS u32 S1_all[FIXED_LOCAL_SIZE][256];
-  LOCAL_AS u32 S2_all[FIXED_LOCAL_SIZE][256];
-  LOCAL_AS u32 S3_all[FIXED_LOCAL_SIZE][256];
+  LOCAL_VK u32 S0_all[FIXED_LOCAL_SIZE][256];
+  LOCAL_VK u32 S1_all[FIXED_LOCAL_SIZE][256];
+  LOCAL_VK u32 S2_all[FIXED_LOCAL_SIZE][256];
+  LOCAL_VK u32 S3_all[FIXED_LOCAL_SIZE][256];
 
   LOCAL_AS u32 *S0 = S0_all[lid];
   LOCAL_AS u32 *S1 = S1_all[lid];
@@ -612,10 +613,10 @@ KERNEL_FQ void __attribute__((reqd_work_group_size(FIXED_LOCAL_SIZE, 1, 1))) m03
     P[i] = tmps[gid].P[i];
   }
 
-  LOCAL_AS u32 S0_all[FIXED_LOCAL_SIZE][256];
-  LOCAL_AS u32 S1_all[FIXED_LOCAL_SIZE][256];
-  LOCAL_AS u32 S2_all[FIXED_LOCAL_SIZE][256];
-  LOCAL_AS u32 S3_all[FIXED_LOCAL_SIZE][256];
+  LOCAL_VK u32 S0_all[FIXED_LOCAL_SIZE][256];
+  LOCAL_VK u32 S1_all[FIXED_LOCAL_SIZE][256];
+  LOCAL_VK u32 S2_all[FIXED_LOCAL_SIZE][256];
+  LOCAL_VK u32 S3_all[FIXED_LOCAL_SIZE][256];
 
   LOCAL_AS u32 *S0 = S0_all[lid];
   LOCAL_AS u32 *S1 = S1_all[lid];
@@ -797,10 +798,10 @@ KERNEL_FQ void __attribute__((reqd_work_group_size(FIXED_LOCAL_SIZE, 1, 1))) m03
     P[i] = tmps[gid].P[i];
   }
 
-  LOCAL_AS u32 S0_all[FIXED_LOCAL_SIZE][256];
-  LOCAL_AS u32 S1_all[FIXED_LOCAL_SIZE][256];
-  LOCAL_AS u32 S2_all[FIXED_LOCAL_SIZE][256];
-  LOCAL_AS u32 S3_all[FIXED_LOCAL_SIZE][256];
+  LOCAL_VK u32 S0_all[FIXED_LOCAL_SIZE][256];
+  LOCAL_VK u32 S1_all[FIXED_LOCAL_SIZE][256];
+  LOCAL_VK u32 S2_all[FIXED_LOCAL_SIZE][256];
+  LOCAL_VK u32 S3_all[FIXED_LOCAL_SIZE][256];
 
   LOCAL_AS u32 *S0 = S0_all[lid];
   LOCAL_AS u32 *S1 = S1_all[lid];
diff --git a/OpenCL/m03710_a0-optimized.cl b/OpenCL/m03710_a0-optimized.cl
index 13f534ec4..74ae66a6e 100644
--- a/OpenCL/m03710_a0-optimized.cl
+++ b/OpenCL/m03710_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
@@ -16,15 +17,15 @@
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m03710_m04 (KERN_ATTR_RULES ())
@@ -41,7 +42,7 @@ KERNEL_FQ void m03710_m04 (KERN_ATTR_RULES ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -52,7 +53,7 @@ KERNEL_FQ void m03710_m04 (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -356,7 +357,7 @@ KERNEL_FQ void m03710_s04 (KERN_ATTR_RULES ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -367,7 +368,7 @@ KERNEL_FQ void m03710_s04 (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m03710_a0-pure.cl b/OpenCL/m03710_a0-pure.cl
index 600e2bfbd..a8712e610 100644
--- a/OpenCL/m03710_a0-pure.cl
+++ b/OpenCL/m03710_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
@@ -16,15 +17,15 @@
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m03710_mxx (KERN_ATTR_RULES ())
@@ -41,7 +42,7 @@ KERNEL_FQ void m03710_mxx (KERN_ATTR_RULES ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -52,7 +53,7 @@ KERNEL_FQ void m03710_mxx (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -157,7 +158,7 @@ KERNEL_FQ void m03710_sxx (KERN_ATTR_RULES ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -168,7 +169,7 @@ KERNEL_FQ void m03710_sxx (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m03710_a1-optimized.cl b/OpenCL/m03710_a1-optimized.cl
index 6a09adc8d..633de2cc2 100644
--- a/OpenCL/m03710_a1-optimized.cl
+++ b/OpenCL/m03710_a1-optimized.cl
@@ -8,21 +8,22 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m03710_m04 (KERN_ATTR_BASIC ())
@@ -39,7 +40,7 @@ KERNEL_FQ void m03710_m04 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -50,7 +51,7 @@ KERNEL_FQ void m03710_m04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -413,7 +414,7 @@ KERNEL_FQ void m03710_s04 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -424,7 +425,7 @@ KERNEL_FQ void m03710_s04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m03710_a1-pure.cl b/OpenCL/m03710_a1-pure.cl
index 334a09bdb..c77e07919 100644
--- a/OpenCL/m03710_a1-pure.cl
+++ b/OpenCL/m03710_a1-pure.cl
@@ -8,21 +8,22 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_md5.cl"
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m03710_mxx (KERN_ATTR_BASIC ())
@@ -39,7 +40,7 @@ KERNEL_FQ void m03710_mxx (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -50,7 +51,7 @@ KERNEL_FQ void m03710_mxx (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -153,7 +154,7 @@ KERNEL_FQ void m03710_sxx (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -164,7 +165,7 @@ KERNEL_FQ void m03710_sxx (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m03710_a3-optimized.cl b/OpenCL/m03710_a3-optimized.cl
index 6feff2427..27198dba5 100644
--- a/OpenCL/m03710_a3-optimized.cl
+++ b/OpenCL/m03710_a3-optimized.cl
@@ -8,21 +8,22 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 DECLSPEC void m03710m (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 pw_len, KERN_ATTR_BASIC (), LOCAL_AS u32 *l_bin2asc)
@@ -632,7 +633,7 @@ KERNEL_FQ void m03710_m04 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -643,7 +644,7 @@ KERNEL_FQ void m03710_m04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -736,7 +737,7 @@ KERNEL_FQ void m03710_m08 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -747,7 +748,7 @@ KERNEL_FQ void m03710_m08 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -806,7 +807,7 @@ KERNEL_FQ void m03710_m16 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -817,7 +818,7 @@ KERNEL_FQ void m03710_m16 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -876,7 +877,7 @@ KERNEL_FQ void m03710_s04 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -887,7 +888,7 @@ KERNEL_FQ void m03710_s04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -946,7 +947,7 @@ KERNEL_FQ void m03710_s08 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -957,7 +958,7 @@ KERNEL_FQ void m03710_s08 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -982,7 +983,7 @@ KERNEL_FQ void m03710_s16 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -993,7 +994,7 @@ KERNEL_FQ void m03710_s16 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m03710_a3-pure.cl b/OpenCL/m03710_a3-pure.cl
index b9c61f98f..153265207 100644
--- a/OpenCL/m03710_a3-pure.cl
+++ b/OpenCL/m03710_a3-pure.cl
@@ -8,21 +8,22 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m03710_mxx (KERN_ATTR_VECTOR ())
@@ -39,7 +40,7 @@ KERNEL_FQ void m03710_mxx (KERN_ATTR_VECTOR ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -50,7 +51,7 @@ KERNEL_FQ void m03710_mxx (KERN_ATTR_VECTOR ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -166,7 +167,7 @@ KERNEL_FQ void m03710_sxx (KERN_ATTR_VECTOR ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -177,7 +178,7 @@ KERNEL_FQ void m03710_sxx (KERN_ATTR_VECTOR ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m03800_a0-optimized.cl b/OpenCL/m03800_a0-optimized.cl
index b14fd1569..12cfd7139 100644
--- a/OpenCL/m03800_a0-optimized.cl
+++ b/OpenCL/m03800_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
diff --git a/OpenCL/m03800_a0-pure.cl b/OpenCL/m03800_a0-pure.cl
index 4bcc387b9..3fa962f59 100644
--- a/OpenCL/m03800_a0-pure.cl
+++ b/OpenCL/m03800_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
diff --git a/OpenCL/m03800_a1-optimized.cl b/OpenCL/m03800_a1-optimized.cl
index 274f3e35d..28d317f5f 100644
--- a/OpenCL/m03800_a1-optimized.cl
+++ b/OpenCL/m03800_a1-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
diff --git a/OpenCL/m03800_a1-pure.cl b/OpenCL/m03800_a1-pure.cl
index e0b3fbf2d..ae1dd719b 100644
--- a/OpenCL/m03800_a1-pure.cl
+++ b/OpenCL/m03800_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_md5.cl"
diff --git a/OpenCL/m03800_a3-optimized.cl b/OpenCL/m03800_a3-optimized.cl
index 5fbcffba0..e849f703e 100644
--- a/OpenCL/m03800_a3-optimized.cl
+++ b/OpenCL/m03800_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
diff --git a/OpenCL/m03800_a3-pure.cl b/OpenCL/m03800_a3-pure.cl
index 4f8bf236b..e8189ccea 100644
--- a/OpenCL/m03800_a3-pure.cl
+++ b/OpenCL/m03800_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
diff --git a/OpenCL/m03910_a0-optimized.cl b/OpenCL/m03910_a0-optimized.cl
index 38346dea8..0b052fdc3 100644
--- a/OpenCL/m03910_a0-optimized.cl
+++ b/OpenCL/m03910_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
@@ -16,15 +17,15 @@
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m03910_m04 (KERN_ATTR_RULES ())
@@ -41,7 +42,7 @@ KERNEL_FQ void m03910_m04 (KERN_ATTR_RULES ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -52,7 +53,7 @@ KERNEL_FQ void m03910_m04 (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -409,7 +410,7 @@ KERNEL_FQ void m03910_s04 (KERN_ATTR_RULES ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -420,7 +421,7 @@ KERNEL_FQ void m03910_s04 (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m03910_a0-pure.cl b/OpenCL/m03910_a0-pure.cl
index 9cebaca44..ff8474f46 100644
--- a/OpenCL/m03910_a0-pure.cl
+++ b/OpenCL/m03910_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
@@ -16,15 +17,15 @@
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m03910_mxx (KERN_ATTR_RULES ())
@@ -41,7 +42,7 @@ KERNEL_FQ void m03910_mxx (KERN_ATTR_RULES ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -52,7 +53,7 @@ KERNEL_FQ void m03910_mxx (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -167,7 +168,7 @@ KERNEL_FQ void m03910_sxx (KERN_ATTR_RULES ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -178,7 +179,7 @@ KERNEL_FQ void m03910_sxx (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m03910_a1-optimized.cl b/OpenCL/m03910_a1-optimized.cl
index d27fbdd63..57abab584 100644
--- a/OpenCL/m03910_a1-optimized.cl
+++ b/OpenCL/m03910_a1-optimized.cl
@@ -8,21 +8,22 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m03910_m04 (KERN_ATTR_BASIC ())
@@ -39,7 +40,7 @@ KERNEL_FQ void m03910_m04 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -50,7 +51,7 @@ KERNEL_FQ void m03910_m04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -466,7 +467,7 @@ KERNEL_FQ void m03910_s04 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -477,7 +478,7 @@ KERNEL_FQ void m03910_s04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m03910_a1-pure.cl b/OpenCL/m03910_a1-pure.cl
index 3c3706a99..245041c93 100644
--- a/OpenCL/m03910_a1-pure.cl
+++ b/OpenCL/m03910_a1-pure.cl
@@ -8,21 +8,22 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_md5.cl"
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m03910_mxx (KERN_ATTR_BASIC ())
@@ -39,7 +40,7 @@ KERNEL_FQ void m03910_mxx (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -50,7 +51,7 @@ KERNEL_FQ void m03910_mxx (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -163,7 +164,7 @@ KERNEL_FQ void m03910_sxx (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -174,7 +175,7 @@ KERNEL_FQ void m03910_sxx (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m03910_a3-optimized.cl b/OpenCL/m03910_a3-optimized.cl
index aeacff0e7..7c2d11816 100644
--- a/OpenCL/m03910_a3-optimized.cl
+++ b/OpenCL/m03910_a3-optimized.cl
@@ -8,21 +8,22 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 DECLSPEC void m03910m (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 pw_len, KERN_ATTR_BASIC (), LOCAL_AS u32 *l_bin2asc)
@@ -772,7 +773,7 @@ KERNEL_FQ void m03910_m04 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -783,7 +784,7 @@ KERNEL_FQ void m03910_m04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -842,7 +843,7 @@ KERNEL_FQ void m03910_m08 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -853,7 +854,7 @@ KERNEL_FQ void m03910_m08 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -912,7 +913,7 @@ KERNEL_FQ void m03910_m16 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -923,7 +924,7 @@ KERNEL_FQ void m03910_m16 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -982,7 +983,7 @@ KERNEL_FQ void m03910_s04 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -993,7 +994,7 @@ KERNEL_FQ void m03910_s04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -1052,7 +1053,7 @@ KERNEL_FQ void m03910_s08 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -1063,7 +1064,7 @@ KERNEL_FQ void m03910_s08 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -1122,7 +1123,7 @@ KERNEL_FQ void m03910_s16 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -1133,7 +1134,7 @@ KERNEL_FQ void m03910_s16 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m03910_a3-pure.cl b/OpenCL/m03910_a3-pure.cl
index b4c269e4c..31b13ca35 100644
--- a/OpenCL/m03910_a3-pure.cl
+++ b/OpenCL/m03910_a3-pure.cl
@@ -8,21 +8,22 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m03910_mxx (KERN_ATTR_VECTOR ())
@@ -39,7 +40,7 @@ KERNEL_FQ void m03910_mxx (KERN_ATTR_VECTOR ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -50,7 +51,7 @@ KERNEL_FQ void m03910_mxx (KERN_ATTR_VECTOR ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -176,7 +177,7 @@ KERNEL_FQ void m03910_sxx (KERN_ATTR_VECTOR ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -187,7 +188,7 @@ KERNEL_FQ void m03910_sxx (KERN_ATTR_VECTOR ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m04010_a0-optimized.cl b/OpenCL/m04010_a0-optimized.cl
index efe2e8074..96909c9b6 100644
--- a/OpenCL/m04010_a0-optimized.cl
+++ b/OpenCL/m04010_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
@@ -16,15 +17,15 @@
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m04010_m04 (KERN_ATTR_RULES ())
@@ -41,7 +42,7 @@ KERNEL_FQ void m04010_m04 (KERN_ATTR_RULES ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -52,7 +53,7 @@ KERNEL_FQ void m04010_m04 (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -382,7 +383,7 @@ KERNEL_FQ void m04010_s04 (KERN_ATTR_RULES ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -393,7 +394,7 @@ KERNEL_FQ void m04010_s04 (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m04010_a0-pure.cl b/OpenCL/m04010_a0-pure.cl
index 0eab0a63b..4f81378c5 100644
--- a/OpenCL/m04010_a0-pure.cl
+++ b/OpenCL/m04010_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
@@ -16,15 +17,15 @@
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m04010_mxx (KERN_ATTR_RULES ())
@@ -41,7 +42,7 @@ KERNEL_FQ void m04010_mxx (KERN_ATTR_RULES ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -52,7 +53,7 @@ KERNEL_FQ void m04010_mxx (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -148,7 +149,7 @@ KERNEL_FQ void m04010_sxx (KERN_ATTR_RULES ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -159,7 +160,7 @@ KERNEL_FQ void m04010_sxx (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m04010_a1-optimized.cl b/OpenCL/m04010_a1-optimized.cl
index 2d9d5ac11..e7b0eae16 100644
--- a/OpenCL/m04010_a1-optimized.cl
+++ b/OpenCL/m04010_a1-optimized.cl
@@ -8,21 +8,22 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m04010_m04 (KERN_ATTR_BASIC ())
@@ -39,7 +40,7 @@ KERNEL_FQ void m04010_m04 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -50,7 +51,7 @@ KERNEL_FQ void m04010_m04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -438,7 +439,7 @@ KERNEL_FQ void m04010_s04 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -449,7 +450,7 @@ KERNEL_FQ void m04010_s04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m04010_a1-pure.cl b/OpenCL/m04010_a1-pure.cl
index 66d5c9f55..60eac950f 100644
--- a/OpenCL/m04010_a1-pure.cl
+++ b/OpenCL/m04010_a1-pure.cl
@@ -8,21 +8,22 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_md5.cl"
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m04010_mxx (KERN_ATTR_BASIC ())
@@ -39,7 +40,7 @@ KERNEL_FQ void m04010_mxx (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -50,7 +51,7 @@ KERNEL_FQ void m04010_mxx (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -144,7 +145,7 @@ KERNEL_FQ void m04010_sxx (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -155,7 +156,7 @@ KERNEL_FQ void m04010_sxx (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m04010_a3-optimized.cl b/OpenCL/m04010_a3-optimized.cl
index 34d678766..fab737d45 100644
--- a/OpenCL/m04010_a3-optimized.cl
+++ b/OpenCL/m04010_a3-optimized.cl
@@ -8,21 +8,22 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 DECLSPEC void m04010m (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 pw_len, KERN_ATTR_BASIC (), LOCAL_AS u32 *l_bin2asc)
@@ -672,7 +673,7 @@ KERNEL_FQ void m04010_m04 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -683,7 +684,7 @@ KERNEL_FQ void m04010_m04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -776,7 +777,7 @@ KERNEL_FQ void m04010_m08 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -787,7 +788,7 @@ KERNEL_FQ void m04010_m08 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -846,7 +847,7 @@ KERNEL_FQ void m04010_m16 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -857,7 +858,7 @@ KERNEL_FQ void m04010_m16 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -916,7 +917,7 @@ KERNEL_FQ void m04010_s04 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -927,7 +928,7 @@ KERNEL_FQ void m04010_s04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -986,7 +987,7 @@ KERNEL_FQ void m04010_s08 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -997,7 +998,7 @@ KERNEL_FQ void m04010_s08 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -1022,7 +1023,7 @@ KERNEL_FQ void m04010_s16 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -1033,7 +1034,7 @@ KERNEL_FQ void m04010_s16 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m04010_a3-pure.cl b/OpenCL/m04010_a3-pure.cl
index 2f6882a17..bc5fddc8d 100644
--- a/OpenCL/m04010_a3-pure.cl
+++ b/OpenCL/m04010_a3-pure.cl
@@ -8,21 +8,22 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m04010_mxx (KERN_ATTR_VECTOR ())
@@ -39,7 +40,7 @@ KERNEL_FQ void m04010_mxx (KERN_ATTR_VECTOR ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -50,7 +51,7 @@ KERNEL_FQ void m04010_mxx (KERN_ATTR_VECTOR ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -161,7 +162,7 @@ KERNEL_FQ void m04010_sxx (KERN_ATTR_VECTOR ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -172,7 +173,7 @@ KERNEL_FQ void m04010_sxx (KERN_ATTR_VECTOR ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m04110_a0-optimized.cl b/OpenCL/m04110_a0-optimized.cl
index 77d12bc03..9b32de16c 100644
--- a/OpenCL/m04110_a0-optimized.cl
+++ b/OpenCL/m04110_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
@@ -16,15 +17,15 @@
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m04110_m04 (KERN_ATTR_RULES ())
@@ -41,7 +42,7 @@ KERNEL_FQ void m04110_m04 (KERN_ATTR_RULES ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -52,7 +53,7 @@ KERNEL_FQ void m04110_m04 (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -427,7 +428,7 @@ KERNEL_FQ void m04110_s04 (KERN_ATTR_RULES ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -438,7 +439,7 @@ KERNEL_FQ void m04110_s04 (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m04110_a0-pure.cl b/OpenCL/m04110_a0-pure.cl
index 8347442e3..12719e888 100644
--- a/OpenCL/m04110_a0-pure.cl
+++ b/OpenCL/m04110_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
@@ -16,15 +17,15 @@
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m04110_mxx (KERN_ATTR_RULES ())
@@ -41,7 +42,7 @@ KERNEL_FQ void m04110_mxx (KERN_ATTR_RULES ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -52,7 +53,7 @@ KERNEL_FQ void m04110_mxx (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -161,7 +162,7 @@ KERNEL_FQ void m04110_sxx (KERN_ATTR_RULES ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -172,7 +173,7 @@ KERNEL_FQ void m04110_sxx (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m04110_a1-optimized.cl b/OpenCL/m04110_a1-optimized.cl
index 36bb2b2db..be22bd1ea 100644
--- a/OpenCL/m04110_a1-optimized.cl
+++ b/OpenCL/m04110_a1-optimized.cl
@@ -8,21 +8,22 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m04110_m04 (KERN_ATTR_BASIC ())
@@ -39,7 +40,7 @@ KERNEL_FQ void m04110_m04 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -50,7 +51,7 @@ KERNEL_FQ void m04110_m04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -485,7 +486,7 @@ KERNEL_FQ void m04110_s04 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -496,7 +497,7 @@ KERNEL_FQ void m04110_s04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m04110_a1-pure.cl b/OpenCL/m04110_a1-pure.cl
index 9d9273f2a..2a2322a71 100644
--- a/OpenCL/m04110_a1-pure.cl
+++ b/OpenCL/m04110_a1-pure.cl
@@ -8,21 +8,22 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_md5.cl"
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m04110_mxx (KERN_ATTR_BASIC ())
@@ -39,7 +40,7 @@ KERNEL_FQ void m04110_mxx (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -50,7 +51,7 @@ KERNEL_FQ void m04110_mxx (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -157,7 +158,7 @@ KERNEL_FQ void m04110_sxx (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -168,7 +169,7 @@ KERNEL_FQ void m04110_sxx (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m04110_a3-optimized.cl b/OpenCL/m04110_a3-optimized.cl
index c9dd1fcc5..1dcd04b03 100644
--- a/OpenCL/m04110_a3-optimized.cl
+++ b/OpenCL/m04110_a3-optimized.cl
@@ -8,21 +8,22 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 DECLSPEC void m04110m (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 pw_len, KERN_ATTR_BASIC (), LOCAL_AS u32 *l_bin2asc)
@@ -728,7 +729,7 @@ KERNEL_FQ void m04110_m04 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -739,7 +740,7 @@ KERNEL_FQ void m04110_m04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -832,7 +833,7 @@ KERNEL_FQ void m04110_m08 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -843,7 +844,7 @@ KERNEL_FQ void m04110_m08 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -902,7 +903,7 @@ KERNEL_FQ void m04110_m16 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -913,7 +914,7 @@ KERNEL_FQ void m04110_m16 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -972,7 +973,7 @@ KERNEL_FQ void m04110_s04 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -983,7 +984,7 @@ KERNEL_FQ void m04110_s04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -1042,7 +1043,7 @@ KERNEL_FQ void m04110_s08 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -1053,7 +1054,7 @@ KERNEL_FQ void m04110_s08 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -1078,7 +1079,7 @@ KERNEL_FQ void m04110_s16 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -1089,7 +1090,7 @@ KERNEL_FQ void m04110_s16 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m04110_a3-pure.cl b/OpenCL/m04110_a3-pure.cl
index 4c74ccbd2..490ecd667 100644
--- a/OpenCL/m04110_a3-pure.cl
+++ b/OpenCL/m04110_a3-pure.cl
@@ -8,21 +8,22 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m04110_mxx (KERN_ATTR_VECTOR ())
@@ -39,7 +40,7 @@ KERNEL_FQ void m04110_mxx (KERN_ATTR_VECTOR ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -50,7 +51,7 @@ KERNEL_FQ void m04110_mxx (KERN_ATTR_VECTOR ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -172,7 +173,7 @@ KERNEL_FQ void m04110_sxx (KERN_ATTR_VECTOR ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -183,7 +184,7 @@ KERNEL_FQ void m04110_sxx (KERN_ATTR_VECTOR ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m04310_a0-optimized.cl b/OpenCL/m04310_a0-optimized.cl
index dc693f19c..b8f7c10e7 100644
--- a/OpenCL/m04310_a0-optimized.cl
+++ b/OpenCL/m04310_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
@@ -16,15 +17,15 @@
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_upper8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_upper8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_upper8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_upper8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_upper8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m04310_m04 (KERN_ATTR_RULES ())
@@ -41,7 +42,7 @@ KERNEL_FQ void m04310_m04 (KERN_ATTR_RULES ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -52,7 +53,7 @@ KERNEL_FQ void m04310_m04 (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'A' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -325,7 +326,7 @@ KERNEL_FQ void m04310_s04 (KERN_ATTR_RULES ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -336,7 +337,7 @@ KERNEL_FQ void m04310_s04 (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'A' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m04310_a0-pure.cl b/OpenCL/m04310_a0-pure.cl
index 0a0230892..67496d227 100644
--- a/OpenCL/m04310_a0-pure.cl
+++ b/OpenCL/m04310_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
@@ -16,15 +17,15 @@
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_upper8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_upper8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_upper8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_upper8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_upper8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m04310_mxx (KERN_ATTR_RULES ())
@@ -41,7 +42,7 @@ KERNEL_FQ void m04310_mxx (KERN_ATTR_RULES ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -52,7 +53,7 @@ KERNEL_FQ void m04310_mxx (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'A' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -144,7 +145,7 @@ KERNEL_FQ void m04310_sxx (KERN_ATTR_RULES ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -155,7 +156,7 @@ KERNEL_FQ void m04310_sxx (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'A' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m04310_a1-optimized.cl b/OpenCL/m04310_a1-optimized.cl
index cc0b41619..a78a21694 100644
--- a/OpenCL/m04310_a1-optimized.cl
+++ b/OpenCL/m04310_a1-optimized.cl
@@ -8,21 +8,22 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_upper8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_upper8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_upper8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_upper8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_upper8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m04310_m04 (KERN_ATTR_BASIC ())
@@ -39,7 +40,7 @@ KERNEL_FQ void m04310_m04 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -50,7 +51,7 @@ KERNEL_FQ void m04310_m04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'A' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -382,7 +383,7 @@ KERNEL_FQ void m04310_s04 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -393,7 +394,7 @@ KERNEL_FQ void m04310_s04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'A' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m04310_a1-pure.cl b/OpenCL/m04310_a1-pure.cl
index 2ce1d1fa7..6b219b473 100644
--- a/OpenCL/m04310_a1-pure.cl
+++ b/OpenCL/m04310_a1-pure.cl
@@ -8,21 +8,22 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_md5.cl"
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_upper8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_upper8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_upper8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_upper8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_upper8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m04310_mxx (KERN_ATTR_BASIC ())
@@ -39,7 +40,7 @@ KERNEL_FQ void m04310_mxx (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -50,7 +51,7 @@ KERNEL_FQ void m04310_mxx (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'A' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -140,7 +141,7 @@ KERNEL_FQ void m04310_sxx (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -151,7 +152,7 @@ KERNEL_FQ void m04310_sxx (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'A' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m04310_a3-optimized.cl b/OpenCL/m04310_a3-optimized.cl
index 159bf0f4a..4a8572373 100644
--- a/OpenCL/m04310_a3-optimized.cl
+++ b/OpenCL/m04310_a3-optimized.cl
@@ -8,21 +8,22 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_upper8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_upper8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_upper8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_upper8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_upper8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 DECLSPEC void m04310m (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 pw_len, KERN_ATTR_BASIC (), LOCAL_AS u32 *l_bin2asc)
@@ -605,7 +606,7 @@ KERNEL_FQ void m04310_m04 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -616,7 +617,7 @@ KERNEL_FQ void m04310_m04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'A' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -675,7 +676,7 @@ KERNEL_FQ void m04310_m08 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -686,7 +687,7 @@ KERNEL_FQ void m04310_m08 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'A' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -745,7 +746,7 @@ KERNEL_FQ void m04310_m16 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -756,7 +757,7 @@ KERNEL_FQ void m04310_m16 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'A' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -815,7 +816,7 @@ KERNEL_FQ void m04310_s04 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -826,7 +827,7 @@ KERNEL_FQ void m04310_s04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'A' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -885,7 +886,7 @@ KERNEL_FQ void m04310_s08 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -896,7 +897,7 @@ KERNEL_FQ void m04310_s08 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'A' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -955,7 +956,7 @@ KERNEL_FQ void m04310_s16 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -966,7 +967,7 @@ KERNEL_FQ void m04310_s16 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'A' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m04310_a3-pure.cl b/OpenCL/m04310_a3-pure.cl
index f291aecb6..cd83bb46a 100644
--- a/OpenCL/m04310_a3-pure.cl
+++ b/OpenCL/m04310_a3-pure.cl
@@ -8,21 +8,22 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_upper8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_upper8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_upper8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_upper8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_upper8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m04310_mxx (KERN_ATTR_VECTOR ())
@@ -39,7 +40,7 @@ KERNEL_FQ void m04310_mxx (KERN_ATTR_VECTOR ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -50,7 +51,7 @@ KERNEL_FQ void m04310_mxx (KERN_ATTR_VECTOR ())
                  | ((i1 < 10) ? '0' + i1 : 'A' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -153,7 +154,7 @@ KERNEL_FQ void m04310_sxx (KERN_ATTR_VECTOR ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -164,7 +165,7 @@ KERNEL_FQ void m04310_sxx (KERN_ATTR_VECTOR ())
                  | ((i1 < 10) ? '0' + i1 : 'A' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m04400_a0-optimized.cl b/OpenCL/m04400_a0-optimized.cl
index ffad3ae09..b192a5859 100644
--- a/OpenCL/m04400_a0-optimized.cl
+++ b/OpenCL/m04400_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
@@ -17,15 +18,15 @@
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m04400_m04 (KERN_ATTR_RULES ())
@@ -42,7 +43,7 @@ KERNEL_FQ void m04400_m04 (KERN_ATTR_RULES ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -53,7 +54,7 @@ KERNEL_FQ void m04400_m04 (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -352,7 +353,7 @@ KERNEL_FQ void m04400_s04 (KERN_ATTR_RULES ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -363,7 +364,7 @@ KERNEL_FQ void m04400_s04 (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m04400_a0-pure.cl b/OpenCL/m04400_a0-pure.cl
index 72718568e..35ca3e386 100644
--- a/OpenCL/m04400_a0-pure.cl
+++ b/OpenCL/m04400_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
@@ -17,15 +18,15 @@
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m04400_mxx (KERN_ATTR_RULES ())
@@ -42,7 +43,7 @@ KERNEL_FQ void m04400_mxx (KERN_ATTR_RULES ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -53,7 +54,7 @@ KERNEL_FQ void m04400_mxx (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -139,7 +140,7 @@ KERNEL_FQ void m04400_sxx (KERN_ATTR_RULES ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -150,7 +151,7 @@ KERNEL_FQ void m04400_sxx (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m04400_a1-optimized.cl b/OpenCL/m04400_a1-optimized.cl
index 795178874..cbf0cfdf5 100644
--- a/OpenCL/m04400_a1-optimized.cl
+++ b/OpenCL/m04400_a1-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
@@ -15,15 +16,15 @@
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m04400_m04 (KERN_ATTR_BASIC ())
@@ -40,7 +41,7 @@ KERNEL_FQ void m04400_m04 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -51,7 +52,7 @@ KERNEL_FQ void m04400_m04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -408,7 +409,7 @@ KERNEL_FQ void m04400_s04 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -419,7 +420,7 @@ KERNEL_FQ void m04400_s04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m04400_a1-pure.cl b/OpenCL/m04400_a1-pure.cl
index 9404b95fc..7247f7b14 100644
--- a/OpenCL/m04400_a1-pure.cl
+++ b/OpenCL/m04400_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_md5.cl"
@@ -15,15 +16,15 @@
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m04400_mxx (KERN_ATTR_BASIC ())
@@ -40,7 +41,7 @@ KERNEL_FQ void m04400_mxx (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -51,7 +52,7 @@ KERNEL_FQ void m04400_mxx (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -135,7 +136,7 @@ KERNEL_FQ void m04400_sxx (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -146,7 +147,7 @@ KERNEL_FQ void m04400_sxx (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m04400_a3-optimized.cl b/OpenCL/m04400_a3-optimized.cl
index 33224fc61..72aea5c55 100644
--- a/OpenCL/m04400_a3-optimized.cl
+++ b/OpenCL/m04400_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
@@ -15,15 +16,15 @@
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 DECLSPEC void m04400m (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 pw_len, KERN_ATTR_BASIC (), LOCAL_AS u32 *l_bin2asc)
@@ -577,7 +578,7 @@ KERNEL_FQ void m04400_m04 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -588,7 +589,7 @@ KERNEL_FQ void m04400_m04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -647,7 +648,7 @@ KERNEL_FQ void m04400_m08 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -658,7 +659,7 @@ KERNEL_FQ void m04400_m08 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -717,7 +718,7 @@ KERNEL_FQ void m04400_m16 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -728,7 +729,7 @@ KERNEL_FQ void m04400_m16 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -787,7 +788,7 @@ KERNEL_FQ void m04400_s04 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -798,7 +799,7 @@ KERNEL_FQ void m04400_s04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -857,7 +858,7 @@ KERNEL_FQ void m04400_s08 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -868,7 +869,7 @@ KERNEL_FQ void m04400_s08 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -927,7 +928,7 @@ KERNEL_FQ void m04400_s16 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -938,7 +939,7 @@ KERNEL_FQ void m04400_s16 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m04400_a3-pure.cl b/OpenCL/m04400_a3-pure.cl
index 213a956ef..e3eb903f6 100644
--- a/OpenCL/m04400_a3-pure.cl
+++ b/OpenCL/m04400_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
@@ -15,15 +16,15 @@
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m04400_mxx (KERN_ATTR_VECTOR ())
@@ -40,7 +41,7 @@ KERNEL_FQ void m04400_mxx (KERN_ATTR_VECTOR ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -51,7 +52,7 @@ KERNEL_FQ void m04400_mxx (KERN_ATTR_VECTOR ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -148,7 +149,7 @@ KERNEL_FQ void m04400_sxx (KERN_ATTR_VECTOR ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -159,7 +160,7 @@ KERNEL_FQ void m04400_sxx (KERN_ATTR_VECTOR ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m04500_a0-optimized.cl b/OpenCL/m04500_a0-optimized.cl
index 573e50223..ba71bbcb8 100644
--- a/OpenCL/m04500_a0-optimized.cl
+++ b/OpenCL/m04500_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
@@ -16,15 +17,15 @@
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m04500_m04 (KERN_ATTR_RULES ())
@@ -41,7 +42,7 @@ KERNEL_FQ void m04500_m04 (KERN_ATTR_RULES ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -52,7 +53,7 @@ KERNEL_FQ void m04500_m04 (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -378,7 +379,7 @@ KERNEL_FQ void m04500_s04 (KERN_ATTR_RULES ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -389,7 +390,7 @@ KERNEL_FQ void m04500_s04 (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m04500_a0-pure.cl b/OpenCL/m04500_a0-pure.cl
index 1dfa8b061..b3f207eaa 100644
--- a/OpenCL/m04500_a0-pure.cl
+++ b/OpenCL/m04500_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
@@ -16,15 +17,15 @@
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m04500_mxx (KERN_ATTR_RULES ())
@@ -41,7 +42,7 @@ KERNEL_FQ void m04500_mxx (KERN_ATTR_RULES ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -52,7 +53,7 @@ KERNEL_FQ void m04500_mxx (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -138,7 +139,7 @@ KERNEL_FQ void m04500_sxx (KERN_ATTR_RULES ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -149,7 +150,7 @@ KERNEL_FQ void m04500_sxx (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m04500_a1-optimized.cl b/OpenCL/m04500_a1-optimized.cl
index 05d593d27..6e58bb67a 100644
--- a/OpenCL/m04500_a1-optimized.cl
+++ b/OpenCL/m04500_a1-optimized.cl
@@ -8,21 +8,22 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m04500_m04 (KERN_ATTR_BASIC ())
@@ -39,7 +40,7 @@ KERNEL_FQ void m04500_m04 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -50,7 +51,7 @@ KERNEL_FQ void m04500_m04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -434,7 +435,7 @@ KERNEL_FQ void m04500_s04 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -445,7 +446,7 @@ KERNEL_FQ void m04500_s04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m04500_a1-pure.cl b/OpenCL/m04500_a1-pure.cl
index 32db391dd..32b79fb03 100644
--- a/OpenCL/m04500_a1-pure.cl
+++ b/OpenCL/m04500_a1-pure.cl
@@ -8,21 +8,22 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_sha1.cl"
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m04500_mxx (KERN_ATTR_BASIC ())
@@ -39,7 +40,7 @@ KERNEL_FQ void m04500_mxx (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -50,7 +51,7 @@ KERNEL_FQ void m04500_mxx (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -134,7 +135,7 @@ KERNEL_FQ void m04500_sxx (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -145,7 +146,7 @@ KERNEL_FQ void m04500_sxx (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m04500_a3-optimized.cl b/OpenCL/m04500_a3-optimized.cl
index 1be7a4321..0c79696db 100644
--- a/OpenCL/m04500_a3-optimized.cl
+++ b/OpenCL/m04500_a3-optimized.cl
@@ -8,21 +8,22 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 DECLSPEC void m04500m (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 pw_len, KERN_ATTR_BASIC (), LOCAL_AS u32 *l_bin2asc)
@@ -636,7 +637,7 @@ KERNEL_FQ void m04500_m04 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -647,7 +648,7 @@ KERNEL_FQ void m04500_m04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -706,7 +707,7 @@ KERNEL_FQ void m04500_m08 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -717,7 +718,7 @@ KERNEL_FQ void m04500_m08 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -776,7 +777,7 @@ KERNEL_FQ void m04500_m16 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -787,7 +788,7 @@ KERNEL_FQ void m04500_m16 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -846,7 +847,7 @@ KERNEL_FQ void m04500_s04 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -857,7 +858,7 @@ KERNEL_FQ void m04500_s04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -916,7 +917,7 @@ KERNEL_FQ void m04500_s08 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -927,7 +928,7 @@ KERNEL_FQ void m04500_s08 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -986,7 +987,7 @@ KERNEL_FQ void m04500_s16 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -997,7 +998,7 @@ KERNEL_FQ void m04500_s16 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m04500_a3-pure.cl b/OpenCL/m04500_a3-pure.cl
index 086e72121..735f512c7 100644
--- a/OpenCL/m04500_a3-pure.cl
+++ b/OpenCL/m04500_a3-pure.cl
@@ -8,21 +8,22 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m04500_mxx (KERN_ATTR_VECTOR ())
@@ -39,7 +40,7 @@ KERNEL_FQ void m04500_mxx (KERN_ATTR_VECTOR ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -50,7 +51,7 @@ KERNEL_FQ void m04500_mxx (KERN_ATTR_VECTOR ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -147,7 +148,7 @@ KERNEL_FQ void m04500_sxx (KERN_ATTR_VECTOR ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -158,7 +159,7 @@ KERNEL_FQ void m04500_sxx (KERN_ATTR_VECTOR ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m04520_a0-optimized.cl b/OpenCL/m04520_a0-optimized.cl
index a34bca23c..ffea45f02 100644
--- a/OpenCL/m04520_a0-optimized.cl
+++ b/OpenCL/m04520_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
@@ -16,15 +17,15 @@
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m04520_m04 (KERN_ATTR_RULES ())
@@ -41,7 +42,7 @@ KERNEL_FQ void m04520_m04 (KERN_ATTR_RULES ())
    * shared
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -52,7 +53,7 @@ KERNEL_FQ void m04520_m04 (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -609,7 +610,7 @@ KERNEL_FQ void m04520_s04 (KERN_ATTR_RULES ())
    * shared
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -620,7 +621,7 @@ KERNEL_FQ void m04520_s04 (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m04520_a0-pure.cl b/OpenCL/m04520_a0-pure.cl
index b53826e38..1c3aa88ea 100644
--- a/OpenCL/m04520_a0-pure.cl
+++ b/OpenCL/m04520_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
@@ -16,15 +17,15 @@
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m04520_mxx (KERN_ATTR_RULES ())
@@ -41,7 +42,7 @@ KERNEL_FQ void m04520_mxx (KERN_ATTR_RULES ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -52,7 +53,7 @@ KERNEL_FQ void m04520_mxx (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -153,7 +154,7 @@ KERNEL_FQ void m04520_sxx (KERN_ATTR_RULES ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -164,7 +165,7 @@ KERNEL_FQ void m04520_sxx (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m04520_a1-optimized.cl b/OpenCL/m04520_a1-optimized.cl
index 955343e5c..5758a6bd8 100644
--- a/OpenCL/m04520_a1-optimized.cl
+++ b/OpenCL/m04520_a1-optimized.cl
@@ -8,21 +8,22 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m04520_m04 (KERN_ATTR_BASIC ())
@@ -39,7 +40,7 @@ KERNEL_FQ void m04520_m04 (KERN_ATTR_BASIC ())
    * shared
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -50,7 +51,7 @@ KERNEL_FQ void m04520_m04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -665,7 +666,7 @@ KERNEL_FQ void m04520_s04 (KERN_ATTR_BASIC ())
    * shared
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -676,7 +677,7 @@ KERNEL_FQ void m04520_s04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m04520_a1-pure.cl b/OpenCL/m04520_a1-pure.cl
index 54179adab..1ef295b06 100644
--- a/OpenCL/m04520_a1-pure.cl
+++ b/OpenCL/m04520_a1-pure.cl
@@ -8,21 +8,22 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_sha1.cl"
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m04520_mxx (KERN_ATTR_BASIC ())
@@ -39,7 +40,7 @@ KERNEL_FQ void m04520_mxx (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -50,7 +51,7 @@ KERNEL_FQ void m04520_mxx (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -149,7 +150,7 @@ KERNEL_FQ void m04520_sxx (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -160,7 +161,7 @@ KERNEL_FQ void m04520_sxx (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m04520_a3-optimized.cl b/OpenCL/m04520_a3-optimized.cl
index c3d1a96d3..0a40509bb 100644
--- a/OpenCL/m04520_a3-optimized.cl
+++ b/OpenCL/m04520_a3-optimized.cl
@@ -8,21 +8,22 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 DECLSPEC void m04520m (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 pw_len, KERN_ATTR_BASIC (), LOCAL_AS u32 *l_bin2asc)
@@ -1089,7 +1090,7 @@ KERNEL_FQ void m04520_m04 (KERN_ATTR_BASIC ())
    * shared
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -1100,7 +1101,7 @@ KERNEL_FQ void m04520_m04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -1159,7 +1160,7 @@ KERNEL_FQ void m04520_m08 (KERN_ATTR_BASIC ())
    * shared
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -1170,7 +1171,7 @@ KERNEL_FQ void m04520_m08 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -1229,7 +1230,7 @@ KERNEL_FQ void m04520_m16 (KERN_ATTR_BASIC ())
    * shared
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -1240,7 +1241,7 @@ KERNEL_FQ void m04520_m16 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -1299,7 +1300,7 @@ KERNEL_FQ void m04520_s04 (KERN_ATTR_BASIC ())
    * shared
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -1310,7 +1311,7 @@ KERNEL_FQ void m04520_s04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -1369,7 +1370,7 @@ KERNEL_FQ void m04520_s08 (KERN_ATTR_BASIC ())
    * shared
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -1380,7 +1381,7 @@ KERNEL_FQ void m04520_s08 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -1439,7 +1440,7 @@ KERNEL_FQ void m04520_s16 (KERN_ATTR_BASIC ())
    * shared
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -1450,7 +1451,7 @@ KERNEL_FQ void m04520_s16 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m04520_a3-pure.cl b/OpenCL/m04520_a3-pure.cl
index c1e49123c..8f4564d77 100644
--- a/OpenCL/m04520_a3-pure.cl
+++ b/OpenCL/m04520_a3-pure.cl
@@ -8,21 +8,22 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m04520_mxx (KERN_ATTR_VECTOR ())
@@ -39,7 +40,7 @@ KERNEL_FQ void m04520_mxx (KERN_ATTR_VECTOR ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -50,7 +51,7 @@ KERNEL_FQ void m04520_mxx (KERN_ATTR_VECTOR ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -164,7 +165,7 @@ KERNEL_FQ void m04520_sxx (KERN_ATTR_VECTOR ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -175,7 +176,7 @@ KERNEL_FQ void m04520_sxx (KERN_ATTR_VECTOR ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m04700_a0-optimized.cl b/OpenCL/m04700_a0-optimized.cl
index 13a2b98c8..e6d1a2468 100644
--- a/OpenCL/m04700_a0-optimized.cl
+++ b/OpenCL/m04700_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
@@ -17,15 +18,15 @@
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m04700_m04 (KERN_ATTR_RULES ())
@@ -42,7 +43,7 @@ KERNEL_FQ void m04700_m04 (KERN_ATTR_RULES ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -53,7 +54,7 @@ KERNEL_FQ void m04700_m04 (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -335,7 +336,7 @@ KERNEL_FQ void m04700_s04 (KERN_ATTR_RULES ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -346,7 +347,7 @@ KERNEL_FQ void m04700_s04 (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m04700_a0-pure.cl b/OpenCL/m04700_a0-pure.cl
index 056a4bab3..9c586f214 100644
--- a/OpenCL/m04700_a0-pure.cl
+++ b/OpenCL/m04700_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
@@ -17,15 +18,15 @@
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m04700_mxx (KERN_ATTR_RULES ())
@@ -42,7 +43,7 @@ KERNEL_FQ void m04700_mxx (KERN_ATTR_RULES ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -53,7 +54,7 @@ KERNEL_FQ void m04700_mxx (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -134,7 +135,7 @@ KERNEL_FQ void m04700_sxx (KERN_ATTR_RULES ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -145,7 +146,7 @@ KERNEL_FQ void m04700_sxx (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m04700_a1-optimized.cl b/OpenCL/m04700_a1-optimized.cl
index 5f590df8c..f3989d858 100644
--- a/OpenCL/m04700_a1-optimized.cl
+++ b/OpenCL/m04700_a1-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
@@ -15,15 +16,15 @@
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m04700_m04 (KERN_ATTR_BASIC ())
@@ -40,7 +41,7 @@ KERNEL_FQ void m04700_m04 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -51,7 +52,7 @@ KERNEL_FQ void m04700_m04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -388,7 +389,7 @@ KERNEL_FQ void m04700_s04 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -399,7 +400,7 @@ KERNEL_FQ void m04700_s04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m04700_a1-pure.cl b/OpenCL/m04700_a1-pure.cl
index 357bf7a78..93ae560bf 100644
--- a/OpenCL/m04700_a1-pure.cl
+++ b/OpenCL/m04700_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_md5.cl"
@@ -15,15 +16,15 @@
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m04700_mxx (KERN_ATTR_BASIC ())
@@ -40,7 +41,7 @@ KERNEL_FQ void m04700_mxx (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -51,7 +52,7 @@ KERNEL_FQ void m04700_mxx (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -130,7 +131,7 @@ KERNEL_FQ void m04700_sxx (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -141,7 +142,7 @@ KERNEL_FQ void m04700_sxx (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m04700_a3-optimized.cl b/OpenCL/m04700_a3-optimized.cl
index d1bf72589..4b22f8887 100644
--- a/OpenCL/m04700_a3-optimized.cl
+++ b/OpenCL/m04700_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
@@ -15,15 +16,15 @@
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 DECLSPEC void m04700m (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 pw_len, KERN_ATTR_BASIC (), LOCAL_AS u32 *l_bin2asc)
@@ -577,7 +578,7 @@ KERNEL_FQ void m04700_m04 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -588,7 +589,7 @@ KERNEL_FQ void m04700_m04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -647,7 +648,7 @@ KERNEL_FQ void m04700_m08 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -658,7 +659,7 @@ KERNEL_FQ void m04700_m08 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -717,7 +718,7 @@ KERNEL_FQ void m04700_m16 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -728,7 +729,7 @@ KERNEL_FQ void m04700_m16 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -787,7 +788,7 @@ KERNEL_FQ void m04700_s04 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -798,7 +799,7 @@ KERNEL_FQ void m04700_s04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -857,7 +858,7 @@ KERNEL_FQ void m04700_s08 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -868,7 +869,7 @@ KERNEL_FQ void m04700_s08 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -927,7 +928,7 @@ KERNEL_FQ void m04700_s16 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -938,7 +939,7 @@ KERNEL_FQ void m04700_s16 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m04700_a3-pure.cl b/OpenCL/m04700_a3-pure.cl
index 29ca21e72..9e6a668ef 100644
--- a/OpenCL/m04700_a3-pure.cl
+++ b/OpenCL/m04700_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
@@ -15,15 +16,15 @@
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m04700_mxx (KERN_ATTR_VECTOR ())
@@ -40,7 +41,7 @@ KERNEL_FQ void m04700_mxx (KERN_ATTR_VECTOR ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -51,7 +52,7 @@ KERNEL_FQ void m04700_mxx (KERN_ATTR_VECTOR ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -143,7 +144,7 @@ KERNEL_FQ void m04700_sxx (KERN_ATTR_VECTOR ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -154,7 +155,7 @@ KERNEL_FQ void m04700_sxx (KERN_ATTR_VECTOR ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m04800_a0-optimized.cl b/OpenCL/m04800_a0-optimized.cl
index 03418d5d4..c58d89a00 100644
--- a/OpenCL/m04800_a0-optimized.cl
+++ b/OpenCL/m04800_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
diff --git a/OpenCL/m04800_a0-pure.cl b/OpenCL/m04800_a0-pure.cl
index f3c7646f0..54c0bf973 100644
--- a/OpenCL/m04800_a0-pure.cl
+++ b/OpenCL/m04800_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
diff --git a/OpenCL/m04800_a1-optimized.cl b/OpenCL/m04800_a1-optimized.cl
index b31486ccc..4e2e0fe07 100644
--- a/OpenCL/m04800_a1-optimized.cl
+++ b/OpenCL/m04800_a1-optimized.cl
@@ -8,21 +8,22 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m04800_m04 (KERN_ATTR_BASIC ())
diff --git a/OpenCL/m04800_a1-pure.cl b/OpenCL/m04800_a1-pure.cl
index 75f493f7a..2a0de51c9 100644
--- a/OpenCL/m04800_a1-pure.cl
+++ b/OpenCL/m04800_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_md5.cl"
diff --git a/OpenCL/m04800_a3-optimized.cl b/OpenCL/m04800_a3-optimized.cl
index 5da3ee496..98343c1dd 100644
--- a/OpenCL/m04800_a3-optimized.cl
+++ b/OpenCL/m04800_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
diff --git a/OpenCL/m04800_a3-pure.cl b/OpenCL/m04800_a3-pure.cl
index 43284c7af..a7a3fcf7b 100644
--- a/OpenCL/m04800_a3-pure.cl
+++ b/OpenCL/m04800_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
diff --git a/OpenCL/m04900_a0-optimized.cl b/OpenCL/m04900_a0-optimized.cl
index 601b5b1d9..d1e686ac5 100644
--- a/OpenCL/m04900_a0-optimized.cl
+++ b/OpenCL/m04900_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
diff --git a/OpenCL/m04900_a0-pure.cl b/OpenCL/m04900_a0-pure.cl
index ef31db611..cab847cbf 100644
--- a/OpenCL/m04900_a0-pure.cl
+++ b/OpenCL/m04900_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
diff --git a/OpenCL/m04900_a1-optimized.cl b/OpenCL/m04900_a1-optimized.cl
index e5be3e814..d51a5f407 100644
--- a/OpenCL/m04900_a1-optimized.cl
+++ b/OpenCL/m04900_a1-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m04900_a1-pure.cl b/OpenCL/m04900_a1-pure.cl
index acd27d749..357ba67bc 100644
--- a/OpenCL/m04900_a1-pure.cl
+++ b/OpenCL/m04900_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m04900_a3-optimized.cl b/OpenCL/m04900_a3-optimized.cl
index e3aa883cc..199e11ca4 100644
--- a/OpenCL/m04900_a3-optimized.cl
+++ b/OpenCL/m04900_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m04900_a3-pure.cl b/OpenCL/m04900_a3-pure.cl
index 6d6b02b9f..c9d731d86 100644
--- a/OpenCL/m04900_a3-pure.cl
+++ b/OpenCL/m04900_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m05100_a0-optimized.cl b/OpenCL/m05100_a0-optimized.cl
index 90441fc0a..4790fef9b 100644
--- a/OpenCL/m05100_a0-optimized.cl
+++ b/OpenCL/m05100_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
diff --git a/OpenCL/m05100_a0-pure.cl b/OpenCL/m05100_a0-pure.cl
index f71d763b9..41f3ce539 100644
--- a/OpenCL/m05100_a0-pure.cl
+++ b/OpenCL/m05100_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
diff --git a/OpenCL/m05100_a1-optimized.cl b/OpenCL/m05100_a1-optimized.cl
index efa881b95..d3f20fdd0 100644
--- a/OpenCL/m05100_a1-optimized.cl
+++ b/OpenCL/m05100_a1-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
diff --git a/OpenCL/m05100_a1-pure.cl b/OpenCL/m05100_a1-pure.cl
index 3e8e8c153..861e93519 100644
--- a/OpenCL/m05100_a1-pure.cl
+++ b/OpenCL/m05100_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_md5.cl"
diff --git a/OpenCL/m05100_a3-optimized.cl b/OpenCL/m05100_a3-optimized.cl
index f4be1c794..2c897959b 100644
--- a/OpenCL/m05100_a3-optimized.cl
+++ b/OpenCL/m05100_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
diff --git a/OpenCL/m05100_a3-pure.cl b/OpenCL/m05100_a3-pure.cl
index 6dcea48d1..02393f913 100644
--- a/OpenCL/m05100_a3-pure.cl
+++ b/OpenCL/m05100_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
diff --git a/OpenCL/m05200-pure.cl b/OpenCL/m05200-pure.cl
index 8edf41a04..194b0fa02 100644
--- a/OpenCL/m05200-pure.cl
+++ b/OpenCL/m05200-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha256.cl"
diff --git a/OpenCL/m05300_a0-optimized.cl b/OpenCL/m05300_a0-optimized.cl
index 11a175095..3897688e7 100644
--- a/OpenCL/m05300_a0-optimized.cl
+++ b/OpenCL/m05300_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
@@ -124,21 +125,21 @@ KERNEL_FQ void m05300_m04 (KERN_ATTR_RULES_ESALT (ikepsk_t))
    * s_msg
    */
 
-  LOCAL_AS u32 s_nr_buf[16];
+  LOCAL_VK u32 s_nr_buf[16];
 
   for (u32 i = lid; i < 16; i += lsz)
   {
     s_nr_buf[i] = esalt_bufs[digests_offset].nr_buf[i];
   }
 
-  LOCAL_AS u32 s_msg_buf[128];
+  LOCAL_VK u32 s_msg_buf[128];
 
   for (u32 i = lid; i < 128; i += lsz)
   {
     s_msg_buf[i] = esalt_bufs[digests_offset].msg_buf[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -299,21 +300,21 @@ KERNEL_FQ void m05300_s04 (KERN_ATTR_RULES_ESALT (ikepsk_t))
    * s_msg
    */
 
-  LOCAL_AS u32 s_nr_buf[16];
+  LOCAL_VK u32 s_nr_buf[16];
 
   for (u32 i = lid; i < 16; i += lsz)
   {
     s_nr_buf[i] = esalt_bufs[digests_offset].nr_buf[i];
   }
 
-  LOCAL_AS u32 s_msg_buf[128];
+  LOCAL_VK u32 s_msg_buf[128];
 
   for (u32 i = lid; i < 128; i += lsz)
   {
     s_msg_buf[i] = esalt_bufs[digests_offset].msg_buf[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m05300_a0-pure.cl b/OpenCL/m05300_a0-pure.cl
index 8337daff1..53d48ecaa 100644
--- a/OpenCL/m05300_a0-pure.cl
+++ b/OpenCL/m05300_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
diff --git a/OpenCL/m05300_a1-optimized.cl b/OpenCL/m05300_a1-optimized.cl
index 290f60a35..5f75de929 100644
--- a/OpenCL/m05300_a1-optimized.cl
+++ b/OpenCL/m05300_a1-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
@@ -122,21 +123,21 @@ KERNEL_FQ void m05300_m04 (KERN_ATTR_ESALT (ikepsk_t))
    * s_msg
    */
 
-  LOCAL_AS u32 s_nr_buf[16];
+  LOCAL_VK u32 s_nr_buf[16];
 
   for (u32 i = lid; i < 16; i += lsz)
   {
     s_nr_buf[i] = esalt_bufs[digests_offset].nr_buf[i];
   }
 
-  LOCAL_AS u32 s_msg_buf[128];
+  LOCAL_VK u32 s_msg_buf[128];
 
   for (u32 i = lid; i < 128; i += lsz)
   {
     s_msg_buf[i] = esalt_bufs[digests_offset].msg_buf[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -357,21 +358,21 @@ KERNEL_FQ void m05300_s04 (KERN_ATTR_ESALT (ikepsk_t))
    * s_msg
    */
 
-  LOCAL_AS u32 s_nr_buf[16];
+  LOCAL_VK u32 s_nr_buf[16];
 
   for (u32 i = lid; i < 16; i += lsz)
   {
     s_nr_buf[i] = esalt_bufs[digests_offset].nr_buf[i];
   }
 
-  LOCAL_AS u32 s_msg_buf[128];
+  LOCAL_VK u32 s_msg_buf[128];
 
   for (u32 i = lid; i < 128; i += lsz)
   {
     s_msg_buf[i] = esalt_bufs[digests_offset].msg_buf[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m05300_a1-pure.cl b/OpenCL/m05300_a1-pure.cl
index 8aa4b80d3..39b917d91 100644
--- a/OpenCL/m05300_a1-pure.cl
+++ b/OpenCL/m05300_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_md5.cl"
diff --git a/OpenCL/m05300_a3-optimized.cl b/OpenCL/m05300_a3-optimized.cl
index c228edc09..5735c7e80 100644
--- a/OpenCL/m05300_a3-optimized.cl
+++ b/OpenCL/m05300_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
@@ -428,21 +429,21 @@ KERNEL_FQ void m05300_m04 (KERN_ATTR_ESALT (ikepsk_t))
    * s_msg
    */
 
-  LOCAL_AS u32 s_nr_buf[16];
+  LOCAL_VK u32 s_nr_buf[16];
 
   for (u32 i = lid; i < 16; i += lsz)
   {
     s_nr_buf[i] = esalt_bufs[digests_offset].nr_buf[i];
   }
 
-  LOCAL_AS u32 s_msg_buf[128];
+  LOCAL_VK u32 s_msg_buf[128];
 
   for (u32 i = lid; i < 128; i += lsz)
   {
     s_msg_buf[i] = esalt_bufs[digests_offset].msg_buf[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -501,21 +502,21 @@ KERNEL_FQ void m05300_m08 (KERN_ATTR_ESALT (ikepsk_t))
    * s_msg
    */
 
-  LOCAL_AS u32 s_nr_buf[16];
+  LOCAL_VK u32 s_nr_buf[16];
 
   for (u32 i = lid; i < 16; i += lsz)
   {
     s_nr_buf[i] = esalt_bufs[digests_offset].nr_buf[i];
   }
 
-  LOCAL_AS u32 s_msg_buf[128];
+  LOCAL_VK u32 s_msg_buf[128];
 
   for (u32 i = lid; i < 128; i += lsz)
   {
     s_msg_buf[i] = esalt_bufs[digests_offset].msg_buf[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -574,21 +575,21 @@ KERNEL_FQ void m05300_m16 (KERN_ATTR_ESALT (ikepsk_t))
    * s_msg
    */
 
-  LOCAL_AS u32 s_nr_buf[16];
+  LOCAL_VK u32 s_nr_buf[16];
 
   for (u32 i = lid; i < 16; i += lsz)
   {
     s_nr_buf[i] = esalt_bufs[digests_offset].nr_buf[i];
   }
 
-  LOCAL_AS u32 s_msg_buf[128];
+  LOCAL_VK u32 s_msg_buf[128];
 
   for (u32 i = lid; i < 128; i += lsz)
   {
     s_msg_buf[i] = esalt_bufs[digests_offset].msg_buf[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -647,21 +648,21 @@ KERNEL_FQ void m05300_s04 (KERN_ATTR_ESALT (ikepsk_t))
    * s_msg
    */
 
-  LOCAL_AS u32 s_nr_buf[16];
+  LOCAL_VK u32 s_nr_buf[16];
 
   for (u32 i = lid; i < 16; i += lsz)
   {
     s_nr_buf[i] = esalt_bufs[digests_offset].nr_buf[i];
   }
 
-  LOCAL_AS u32 s_msg_buf[128];
+  LOCAL_VK u32 s_msg_buf[128];
 
   for (u32 i = lid; i < 128; i += lsz)
   {
     s_msg_buf[i] = esalt_bufs[digests_offset].msg_buf[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -720,21 +721,21 @@ KERNEL_FQ void m05300_s08 (KERN_ATTR_ESALT (ikepsk_t))
    * s_msg
    */
 
-  LOCAL_AS u32 s_nr_buf[16];
+  LOCAL_VK u32 s_nr_buf[16];
 
   for (u32 i = lid; i < 16; i += lsz)
   {
     s_nr_buf[i] = esalt_bufs[digests_offset].nr_buf[i];
   }
 
-  LOCAL_AS u32 s_msg_buf[128];
+  LOCAL_VK u32 s_msg_buf[128];
 
   for (u32 i = lid; i < 128; i += lsz)
   {
     s_msg_buf[i] = esalt_bufs[digests_offset].msg_buf[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -793,21 +794,21 @@ KERNEL_FQ void m05300_s16 (KERN_ATTR_ESALT (ikepsk_t))
    * s_msg
    */
 
-  LOCAL_AS u32 s_nr_buf[16];
+  LOCAL_VK u32 s_nr_buf[16];
 
   for (u32 i = lid; i < 16; i += lsz)
   {
     s_nr_buf[i] = esalt_bufs[digests_offset].nr_buf[i];
   }
 
-  LOCAL_AS u32 s_msg_buf[128];
+  LOCAL_VK u32 s_msg_buf[128];
 
   for (u32 i = lid; i < 128; i += lsz)
   {
     s_msg_buf[i] = esalt_bufs[digests_offset].msg_buf[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m05300_a3-pure.cl b/OpenCL/m05300_a3-pure.cl
index c89b6ab10..76d6c0bc9 100644
--- a/OpenCL/m05300_a3-pure.cl
+++ b/OpenCL/m05300_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_md5.cl"
diff --git a/OpenCL/m05400_a0-optimized.cl b/OpenCL/m05400_a0-optimized.cl
index 06cfe4d95..fe656c1cb 100644
--- a/OpenCL/m05400_a0-optimized.cl
+++ b/OpenCL/m05400_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
@@ -128,21 +129,21 @@ KERNEL_FQ void m05400_m04 (KERN_ATTR_RULES_ESALT (ikepsk_t))
    * s_msg
    */
 
-  LOCAL_AS u32 s_nr_buf[16];
+  LOCAL_VK u32 s_nr_buf[16];
 
   for (u32 i = lid; i < 16; i += lsz)
   {
     s_nr_buf[i] = hc_swap32_S (esalt_bufs[digests_offset].nr_buf[i]);
   }
 
-  LOCAL_AS u32 s_msg_buf[128];
+  LOCAL_VK u32 s_msg_buf[128];
 
   for (u32 i = lid; i < 128; i += lsz)
   {
     s_msg_buf[i] = hc_swap32_S (esalt_bufs[digests_offset].msg_buf[i]);
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -312,21 +313,21 @@ KERNEL_FQ void m05400_s04 (KERN_ATTR_RULES_ESALT (ikepsk_t))
    * s_msg
    */
 
-  LOCAL_AS u32 s_nr_buf[16];
+  LOCAL_VK u32 s_nr_buf[16];
 
   for (u32 i = lid; i < 16; i += lsz)
   {
     s_nr_buf[i] = hc_swap32_S (esalt_bufs[digests_offset].nr_buf[i]);
   }
 
-  LOCAL_AS u32 s_msg_buf[128];
+  LOCAL_VK u32 s_msg_buf[128];
 
   for (u32 i = lid; i < 128; i += lsz)
   {
     s_msg_buf[i] = hc_swap32_S (esalt_bufs[digests_offset].msg_buf[i]);
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m05400_a0-pure.cl b/OpenCL/m05400_a0-pure.cl
index de3dee4bf..f909c3846 100644
--- a/OpenCL/m05400_a0-pure.cl
+++ b/OpenCL/m05400_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
diff --git a/OpenCL/m05400_a1-optimized.cl b/OpenCL/m05400_a1-optimized.cl
index 576f280d4..b8b95ce91 100644
--- a/OpenCL/m05400_a1-optimized.cl
+++ b/OpenCL/m05400_a1-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
@@ -126,21 +127,21 @@ KERNEL_FQ void m05400_m04 (KERN_ATTR_ESALT (ikepsk_t))
    * s_msg
    */
 
-  LOCAL_AS u32 s_nr_buf[16];
+  LOCAL_VK u32 s_nr_buf[16];
 
   for (u32 i = lid; i < 16; i += lsz)
   {
     s_nr_buf[i] = hc_swap32_S (esalt_bufs[digests_offset].nr_buf[i]);
   }
 
-  LOCAL_AS u32 s_msg_buf[128];
+  LOCAL_VK u32 s_msg_buf[128];
 
   for (u32 i = lid; i < 128; i += lsz)
   {
     s_msg_buf[i] = hc_swap32_S (esalt_bufs[digests_offset].msg_buf[i]);
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -378,21 +379,21 @@ KERNEL_FQ void m05400_s04 (KERN_ATTR_ESALT (ikepsk_t))
    * s_msg
    */
 
-  LOCAL_AS u32 s_nr_buf[16];
+  LOCAL_VK u32 s_nr_buf[16];
 
   for (u32 i = lid; i < 16; i += lsz)
   {
     s_nr_buf[i] = hc_swap32_S (esalt_bufs[digests_offset].nr_buf[i]);
   }
 
-  LOCAL_AS u32 s_msg_buf[128];
+  LOCAL_VK u32 s_msg_buf[128];
 
   for (u32 i = lid; i < 128; i += lsz)
   {
     s_msg_buf[i] = hc_swap32_S (esalt_bufs[digests_offset].msg_buf[i]);
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m05400_a1-pure.cl b/OpenCL/m05400_a1-pure.cl
index abb5cfce3..f2d054bc6 100644
--- a/OpenCL/m05400_a1-pure.cl
+++ b/OpenCL/m05400_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m05400_a3-optimized.cl b/OpenCL/m05400_a3-optimized.cl
index 21d723531..7919d7a9a 100644
--- a/OpenCL/m05400_a3-optimized.cl
+++ b/OpenCL/m05400_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
@@ -432,21 +433,21 @@ KERNEL_FQ void m05400_m04 (KERN_ATTR_ESALT (ikepsk_t))
    * s_msg
    */
 
-  LOCAL_AS u32 s_nr_buf[16];
+  LOCAL_VK u32 s_nr_buf[16];
 
   for (u32 i = lid; i < 16; i += lsz)
   {
     s_nr_buf[i] = hc_swap32_S (esalt_bufs[digests_offset].nr_buf[i]);
   }
 
-  LOCAL_AS u32 s_msg_buf[128];
+  LOCAL_VK u32 s_msg_buf[128];
 
   for (u32 i = lid; i < 128; i += lsz)
   {
     s_msg_buf[i] = hc_swap32_S (esalt_bufs[digests_offset].msg_buf[i]);
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -505,21 +506,21 @@ KERNEL_FQ void m05400_m08 (KERN_ATTR_ESALT (ikepsk_t))
    * s_msg
    */
 
-  LOCAL_AS u32 s_nr_buf[16];
+  LOCAL_VK u32 s_nr_buf[16];
 
   for (u32 i = lid; i < 16; i += lsz)
   {
     s_nr_buf[i] = hc_swap32_S (esalt_bufs[digests_offset].nr_buf[i]);
   }
 
-  LOCAL_AS u32 s_msg_buf[128];
+  LOCAL_VK u32 s_msg_buf[128];
 
   for (u32 i = lid; i < 128; i += lsz)
   {
     s_msg_buf[i] = hc_swap32_S (esalt_bufs[digests_offset].msg_buf[i]);
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -578,21 +579,21 @@ KERNEL_FQ void m05400_m16 (KERN_ATTR_ESALT (ikepsk_t))
    * s_msg
    */
 
-  LOCAL_AS u32 s_nr_buf[16];
+  LOCAL_VK u32 s_nr_buf[16];
 
   for (u32 i = lid; i < 16; i += lsz)
   {
     s_nr_buf[i] = hc_swap32_S (esalt_bufs[digests_offset].nr_buf[i]);
   }
 
-  LOCAL_AS u32 s_msg_buf[128];
+  LOCAL_VK u32 s_msg_buf[128];
 
   for (u32 i = lid; i < 128; i += lsz)
   {
     s_msg_buf[i] = hc_swap32_S (esalt_bufs[digests_offset].msg_buf[i]);
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -651,21 +652,21 @@ KERNEL_FQ void m05400_s04 (KERN_ATTR_ESALT (ikepsk_t))
    * s_msg
    */
 
-  LOCAL_AS u32 s_nr_buf[16];
+  LOCAL_VK u32 s_nr_buf[16];
 
   for (u32 i = lid; i < 16; i += lsz)
   {
     s_nr_buf[i] = hc_swap32_S (esalt_bufs[digests_offset].nr_buf[i]);
   }
 
-  LOCAL_AS u32 s_msg_buf[128];
+  LOCAL_VK u32 s_msg_buf[128];
 
   for (u32 i = lid; i < 128; i += lsz)
   {
     s_msg_buf[i] = hc_swap32_S (esalt_bufs[digests_offset].msg_buf[i]);
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -724,21 +725,21 @@ KERNEL_FQ void m05400_s08 (KERN_ATTR_ESALT (ikepsk_t))
    * s_msg
    */
 
-  LOCAL_AS u32 s_nr_buf[16];
+  LOCAL_VK u32 s_nr_buf[16];
 
   for (u32 i = lid; i < 16; i += lsz)
   {
     s_nr_buf[i] = hc_swap32_S (esalt_bufs[digests_offset].nr_buf[i]);
   }
 
-  LOCAL_AS u32 s_msg_buf[128];
+  LOCAL_VK u32 s_msg_buf[128];
 
   for (u32 i = lid; i < 128; i += lsz)
   {
     s_msg_buf[i] = hc_swap32_S (esalt_bufs[digests_offset].msg_buf[i]);
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -797,21 +798,21 @@ KERNEL_FQ void m05400_s16 (KERN_ATTR_ESALT (ikepsk_t))
    * s_msg
    */
 
-  LOCAL_AS u32 s_nr_buf[16];
+  LOCAL_VK u32 s_nr_buf[16];
 
   for (u32 i = lid; i < 16; i += lsz)
   {
     s_nr_buf[i] = hc_swap32_S (esalt_bufs[digests_offset].nr_buf[i]);
   }
 
-  LOCAL_AS u32 s_msg_buf[128];
+  LOCAL_VK u32 s_msg_buf[128];
 
   for (u32 i = lid; i < 128; i += lsz)
   {
     s_msg_buf[i] = hc_swap32_S (esalt_bufs[digests_offset].msg_buf[i]);
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m05400_a3-pure.cl b/OpenCL/m05400_a3-pure.cl
index ed16f2ca9..8e4c63305 100644
--- a/OpenCL/m05400_a3-pure.cl
+++ b/OpenCL/m05400_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m05500_a0-optimized.cl b/OpenCL/m05500_a0-optimized.cl
index 7db6100e7..6a5c8451c 100644
--- a/OpenCL/m05500_a0-optimized.cl
+++ b/OpenCL/m05500_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
@@ -47,7 +48,7 @@ typedef struct netntlm
   a  = a ^ tt;              \
 }
 
-CONSTANT_AS u32a c_SPtrans[8][64] =
+CONSTANT_VK u32a c_SPtrans[8][64] =
 {
   {
     0x02080800, 0x00080000, 0x02000002, 0x02080802,
@@ -195,7 +196,7 @@ CONSTANT_AS u32a c_SPtrans[8][64] =
   }
 };
 
-CONSTANT_AS u32a c_skb[8][64] =
+CONSTANT_VK u32a c_skb[8][64] =
 {
   {
     0x00000000, 0x00000010, 0x20000000, 0x20000010,
@@ -346,13 +347,13 @@ CONSTANT_AS u32a c_skb[8][64] =
 #if   VECT_SIZE == 1
 #define BOX(i,n,S) (S)[(n)][(i)]
 #elif VECT_SIZE == 2
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1])
 #elif VECT_SIZE == 4
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3])
 #elif VECT_SIZE == 8
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7])
 #elif VECT_SIZE == 16
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf])
 #endif
 
 DECLSPEC void _des_crypt_encrypt (u32x *iv, u32x *data, u32x *Kc, u32x *Kd, LOCAL_AS u32 (*s_SPtrans)[64])
@@ -515,8 +516,8 @@ KERNEL_FQ void m05500_m04 (KERN_ATTR_RULES ())
    * sbox, kbox
    */
 
-  LOCAL_AS u32 s_SPtrans[8][64];
-  LOCAL_AS u32 s_skb[8][64];
+  LOCAL_VK u32 s_SPtrans[8][64];
+  LOCAL_VK u32 s_skb[8][64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
@@ -539,7 +540,7 @@ KERNEL_FQ void m05500_m04 (KERN_ATTR_RULES ())
     s_skb[7][i] = c_skb[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -728,8 +729,8 @@ KERNEL_FQ void m05500_s04 (KERN_ATTR_RULES ())
    * sbox, kbox
    */
 
-  LOCAL_AS u32 s_SPtrans[8][64];
-  LOCAL_AS u32 s_skb[8][64];
+  LOCAL_VK u32 s_SPtrans[8][64];
+  LOCAL_VK u32 s_skb[8][64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
@@ -752,7 +753,7 @@ KERNEL_FQ void m05500_s04 (KERN_ATTR_RULES ())
     s_skb[7][i] = c_skb[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m05500_a0-pure.cl b/OpenCL/m05500_a0-pure.cl
index f83be3388..7681eea15 100644
--- a/OpenCL/m05500_a0-pure.cl
+++ b/OpenCL/m05500_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
@@ -47,7 +48,7 @@ typedef struct netntlm
   a  = a ^ tt;              \
 }
 
-CONSTANT_AS u32a c_SPtrans[8][64] =
+CONSTANT_VK u32a c_SPtrans[8][64] =
 {
   {
     0x02080800, 0x00080000, 0x02000002, 0x02080802,
@@ -195,7 +196,7 @@ CONSTANT_AS u32a c_SPtrans[8][64] =
   }
 };
 
-CONSTANT_AS u32a c_skb[8][64] =
+CONSTANT_VK u32a c_skb[8][64] =
 {
   {
     0x00000000, 0x00000010, 0x20000000, 0x20000010,
@@ -346,13 +347,13 @@ CONSTANT_AS u32a c_skb[8][64] =
 #if   VECT_SIZE == 1
 #define BOX(i,n,S) (S)[(n)][(i)]
 #elif VECT_SIZE == 2
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1])
 #elif VECT_SIZE == 4
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3])
 #elif VECT_SIZE == 8
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7])
 #elif VECT_SIZE == 16
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf])
 #endif
 
 DECLSPEC void _des_crypt_encrypt (u32 *iv, u32 *data, u32 *Kc, u32 *Kd, LOCAL_AS u32 (*s_SPtrans)[64])
@@ -515,8 +516,8 @@ KERNEL_FQ void m05500_mxx (KERN_ATTR_RULES ())
    * sbox, kbox
    */
 
-  LOCAL_AS u32 s_SPtrans[8][64];
-  LOCAL_AS u32 s_skb[8][64];
+  LOCAL_VK u32 s_SPtrans[8][64];
+  LOCAL_VK u32 s_skb[8][64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
@@ -539,7 +540,7 @@ KERNEL_FQ void m05500_mxx (KERN_ATTR_RULES ())
     s_skb[7][i] = c_skb[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -639,8 +640,8 @@ KERNEL_FQ void m05500_sxx (KERN_ATTR_RULES ())
    * sbox, kbox
    */
 
-  LOCAL_AS u32 s_SPtrans[8][64];
-  LOCAL_AS u32 s_skb[8][64];
+  LOCAL_VK u32 s_SPtrans[8][64];
+  LOCAL_VK u32 s_skb[8][64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
@@ -663,7 +664,7 @@ KERNEL_FQ void m05500_sxx (KERN_ATTR_RULES ())
     s_skb[7][i] = c_skb[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m05500_a1-optimized.cl b/OpenCL/m05500_a1-optimized.cl
index 20de5bc7e..d4ae2526a 100644
--- a/OpenCL/m05500_a1-optimized.cl
+++ b/OpenCL/m05500_a1-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md4.cl"
@@ -45,7 +46,7 @@ typedef struct netntlm
   a  = a ^ tt;              \
 }
 
-CONSTANT_AS u32a c_SPtrans[8][64] =
+CONSTANT_VK u32a c_SPtrans[8][64] =
 {
   {
     0x02080800, 0x00080000, 0x02000002, 0x02080802,
@@ -193,7 +194,7 @@ CONSTANT_AS u32a c_SPtrans[8][64] =
   }
 };
 
-CONSTANT_AS u32a c_skb[8][64] =
+CONSTANT_VK u32a c_skb[8][64] =
 {
   {
     0x00000000, 0x00000010, 0x20000000, 0x20000010,
@@ -344,13 +345,13 @@ CONSTANT_AS u32a c_skb[8][64] =
 #if   VECT_SIZE == 1
 #define BOX(i,n,S) (S)[(n)][(i)]
 #elif VECT_SIZE == 2
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1])
 #elif VECT_SIZE == 4
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3])
 #elif VECT_SIZE == 8
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7])
 #elif VECT_SIZE == 16
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf])
 #endif
 
 DECLSPEC void _des_crypt_encrypt (u32x *iv, u32x *data, u32x *Kc, u32x *Kd, LOCAL_AS u32 (*s_SPtrans)[64])
@@ -513,8 +514,8 @@ KERNEL_FQ void m05500_m04 (KERN_ATTR_BASIC ())
    * sbox, kbox
    */
 
-  LOCAL_AS u32 s_SPtrans[8][64];
-  LOCAL_AS u32 s_skb[8][64];
+  LOCAL_VK u32 s_SPtrans[8][64];
+  LOCAL_VK u32 s_skb[8][64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
@@ -537,7 +538,7 @@ KERNEL_FQ void m05500_m04 (KERN_ATTR_BASIC ())
     s_skb[7][i] = c_skb[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -779,8 +780,8 @@ KERNEL_FQ void m05500_s04 (KERN_ATTR_BASIC ())
    * sbox, kbox
    */
 
-  LOCAL_AS u32 s_SPtrans[8][64];
-  LOCAL_AS u32 s_skb[8][64];
+  LOCAL_VK u32 s_SPtrans[8][64];
+  LOCAL_VK u32 s_skb[8][64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
@@ -803,7 +804,7 @@ KERNEL_FQ void m05500_s04 (KERN_ATTR_BASIC ())
     s_skb[7][i] = c_skb[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m05500_a1-pure.cl b/OpenCL/m05500_a1-pure.cl
index 14e152681..28b5a627c 100644
--- a/OpenCL/m05500_a1-pure.cl
+++ b/OpenCL/m05500_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_md4.cl"
@@ -45,7 +46,7 @@ typedef struct netntlm
   a  = a ^ tt;              \
 }
 
-CONSTANT_AS u32a c_SPtrans[8][64] =
+CONSTANT_VK u32a c_SPtrans[8][64] =
 {
   {
     0x02080800, 0x00080000, 0x02000002, 0x02080802,
@@ -193,7 +194,7 @@ CONSTANT_AS u32a c_SPtrans[8][64] =
   }
 };
 
-CONSTANT_AS u32a c_skb[8][64] =
+CONSTANT_VK u32a c_skb[8][64] =
 {
   {
     0x00000000, 0x00000010, 0x20000000, 0x20000010,
@@ -344,13 +345,13 @@ CONSTANT_AS u32a c_skb[8][64] =
 #if   VECT_SIZE == 1
 #define BOX(i,n,S) (S)[(n)][(i)]
 #elif VECT_SIZE == 2
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1])
 #elif VECT_SIZE == 4
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3])
 #elif VECT_SIZE == 8
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7])
 #elif VECT_SIZE == 16
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf])
 #endif
 
 DECLSPEC void _des_crypt_encrypt (u32 *iv, u32 *data, u32 *Kc, u32 *Kd, LOCAL_AS u32 (*s_SPtrans)[64])
@@ -513,8 +514,8 @@ KERNEL_FQ void m05500_mxx (KERN_ATTR_BASIC ())
    * sbox, kbox
    */
 
-  LOCAL_AS u32 s_SPtrans[8][64];
-  LOCAL_AS u32 s_skb[8][64];
+  LOCAL_VK u32 s_SPtrans[8][64];
+  LOCAL_VK u32 s_skb[8][64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
@@ -537,7 +538,7 @@ KERNEL_FQ void m05500_mxx (KERN_ATTR_BASIC ())
     s_skb[7][i] = c_skb[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -635,8 +636,8 @@ KERNEL_FQ void m05500_sxx (KERN_ATTR_BASIC ())
    * sbox, kbox
    */
 
-  LOCAL_AS u32 s_SPtrans[8][64];
-  LOCAL_AS u32 s_skb[8][64];
+  LOCAL_VK u32 s_SPtrans[8][64];
+  LOCAL_VK u32 s_skb[8][64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
@@ -659,7 +660,7 @@ KERNEL_FQ void m05500_sxx (KERN_ATTR_BASIC ())
     s_skb[7][i] = c_skb[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m05500_a3-optimized.cl b/OpenCL/m05500_a3-optimized.cl
index 5f43cd563..90a3c3b14 100644
--- a/OpenCL/m05500_a3-optimized.cl
+++ b/OpenCL/m05500_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md4.cl"
@@ -45,7 +46,7 @@ typedef struct netntlm
   a  = a ^ tt;              \
 }
 
-CONSTANT_AS u32a c_SPtrans[8][64] =
+CONSTANT_VK u32a c_SPtrans[8][64] =
 {
   {
     0x02080800, 0x00080000, 0x02000002, 0x02080802,
@@ -193,7 +194,7 @@ CONSTANT_AS u32a c_SPtrans[8][64] =
   }
 };
 
-CONSTANT_AS u32a c_skb[8][64] =
+CONSTANT_VK u32a c_skb[8][64] =
 {
   {
     0x00000000, 0x00000010, 0x20000000, 0x20000010,
@@ -344,13 +345,13 @@ CONSTANT_AS u32a c_skb[8][64] =
 #if   VECT_SIZE == 1
 #define BOX(i,n,S) (S)[(n)][(i)]
 #elif VECT_SIZE == 2
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1])
 #elif VECT_SIZE == 4
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3])
 #elif VECT_SIZE == 8
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7])
 #elif VECT_SIZE == 16
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf])
 #endif
 
 DECLSPEC void _des_crypt_encrypt (u32x *iv, u32x *data, u32x *Kc, u32x *Kd, LOCAL_AS u32 (*s_SPtrans)[64])
@@ -846,8 +847,8 @@ KERNEL_FQ void m05500_m04 (KERN_ATTR_VECTOR ())
    * sbox, kbox
    */
 
-  LOCAL_AS u32 s_SPtrans[8][64];
-  LOCAL_AS u32 s_skb[8][64];
+  LOCAL_VK u32 s_SPtrans[8][64];
+  LOCAL_VK u32 s_skb[8][64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
@@ -870,7 +871,7 @@ KERNEL_FQ void m05500_m04 (KERN_ATTR_VECTOR ())
     s_skb[7][i] = c_skb[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -920,8 +921,8 @@ KERNEL_FQ void m05500_m08 (KERN_ATTR_VECTOR ())
    * sbox, kbox
    */
 
-  LOCAL_AS u32 s_SPtrans[8][64];
-  LOCAL_AS u32 s_skb[8][64];
+  LOCAL_VK u32 s_SPtrans[8][64];
+  LOCAL_VK u32 s_skb[8][64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
@@ -944,7 +945,7 @@ KERNEL_FQ void m05500_m08 (KERN_ATTR_VECTOR ())
     s_skb[7][i] = c_skb[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -994,8 +995,8 @@ KERNEL_FQ void m05500_m16 (KERN_ATTR_VECTOR ())
    * sbox, kbox
    */
 
-  LOCAL_AS u32 s_SPtrans[8][64];
-  LOCAL_AS u32 s_skb[8][64];
+  LOCAL_VK u32 s_SPtrans[8][64];
+  LOCAL_VK u32 s_skb[8][64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
@@ -1018,7 +1019,7 @@ KERNEL_FQ void m05500_m16 (KERN_ATTR_VECTOR ())
     s_skb[7][i] = c_skb[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -1068,8 +1069,8 @@ KERNEL_FQ void m05500_s04 (KERN_ATTR_VECTOR ())
    * sbox, kbox
    */
 
-  LOCAL_AS u32 s_SPtrans[8][64];
-  LOCAL_AS u32 s_skb[8][64];
+  LOCAL_VK u32 s_SPtrans[8][64];
+  LOCAL_VK u32 s_skb[8][64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
@@ -1092,7 +1093,7 @@ KERNEL_FQ void m05500_s04 (KERN_ATTR_VECTOR ())
     s_skb[7][i] = c_skb[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -1142,8 +1143,8 @@ KERNEL_FQ void m05500_s08 (KERN_ATTR_VECTOR ())
    * sbox, kbox
    */
 
-  LOCAL_AS u32 s_SPtrans[8][64];
-  LOCAL_AS u32 s_skb[8][64];
+  LOCAL_VK u32 s_SPtrans[8][64];
+  LOCAL_VK u32 s_skb[8][64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
@@ -1166,7 +1167,7 @@ KERNEL_FQ void m05500_s08 (KERN_ATTR_VECTOR ())
     s_skb[7][i] = c_skb[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -1216,8 +1217,8 @@ KERNEL_FQ void m05500_s16 (KERN_ATTR_VECTOR ())
    * sbox, kbox
    */
 
-  LOCAL_AS u32 s_SPtrans[8][64];
-  LOCAL_AS u32 s_skb[8][64];
+  LOCAL_VK u32 s_SPtrans[8][64];
+  LOCAL_VK u32 s_skb[8][64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
@@ -1240,7 +1241,7 @@ KERNEL_FQ void m05500_s16 (KERN_ATTR_VECTOR ())
     s_skb[7][i] = c_skb[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m05500_a3-pure.cl b/OpenCL/m05500_a3-pure.cl
index c4aa11718..25e6392fb 100644
--- a/OpenCL/m05500_a3-pure.cl
+++ b/OpenCL/m05500_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md4.cl"
@@ -45,7 +46,7 @@ typedef struct netntlm
   a  = a ^ tt;              \
 }
 
-CONSTANT_AS u32a c_SPtrans[8][64] =
+CONSTANT_VK u32a c_SPtrans[8][64] =
 {
   {
     0x02080800, 0x00080000, 0x02000002, 0x02080802,
@@ -193,7 +194,7 @@ CONSTANT_AS u32a c_SPtrans[8][64] =
   }
 };
 
-CONSTANT_AS u32a c_skb[8][64] =
+CONSTANT_VK u32a c_skb[8][64] =
 {
   {
     0x00000000, 0x00000010, 0x20000000, 0x20000010,
@@ -344,13 +345,13 @@ CONSTANT_AS u32a c_skb[8][64] =
 #if   VECT_SIZE == 1
 #define BOX(i,n,S) (S)[(n)][(i)]
 #elif VECT_SIZE == 2
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1])
 #elif VECT_SIZE == 4
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3])
 #elif VECT_SIZE == 8
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7])
 #elif VECT_SIZE == 16
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf])
 #endif
 
 DECLSPEC void _des_crypt_encrypt (u32x *iv, u32x *data, u32x *Kc, u32x *Kd, LOCAL_AS u32 (*s_SPtrans)[64])
@@ -513,8 +514,8 @@ KERNEL_FQ void m05500_mxx (KERN_ATTR_VECTOR ())
    * sbox, kbox
    */
 
-  LOCAL_AS u32 s_SPtrans[8][64];
-  LOCAL_AS u32 s_skb[8][64];
+  LOCAL_VK u32 s_SPtrans[8][64];
+  LOCAL_VK u32 s_skb[8][64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
@@ -537,7 +538,7 @@ KERNEL_FQ void m05500_mxx (KERN_ATTR_VECTOR ())
     s_skb[7][i] = c_skb[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -648,8 +649,8 @@ KERNEL_FQ void m05500_sxx (KERN_ATTR_VECTOR ())
    * sbox, kbox
    */
 
-  LOCAL_AS u32 s_SPtrans[8][64];
-  LOCAL_AS u32 s_skb[8][64];
+  LOCAL_VK u32 s_SPtrans[8][64];
+  LOCAL_VK u32 s_skb[8][64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
@@ -672,7 +673,7 @@ KERNEL_FQ void m05500_sxx (KERN_ATTR_VECTOR ())
     s_skb[7][i] = c_skb[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m05600_a0-optimized.cl b/OpenCL/m05600_a0-optimized.cl
index a1cb6f71d..d0eef25cd 100644
--- a/OpenCL/m05600_a0-optimized.cl
+++ b/OpenCL/m05600_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
@@ -127,21 +128,21 @@ KERNEL_FQ void m05600_m04 (KERN_ATTR_RULES_ESALT (netntlm_t))
    * salt
    */
 
-  LOCAL_AS u32 s_userdomain_buf[64];
+  LOCAL_VK u32 s_userdomain_buf[64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
     s_userdomain_buf[i] = esalt_bufs[digests_offset].userdomain_buf[i];
   }
 
-  LOCAL_AS u32 s_chall_buf[256];
+  LOCAL_VK u32 s_chall_buf[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
     s_chall_buf[i] = esalt_bufs[digests_offset].chall_buf[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -366,21 +367,21 @@ KERNEL_FQ void m05600_s04 (KERN_ATTR_RULES_ESALT (netntlm_t))
    * salt
    */
 
-  LOCAL_AS u32 s_userdomain_buf[64];
+  LOCAL_VK u32 s_userdomain_buf[64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
     s_userdomain_buf[i] = esalt_bufs[digests_offset].userdomain_buf[i];
   }
 
-  LOCAL_AS u32 s_chall_buf[256];
+  LOCAL_VK u32 s_chall_buf[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
     s_chall_buf[i] = esalt_bufs[digests_offset].chall_buf[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m05600_a0-pure.cl b/OpenCL/m05600_a0-pure.cl
index 699e5e2df..25d56067f 100644
--- a/OpenCL/m05600_a0-pure.cl
+++ b/OpenCL/m05600_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
diff --git a/OpenCL/m05600_a1-optimized.cl b/OpenCL/m05600_a1-optimized.cl
index 16af9bf75..dec67ea96 100644
--- a/OpenCL/m05600_a1-optimized.cl
+++ b/OpenCL/m05600_a1-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md4.cl"
@@ -125,21 +126,21 @@ KERNEL_FQ void m05600_m04 (KERN_ATTR_ESALT (netntlm_t))
    * salt
    */
 
-  LOCAL_AS u32 s_userdomain_buf[64];
+  LOCAL_VK u32 s_userdomain_buf[64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
     s_userdomain_buf[i] = esalt_bufs[digests_offset].userdomain_buf[i];
   }
 
-  LOCAL_AS u32 s_chall_buf[256];
+  LOCAL_VK u32 s_chall_buf[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
     s_chall_buf[i] = esalt_bufs[digests_offset].chall_buf[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -422,21 +423,21 @@ KERNEL_FQ void m05600_s04 (KERN_ATTR_ESALT (netntlm_t))
    * salt
    */
 
-  LOCAL_AS u32 s_userdomain_buf[64];
+  LOCAL_VK u32 s_userdomain_buf[64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
     s_userdomain_buf[i] = esalt_bufs[digests_offset].userdomain_buf[i];
   }
 
-  LOCAL_AS u32 s_chall_buf[256];
+  LOCAL_VK u32 s_chall_buf[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
     s_chall_buf[i] = esalt_bufs[digests_offset].chall_buf[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m05600_a1-pure.cl b/OpenCL/m05600_a1-pure.cl
index aefa8323c..91ea91a53 100644
--- a/OpenCL/m05600_a1-pure.cl
+++ b/OpenCL/m05600_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_md4.cl"
diff --git a/OpenCL/m05600_a3-optimized.cl b/OpenCL/m05600_a3-optimized.cl
index ba42483e9..1706db730 100644
--- a/OpenCL/m05600_a3-optimized.cl
+++ b/OpenCL/m05600_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md4.cl"
@@ -549,21 +550,21 @@ KERNEL_FQ void m05600_m04 (KERN_ATTR_ESALT (netntlm_t))
    * salt
    */
 
-  LOCAL_AS u32 s_userdomain_buf[64];
+  LOCAL_VK u32 s_userdomain_buf[64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
     s_userdomain_buf[i] = esalt_bufs[digests_offset].userdomain_buf[i];
   }
 
-  LOCAL_AS u32 s_chall_buf[256];
+  LOCAL_VK u32 s_chall_buf[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
     s_chall_buf[i] = esalt_bufs[digests_offset].chall_buf[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -622,21 +623,21 @@ KERNEL_FQ void m05600_m08 (KERN_ATTR_ESALT (netntlm_t))
    * salt
    */
 
-  LOCAL_AS u32 s_userdomain_buf[64];
+  LOCAL_VK u32 s_userdomain_buf[64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
     s_userdomain_buf[i] = esalt_bufs[digests_offset].userdomain_buf[i];
   }
 
-  LOCAL_AS u32 s_chall_buf[256];
+  LOCAL_VK u32 s_chall_buf[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
     s_chall_buf[i] = esalt_bufs[digests_offset].chall_buf[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -695,21 +696,21 @@ KERNEL_FQ void m05600_m16 (KERN_ATTR_ESALT (netntlm_t))
    * salt
    */
 
-  LOCAL_AS u32 s_userdomain_buf[64];
+  LOCAL_VK u32 s_userdomain_buf[64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
     s_userdomain_buf[i] = esalt_bufs[digests_offset].userdomain_buf[i];
   }
 
-  LOCAL_AS u32 s_chall_buf[256];
+  LOCAL_VK u32 s_chall_buf[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
     s_chall_buf[i] = esalt_bufs[digests_offset].chall_buf[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -768,21 +769,21 @@ KERNEL_FQ void m05600_s04 (KERN_ATTR_ESALT (netntlm_t))
    * salt
    */
 
-  LOCAL_AS u32 s_userdomain_buf[64];
+  LOCAL_VK u32 s_userdomain_buf[64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
     s_userdomain_buf[i] = esalt_bufs[digests_offset].userdomain_buf[i];
   }
 
-  LOCAL_AS u32 s_chall_buf[256];
+  LOCAL_VK u32 s_chall_buf[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
     s_chall_buf[i] = esalt_bufs[digests_offset].chall_buf[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -841,21 +842,21 @@ KERNEL_FQ void m05600_s08 (KERN_ATTR_ESALT (netntlm_t))
    * salt
    */
 
-  LOCAL_AS u32 s_userdomain_buf[64];
+  LOCAL_VK u32 s_userdomain_buf[64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
     s_userdomain_buf[i] = esalt_bufs[digests_offset].userdomain_buf[i];
   }
 
-  LOCAL_AS u32 s_chall_buf[256];
+  LOCAL_VK u32 s_chall_buf[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
     s_chall_buf[i] = esalt_bufs[digests_offset].chall_buf[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -914,21 +915,21 @@ KERNEL_FQ void m05600_s16 (KERN_ATTR_ESALT (netntlm_t))
    * salt
    */
 
-  LOCAL_AS u32 s_userdomain_buf[64];
+  LOCAL_VK u32 s_userdomain_buf[64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
     s_userdomain_buf[i] = esalt_bufs[digests_offset].userdomain_buf[i];
   }
 
-  LOCAL_AS u32 s_chall_buf[256];
+  LOCAL_VK u32 s_chall_buf[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
     s_chall_buf[i] = esalt_bufs[digests_offset].chall_buf[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m05600_a3-pure.cl b/OpenCL/m05600_a3-pure.cl
index d5d48f5b7..7f102a56f 100644
--- a/OpenCL/m05600_a3-pure.cl
+++ b/OpenCL/m05600_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_md4.cl"
diff --git a/OpenCL/m05800-optimized.cl b/OpenCL/m05800-optimized.cl
index 339376659..4de73c5ab 100644
--- a/OpenCL/m05800-optimized.cl
+++ b/OpenCL/m05800-optimized.cl
@@ -6,6 +6,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_hash_sha1.cl"
 #endif
@@ -19,7 +20,7 @@ typedef struct androidpin_tmp
 
 } androidpin_tmp_t;
 
-CONSTANT_AS u32a c_pc_dec[1024] =
+CONSTANT_VK u32a c_pc_dec[1024] =
 {
   0x00000030,
   0x00000031,
@@ -1047,7 +1048,7 @@ CONSTANT_AS u32a c_pc_dec[1024] =
   0x33323031,
 };
 
-CONSTANT_AS u32a c_pc_len[1024] =
+CONSTANT_VK u32a c_pc_len[1024] =
 {
   1,
   1,
@@ -2299,8 +2300,8 @@ KERNEL_FQ void m05800_loop (KERN_ATTR_TMPS (androidpin_tmp_t))
    * cache precomputed conversion table in shared memory
    */
 
-  LOCAL_AS u32 s_pc_dec[1024];
-  LOCAL_AS u32 s_pc_len[1024];
+  LOCAL_VK u32 s_pc_dec[1024];
+  LOCAL_VK u32 s_pc_len[1024];
 
   for (u32 i = lid; i < 1024; i += lsz)
   {
@@ -2308,7 +2309,7 @@ KERNEL_FQ void m05800_loop (KERN_ATTR_TMPS (androidpin_tmp_t))
     s_pc_len[i] = c_pc_len[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m05800-pure.cl b/OpenCL/m05800-pure.cl
index a397ae119..6e6f0d14c 100644
--- a/OpenCL/m05800-pure.cl
+++ b/OpenCL/m05800-pure.cl
@@ -6,6 +6,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_hash_sha1.cl"
 #endif
@@ -19,7 +20,7 @@ typedef struct androidpin_tmp
 
 } androidpin_tmp_t;
 
-CONSTANT_AS u32a c_pc_dec[1024] =
+CONSTANT_VK u32a c_pc_dec[1024] =
 {
   0x00000030,
   0x00000031,
@@ -1047,7 +1048,7 @@ CONSTANT_AS u32a c_pc_dec[1024] =
   0x33323031,
 };
 
-CONSTANT_AS u32a c_pc_len[1024] =
+CONSTANT_VK u32a c_pc_len[1024] =
 {
   1,
   1,
@@ -2119,8 +2120,8 @@ KERNEL_FQ void m05800_loop (KERN_ATTR_TMPS (androidpin_tmp_t))
    * cache precomputed conversion table in shared memory
    */
 
-  LOCAL_AS u32 s_pc_dec[1024];
-  LOCAL_AS u32 s_pc_len[1024];
+  LOCAL_VK u32 s_pc_dec[1024];
+  LOCAL_VK u32 s_pc_len[1024];
 
   for (u32 i = lid; i < 1024; i += lsz)
   {
@@ -2128,7 +2129,7 @@ KERNEL_FQ void m05800_loop (KERN_ATTR_TMPS (androidpin_tmp_t))
     s_pc_len[i] = c_pc_len[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m06000_a0-optimized.cl b/OpenCL/m06000_a0-optimized.cl
index e38bfd5a6..87ef31990 100644
--- a/OpenCL/m06000_a0-optimized.cl
+++ b/OpenCL/m06000_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
diff --git a/OpenCL/m06000_a0-pure.cl b/OpenCL/m06000_a0-pure.cl
index 0573efd7d..d25b558ae 100644
--- a/OpenCL/m06000_a0-pure.cl
+++ b/OpenCL/m06000_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
diff --git a/OpenCL/m06000_a1-optimized.cl b/OpenCL/m06000_a1-optimized.cl
index 640132315..05934117d 100644
--- a/OpenCL/m06000_a1-optimized.cl
+++ b/OpenCL/m06000_a1-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_ripemd160.cl"
diff --git a/OpenCL/m06000_a1-pure.cl b/OpenCL/m06000_a1-pure.cl
index fbbea68ad..1eda2781d 100644
--- a/OpenCL/m06000_a1-pure.cl
+++ b/OpenCL/m06000_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_ripemd160.cl"
diff --git a/OpenCL/m06000_a3-optimized.cl b/OpenCL/m06000_a3-optimized.cl
index 966fcf15f..fab740436 100644
--- a/OpenCL/m06000_a3-optimized.cl
+++ b/OpenCL/m06000_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_ripemd160.cl"
diff --git a/OpenCL/m06000_a3-pure.cl b/OpenCL/m06000_a3-pure.cl
index d7bbe5a02..3fbbaca59 100644
--- a/OpenCL/m06000_a3-pure.cl
+++ b/OpenCL/m06000_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_ripemd160.cl"
diff --git a/OpenCL/m06100_a0-optimized.cl b/OpenCL/m06100_a0-optimized.cl
index 046d0c4f9..7b1c6eee9 100644
--- a/OpenCL/m06100_a0-optimized.cl
+++ b/OpenCL/m06100_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
@@ -36,8 +37,8 @@ KERNEL_FQ void m06100_m04 (KERN_ATTR_RULES ())
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_Ch[8][256];
-  LOCAL_AS u32 s_Cl[8][256];
+  LOCAL_VK u32 s_Ch[8][256];
+  LOCAL_VK u32 s_Cl[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -60,7 +61,7 @@ KERNEL_FQ void m06100_m04 (KERN_ATTR_RULES ())
     s_Cl[7][i] = Cl[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -176,8 +177,8 @@ KERNEL_FQ void m06100_s04 (KERN_ATTR_RULES ())
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_Ch[8][256];
-  LOCAL_AS u32 s_Cl[8][256];
+  LOCAL_VK u32 s_Ch[8][256];
+  LOCAL_VK u32 s_Cl[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -200,7 +201,7 @@ KERNEL_FQ void m06100_s04 (KERN_ATTR_RULES ())
     s_Cl[7][i] = Cl[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m06100_a0-pure.cl b/OpenCL/m06100_a0-pure.cl
index deca7a8ba..408512f66 100644
--- a/OpenCL/m06100_a0-pure.cl
+++ b/OpenCL/m06100_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
@@ -31,8 +32,8 @@ KERNEL_FQ void m06100_mxx (KERN_ATTR_RULES ())
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_Ch[8][256];
-  LOCAL_AS u32 s_Cl[8][256];
+  LOCAL_VK u32 s_Ch[8][256];
+  LOCAL_VK u32 s_Cl[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -55,7 +56,7 @@ KERNEL_FQ void m06100_mxx (KERN_ATTR_RULES ())
     s_Cl[7][i] = Cl[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -115,8 +116,8 @@ KERNEL_FQ void m06100_sxx (KERN_ATTR_RULES ())
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_Ch[8][256];
-  LOCAL_AS u32 s_Cl[8][256];
+  LOCAL_VK u32 s_Ch[8][256];
+  LOCAL_VK u32 s_Cl[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -139,7 +140,7 @@ KERNEL_FQ void m06100_sxx (KERN_ATTR_RULES ())
     s_Cl[7][i] = Cl[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m06100_a1-optimized.cl b/OpenCL/m06100_a1-optimized.cl
index f81455869..54a9f492f 100644
--- a/OpenCL/m06100_a1-optimized.cl
+++ b/OpenCL/m06100_a1-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_whirlpool.cl"
@@ -34,8 +35,8 @@ KERNEL_FQ void m06100_m04 (KERN_ATTR_BASIC ())
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_Ch[8][256];
-  LOCAL_AS u32 s_Cl[8][256];
+  LOCAL_VK u32 s_Ch[8][256];
+  LOCAL_VK u32 s_Cl[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -58,7 +59,7 @@ KERNEL_FQ void m06100_m04 (KERN_ATTR_BASIC ())
     s_Cl[7][i] = Cl[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -232,8 +233,8 @@ KERNEL_FQ void m06100_s04 (KERN_ATTR_BASIC ())
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_Ch[8][256];
-  LOCAL_AS u32 s_Cl[8][256];
+  LOCAL_VK u32 s_Ch[8][256];
+  LOCAL_VK u32 s_Cl[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -256,7 +257,7 @@ KERNEL_FQ void m06100_s04 (KERN_ATTR_BASIC ())
     s_Cl[7][i] = Cl[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m06100_a1-pure.cl b/OpenCL/m06100_a1-pure.cl
index 611927094..7d7693177 100644
--- a/OpenCL/m06100_a1-pure.cl
+++ b/OpenCL/m06100_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_whirlpool.cl"
@@ -29,8 +30,8 @@ KERNEL_FQ void m06100_mxx (KERN_ATTR_BASIC ())
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_Ch[8][256];
-  LOCAL_AS u32 s_Cl[8][256];
+  LOCAL_VK u32 s_Ch[8][256];
+  LOCAL_VK u32 s_Cl[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -53,7 +54,7 @@ KERNEL_FQ void m06100_mxx (KERN_ATTR_BASIC ())
     s_Cl[7][i] = Cl[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -111,8 +112,8 @@ KERNEL_FQ void m06100_sxx (KERN_ATTR_BASIC ())
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_Ch[8][256];
-  LOCAL_AS u32 s_Cl[8][256];
+  LOCAL_VK u32 s_Ch[8][256];
+  LOCAL_VK u32 s_Cl[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -135,7 +136,7 @@ KERNEL_FQ void m06100_sxx (KERN_ATTR_BASIC ())
     s_Cl[7][i] = Cl[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m06100_a3-optimized.cl b/OpenCL/m06100_a3-optimized.cl
index da2d8745e..e61112e07 100644
--- a/OpenCL/m06100_a3-optimized.cl
+++ b/OpenCL/m06100_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_whirlpool.cl"
@@ -184,8 +185,8 @@ KERNEL_FQ void m06100_m04 (KERN_ATTR_BASIC ())
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_Ch[8][256];
-  LOCAL_AS u32 s_Cl[8][256];
+  LOCAL_VK u32 s_Ch[8][256];
+  LOCAL_VK u32 s_Cl[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -208,7 +209,7 @@ KERNEL_FQ void m06100_m04 (KERN_ATTR_BASIC ())
     s_Cl[7][i] = Cl[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -276,8 +277,8 @@ KERNEL_FQ void m06100_m08 (KERN_ATTR_BASIC ())
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_Ch[8][256];
-  LOCAL_AS u32 s_Cl[8][256];
+  LOCAL_VK u32 s_Ch[8][256];
+  LOCAL_VK u32 s_Cl[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -300,7 +301,7 @@ KERNEL_FQ void m06100_m08 (KERN_ATTR_BASIC ())
     s_Cl[7][i] = Cl[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -372,8 +373,8 @@ KERNEL_FQ void m06100_s04 (KERN_ATTR_BASIC ())
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_Ch[8][256];
-  LOCAL_AS u32 s_Cl[8][256];
+  LOCAL_VK u32 s_Ch[8][256];
+  LOCAL_VK u32 s_Cl[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -396,7 +397,7 @@ KERNEL_FQ void m06100_s04 (KERN_ATTR_BASIC ())
     s_Cl[7][i] = Cl[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -464,8 +465,8 @@ KERNEL_FQ void m06100_s08 (KERN_ATTR_BASIC ())
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_Ch[8][256];
-  LOCAL_AS u32 s_Cl[8][256];
+  LOCAL_VK u32 s_Ch[8][256];
+  LOCAL_VK u32 s_Cl[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -488,7 +489,7 @@ KERNEL_FQ void m06100_s08 (KERN_ATTR_BASIC ())
     s_Cl[7][i] = Cl[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m06100_a3-pure.cl b/OpenCL/m06100_a3-pure.cl
index 3f2f65bfd..8e4038c99 100644
--- a/OpenCL/m06100_a3-pure.cl
+++ b/OpenCL/m06100_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_whirlpool.cl"
@@ -29,8 +30,8 @@ KERNEL_FQ void m06100_mxx (KERN_ATTR_VECTOR ())
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_Ch[8][256];
-  LOCAL_AS u32 s_Cl[8][256];
+  LOCAL_VK u32 s_Ch[8][256];
+  LOCAL_VK u32 s_Cl[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -53,7 +54,7 @@ KERNEL_FQ void m06100_mxx (KERN_ATTR_VECTOR ())
     s_Cl[7][i] = Cl[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -124,8 +125,8 @@ KERNEL_FQ void m06100_sxx (KERN_ATTR_VECTOR ())
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_Ch[8][256];
-  LOCAL_AS u32 s_Cl[8][256];
+  LOCAL_VK u32 s_Ch[8][256];
+  LOCAL_VK u32 s_Cl[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -148,7 +149,7 @@ KERNEL_FQ void m06100_sxx (KERN_ATTR_VECTOR ())
     s_Cl[7][i] = Cl[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m06211-pure.cl b/OpenCL/m06211-pure.cl
index 529049401..62e9dd236 100644
--- a/OpenCL/m06211-pure.cl
+++ b/OpenCL/m06211-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_ripemd160.cl"
@@ -92,14 +93,14 @@ KERNEL_FQ void m06211_init (KERN_ATTR_TMPS_ESALT (tc_tmp_t, tc_t))
 
   const int keyboard_layout_mapping_cnt = esalt_bufs[digests_offset].keyboard_layout_mapping_cnt;
 
-  LOCAL_AS keyboard_layout_mapping_t s_keyboard_layout_mapping_buf[256];
+  LOCAL_VK keyboard_layout_mapping_t s_keyboard_layout_mapping_buf[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
     s_keyboard_layout_mapping_buf[i] = esalt_bufs[digests_offset].keyboard_layout_mapping_buf[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -304,17 +305,17 @@ KERNEL_FQ void m06211_comp (KERN_ATTR_TMPS_ESALT (tc_tmp_t, tc_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -331,7 +332,7 @@ KERNEL_FQ void m06211_comp (KERN_ATTR_TMPS_ESALT (tc_tmp_t, tc_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -373,14 +374,6 @@ KERNEL_FQ void m06211_comp (KERN_ATTR_TMPS_ESALT (tc_tmp_t, tc_t))
   ukey2[6] = tmps[gid].out[14];
   ukey2[7] = tmps[gid].out[15];
 
-  if (verify_header_aes (esalt_bufs[0].data_buf, esalt_bufs[0].signature, ukey1, ukey2, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1)
-  {
-    if (atomic_inc (&hashes_shown[0]) == 0)
-    {
-      mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, 0, gid, 0, 0, 0);
-    }
-  }
-
   if (verify_header_serpent (esalt_bufs[0].data_buf, esalt_bufs[0].signature, ukey1, ukey2) == 1)
   {
     if (atomic_inc (&hashes_shown[0]) == 0)
@@ -396,4 +389,12 @@ KERNEL_FQ void m06211_comp (KERN_ATTR_TMPS_ESALT (tc_tmp_t, tc_t))
       mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, 0, gid, 0, 0, 0);
     }
   }
+
+  if (verify_header_aes (esalt_bufs[0].data_buf, esalt_bufs[0].signature, ukey1, ukey2, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1)
+  {
+    if (atomic_inc (&hashes_shown[0]) == 0)
+    {
+      mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, 0, gid, 0, 0, 0);
+    }
+  }
 }
diff --git a/OpenCL/m06212-pure.cl b/OpenCL/m06212-pure.cl
index f6228daea..f8b2665b7 100644
--- a/OpenCL/m06212-pure.cl
+++ b/OpenCL/m06212-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_ripemd160.cl"
@@ -92,14 +93,14 @@ KERNEL_FQ void m06212_init (KERN_ATTR_TMPS_ESALT (tc_tmp_t, tc_t))
 
   const int keyboard_layout_mapping_cnt = esalt_bufs[digests_offset].keyboard_layout_mapping_cnt;
 
-  LOCAL_AS keyboard_layout_mapping_t s_keyboard_layout_mapping_buf[256];
+  LOCAL_VK keyboard_layout_mapping_t s_keyboard_layout_mapping_buf[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
     s_keyboard_layout_mapping_buf[i] = esalt_bufs[digests_offset].keyboard_layout_mapping_buf[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -304,17 +305,17 @@ KERNEL_FQ void m06212_comp (KERN_ATTR_TMPS_ESALT (tc_tmp_t, tc_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -331,7 +332,7 @@ KERNEL_FQ void m06212_comp (KERN_ATTR_TMPS_ESALT (tc_tmp_t, tc_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -373,14 +374,6 @@ KERNEL_FQ void m06212_comp (KERN_ATTR_TMPS_ESALT (tc_tmp_t, tc_t))
   ukey2[6] = tmps[gid].out[14];
   ukey2[7] = tmps[gid].out[15];
 
-  if (verify_header_aes (esalt_bufs[0].data_buf, esalt_bufs[0].signature, ukey1, ukey2, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1)
-  {
-    if (atomic_inc (&hashes_shown[0]) == 0)
-    {
-      mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, 0, gid, 0, 0, 0);
-    }
-  }
-
   if (verify_header_serpent (esalt_bufs[0].data_buf, esalt_bufs[0].signature, ukey1, ukey2) == 1)
   {
     if (atomic_inc (&hashes_shown[0]) == 0)
@@ -397,6 +390,14 @@ KERNEL_FQ void m06212_comp (KERN_ATTR_TMPS_ESALT (tc_tmp_t, tc_t))
     }
   }
 
+  if (verify_header_aes (esalt_bufs[0].data_buf, esalt_bufs[0].signature, ukey1, ukey2, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1)
+  {
+    if (atomic_inc (&hashes_shown[0]) == 0)
+    {
+      mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, 0, gid, 0, 0, 0);
+    }
+  }
+
   u32 ukey3[8];
 
   ukey3[0] = tmps[gid].out[16];
@@ -419,14 +420,6 @@ KERNEL_FQ void m06212_comp (KERN_ATTR_TMPS_ESALT (tc_tmp_t, tc_t))
   ukey4[6] = tmps[gid].out[30];
   ukey4[7] = tmps[gid].out[31];
 
-  if (verify_header_aes_twofish (esalt_bufs[0].data_buf, esalt_bufs[0].signature, ukey1, ukey2, ukey3, ukey4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1)
-  {
-    if (atomic_inc (&hashes_shown[0]) == 0)
-    {
-      mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, 0, gid, 0, 0, 0);
-    }
-  }
-
   if (verify_header_serpent_aes (esalt_bufs[0].data_buf, esalt_bufs[0].signature, ukey1, ukey2, ukey3, ukey4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1)
   {
     if (atomic_inc (&hashes_shown[0]) == 0)
@@ -442,4 +435,12 @@ KERNEL_FQ void m06212_comp (KERN_ATTR_TMPS_ESALT (tc_tmp_t, tc_t))
       mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, 0, gid, 0, 0, 0);
     }
   }
+
+  if (verify_header_aes_twofish (esalt_bufs[0].data_buf, esalt_bufs[0].signature, ukey1, ukey2, ukey3, ukey4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1)
+  {
+    if (atomic_inc (&hashes_shown[0]) == 0)
+    {
+      mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, 0, gid, 0, 0, 0);
+    }
+  }
 }
diff --git a/OpenCL/m06213-pure.cl b/OpenCL/m06213-pure.cl
index 1bd79a274..310e6fe40 100644
--- a/OpenCL/m06213-pure.cl
+++ b/OpenCL/m06213-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_ripemd160.cl"
@@ -92,14 +93,14 @@ KERNEL_FQ void m06213_init (KERN_ATTR_TMPS_ESALT (tc_tmp_t, tc_t))
 
   const int keyboard_layout_mapping_cnt = esalt_bufs[digests_offset].keyboard_layout_mapping_cnt;
 
-  LOCAL_AS keyboard_layout_mapping_t s_keyboard_layout_mapping_buf[256];
+  LOCAL_VK keyboard_layout_mapping_t s_keyboard_layout_mapping_buf[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
     s_keyboard_layout_mapping_buf[i] = esalt_bufs[digests_offset].keyboard_layout_mapping_buf[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -304,17 +305,17 @@ KERNEL_FQ void m06213_comp (KERN_ATTR_TMPS_ESALT (tc_tmp_t, tc_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -331,7 +332,7 @@ KERNEL_FQ void m06213_comp (KERN_ATTR_TMPS_ESALT (tc_tmp_t, tc_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -373,14 +374,6 @@ KERNEL_FQ void m06213_comp (KERN_ATTR_TMPS_ESALT (tc_tmp_t, tc_t))
   ukey2[6] = tmps[gid].out[14];
   ukey2[7] = tmps[gid].out[15];
 
-  if (verify_header_aes (esalt_bufs[0].data_buf, esalt_bufs[0].signature, ukey1, ukey2, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1)
-  {
-    if (atomic_inc (&hashes_shown[0]) == 0)
-    {
-      mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, 0, gid, 0, 0, 0);
-    }
-  }
-
   if (verify_header_serpent (esalt_bufs[0].data_buf, esalt_bufs[0].signature, ukey1, ukey2) == 1)
   {
     if (atomic_inc (&hashes_shown[0]) == 0)
@@ -397,6 +390,14 @@ KERNEL_FQ void m06213_comp (KERN_ATTR_TMPS_ESALT (tc_tmp_t, tc_t))
     }
   }
 
+  if (verify_header_aes (esalt_bufs[0].data_buf, esalt_bufs[0].signature, ukey1, ukey2, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1)
+  {
+    if (atomic_inc (&hashes_shown[0]) == 0)
+    {
+      mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, 0, gid, 0, 0, 0);
+    }
+  }
+
   u32 ukey3[8];
 
   ukey3[0] = tmps[gid].out[16];
@@ -419,14 +420,6 @@ KERNEL_FQ void m06213_comp (KERN_ATTR_TMPS_ESALT (tc_tmp_t, tc_t))
   ukey4[6] = tmps[gid].out[30];
   ukey4[7] = tmps[gid].out[31];
 
-  if (verify_header_aes_twofish (esalt_bufs[0].data_buf, esalt_bufs[0].signature, ukey1, ukey2, ukey3, ukey4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1)
-  {
-    if (atomic_inc (&hashes_shown[0]) == 0)
-    {
-      mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, 0, gid, 0, 0, 0);
-    }
-  }
-
   if (verify_header_serpent_aes (esalt_bufs[0].data_buf, esalt_bufs[0].signature, ukey1, ukey2, ukey3, ukey4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1)
   {
     if (atomic_inc (&hashes_shown[0]) == 0)
@@ -443,6 +436,14 @@ KERNEL_FQ void m06213_comp (KERN_ATTR_TMPS_ESALT (tc_tmp_t, tc_t))
     }
   }
 
+  if (verify_header_aes_twofish (esalt_bufs[0].data_buf, esalt_bufs[0].signature, ukey1, ukey2, ukey3, ukey4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1)
+  {
+    if (atomic_inc (&hashes_shown[0]) == 0)
+    {
+      mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, 0, gid, 0, 0, 0);
+    }
+  }
+
   u32 ukey5[8];
 
   ukey5[0] = tmps[gid].out[32];
@@ -465,14 +466,6 @@ KERNEL_FQ void m06213_comp (KERN_ATTR_TMPS_ESALT (tc_tmp_t, tc_t))
   ukey6[6] = tmps[gid].out[46];
   ukey6[7] = tmps[gid].out[47];
 
-  if (verify_header_aes_twofish_serpent (esalt_bufs[0].data_buf, esalt_bufs[0].signature, ukey1, ukey2, ukey3, ukey4, ukey5, ukey6, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1)
-  {
-    if (atomic_inc (&hashes_shown[0]) == 0)
-    {
-      mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, 0, gid, 0, 0, 0);
-    }
-  }
-
   if (verify_header_serpent_twofish_aes (esalt_bufs[0].data_buf, esalt_bufs[0].signature, ukey1, ukey2, ukey3, ukey4, ukey5, ukey6, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1)
   {
     if (atomic_inc (&hashes_shown[0]) == 0)
@@ -480,4 +473,12 @@ KERNEL_FQ void m06213_comp (KERN_ATTR_TMPS_ESALT (tc_tmp_t, tc_t))
       mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, 0, gid, 0, 0, 0);
     }
   }
+
+  if (verify_header_aes_twofish_serpent (esalt_bufs[0].data_buf, esalt_bufs[0].signature, ukey1, ukey2, ukey3, ukey4, ukey5, ukey6, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1)
+  {
+    if (atomic_inc (&hashes_shown[0]) == 0)
+    {
+      mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, 0, gid, 0, 0, 0);
+    }
+  }
 }
diff --git a/OpenCL/m06221-pure.cl b/OpenCL/m06221-pure.cl
index 75e6a556e..83bead7bf 100644
--- a/OpenCL/m06221-pure.cl
+++ b/OpenCL/m06221-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha512.cl"
@@ -114,14 +115,14 @@ KERNEL_FQ void m06221_init (KERN_ATTR_TMPS_ESALT (tc64_tmp_t, tc_t))
 
   const int keyboard_layout_mapping_cnt = esalt_bufs[digests_offset].keyboard_layout_mapping_cnt;
 
-  LOCAL_AS keyboard_layout_mapping_t s_keyboard_layout_mapping_buf[256];
+  LOCAL_VK keyboard_layout_mapping_t s_keyboard_layout_mapping_buf[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
     s_keyboard_layout_mapping_buf[i] = esalt_bufs[digests_offset].keyboard_layout_mapping_buf[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -448,17 +449,17 @@ KERNEL_FQ void m06221_comp (KERN_ATTR_TMPS_ESALT (tc64_tmp_t, tc_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -475,7 +476,7 @@ KERNEL_FQ void m06221_comp (KERN_ATTR_TMPS_ESALT (tc64_tmp_t, tc_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -517,14 +518,6 @@ KERNEL_FQ void m06221_comp (KERN_ATTR_TMPS_ESALT (tc64_tmp_t, tc_t))
   ukey2[6] = hc_swap32_S (h32_from_64_S (tmps[gid].out[7]));
   ukey2[7] = hc_swap32_S (l32_from_64_S (tmps[gid].out[7]));
 
-  if (verify_header_aes (esalt_bufs[0].data_buf, esalt_bufs[0].signature, ukey1, ukey2, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1)
-  {
-    if (atomic_inc (&hashes_shown[0]) == 0)
-    {
-      mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, 0, gid, 0, 0, 0);
-    }
-  }
-
   if (verify_header_serpent (esalt_bufs[0].data_buf, esalt_bufs[0].signature, ukey1, ukey2) == 1)
   {
     if (atomic_inc (&hashes_shown[0]) == 0)
@@ -540,4 +533,12 @@ KERNEL_FQ void m06221_comp (KERN_ATTR_TMPS_ESALT (tc64_tmp_t, tc_t))
       mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, 0, gid, 0, 0, 0);
     }
   }
+
+  if (verify_header_aes (esalt_bufs[0].data_buf, esalt_bufs[0].signature, ukey1, ukey2, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1)
+  {
+    if (atomic_inc (&hashes_shown[0]) == 0)
+    {
+      mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, 0, gid, 0, 0, 0);
+    }
+  }
 }
diff --git a/OpenCL/m06222-pure.cl b/OpenCL/m06222-pure.cl
index f39eba2a5..e243eedce 100644
--- a/OpenCL/m06222-pure.cl
+++ b/OpenCL/m06222-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha512.cl"
@@ -114,14 +115,14 @@ KERNEL_FQ void m06222_init (KERN_ATTR_TMPS_ESALT (tc64_tmp_t, tc_t))
 
   const int keyboard_layout_mapping_cnt = esalt_bufs[digests_offset].keyboard_layout_mapping_cnt;
 
-  LOCAL_AS keyboard_layout_mapping_t s_keyboard_layout_mapping_buf[256];
+  LOCAL_VK keyboard_layout_mapping_t s_keyboard_layout_mapping_buf[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
     s_keyboard_layout_mapping_buf[i] = esalt_bufs[digests_offset].keyboard_layout_mapping_buf[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -448,17 +449,17 @@ KERNEL_FQ void m06222_comp (KERN_ATTR_TMPS_ESALT (tc64_tmp_t, tc_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -475,7 +476,7 @@ KERNEL_FQ void m06222_comp (KERN_ATTR_TMPS_ESALT (tc64_tmp_t, tc_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -517,14 +518,6 @@ KERNEL_FQ void m06222_comp (KERN_ATTR_TMPS_ESALT (tc64_tmp_t, tc_t))
   ukey2[6] = hc_swap32_S (h32_from_64_S (tmps[gid].out[7]));
   ukey2[7] = hc_swap32_S (l32_from_64_S (tmps[gid].out[7]));
 
-  if (verify_header_aes (esalt_bufs[0].data_buf, esalt_bufs[0].signature, ukey1, ukey2, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1)
-  {
-    if (atomic_inc (&hashes_shown[0]) == 0)
-    {
-      mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, 0, gid, 0, 0, 0);
-    }
-  }
-
   if (verify_header_serpent (esalt_bufs[0].data_buf, esalt_bufs[0].signature, ukey1, ukey2) == 1)
   {
     if (atomic_inc (&hashes_shown[0]) == 0)
@@ -541,6 +534,14 @@ KERNEL_FQ void m06222_comp (KERN_ATTR_TMPS_ESALT (tc64_tmp_t, tc_t))
     }
   }
 
+  if (verify_header_aes (esalt_bufs[0].data_buf, esalt_bufs[0].signature, ukey1, ukey2, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1)
+  {
+    if (atomic_inc (&hashes_shown[0]) == 0)
+    {
+      mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, 0, gid, 0, 0, 0);
+    }
+  }
+
   u32 ukey3[8];
 
   ukey3[0] = hc_swap32_S (h32_from_64_S (tmps[gid].out[ 8]));
@@ -563,14 +564,6 @@ KERNEL_FQ void m06222_comp (KERN_ATTR_TMPS_ESALT (tc64_tmp_t, tc_t))
   ukey4[6] = hc_swap32_S (h32_from_64_S (tmps[gid].out[15]));
   ukey4[7] = hc_swap32_S (l32_from_64_S (tmps[gid].out[15]));
 
-  if (verify_header_aes_twofish (esalt_bufs[0].data_buf, esalt_bufs[0].signature, ukey1, ukey2, ukey3, ukey4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1)
-  {
-    if (atomic_inc (&hashes_shown[0]) == 0)
-    {
-      mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, 0, gid, 0, 0, 0);
-    }
-  }
-
   if (verify_header_serpent_aes (esalt_bufs[0].data_buf, esalt_bufs[0].signature, ukey1, ukey2, ukey3, ukey4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1)
   {
     if (atomic_inc (&hashes_shown[0]) == 0)
@@ -586,4 +579,12 @@ KERNEL_FQ void m06222_comp (KERN_ATTR_TMPS_ESALT (tc64_tmp_t, tc_t))
       mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, 0, gid, 0, 0, 0);
     }
   }
+
+  if (verify_header_aes_twofish (esalt_bufs[0].data_buf, esalt_bufs[0].signature, ukey1, ukey2, ukey3, ukey4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1)
+  {
+    if (atomic_inc (&hashes_shown[0]) == 0)
+    {
+      mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, 0, gid, 0, 0, 0);
+    }
+  }
 }
diff --git a/OpenCL/m06223-pure.cl b/OpenCL/m06223-pure.cl
index 2c2468f59..58591daa3 100644
--- a/OpenCL/m06223-pure.cl
+++ b/OpenCL/m06223-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha512.cl"
@@ -114,14 +115,14 @@ KERNEL_FQ void m06223_init (KERN_ATTR_TMPS_ESALT (tc64_tmp_t, tc_t))
 
   const int keyboard_layout_mapping_cnt = esalt_bufs[digests_offset].keyboard_layout_mapping_cnt;
 
-  LOCAL_AS keyboard_layout_mapping_t s_keyboard_layout_mapping_buf[256];
+  LOCAL_VK keyboard_layout_mapping_t s_keyboard_layout_mapping_buf[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
     s_keyboard_layout_mapping_buf[i] = esalt_bufs[digests_offset].keyboard_layout_mapping_buf[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -448,17 +449,17 @@ KERNEL_FQ void m06223_comp (KERN_ATTR_TMPS_ESALT (tc64_tmp_t, tc_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -475,7 +476,7 @@ KERNEL_FQ void m06223_comp (KERN_ATTR_TMPS_ESALT (tc64_tmp_t, tc_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -517,14 +518,6 @@ KERNEL_FQ void m06223_comp (KERN_ATTR_TMPS_ESALT (tc64_tmp_t, tc_t))
   ukey2[6] = hc_swap32_S (h32_from_64_S (tmps[gid].out[7]));
   ukey2[7] = hc_swap32_S (l32_from_64_S (tmps[gid].out[7]));
 
-  if (verify_header_aes (esalt_bufs[0].data_buf, esalt_bufs[0].signature, ukey1, ukey2, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1)
-  {
-    if (atomic_inc (&hashes_shown[0]) == 0)
-    {
-      mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, 0, gid, 0, 0, 0);
-    }
-  }
-
   if (verify_header_serpent (esalt_bufs[0].data_buf, esalt_bufs[0].signature, ukey1, ukey2) == 1)
   {
     if (atomic_inc (&hashes_shown[0]) == 0)
@@ -541,6 +534,14 @@ KERNEL_FQ void m06223_comp (KERN_ATTR_TMPS_ESALT (tc64_tmp_t, tc_t))
     }
   }
 
+  if (verify_header_aes (esalt_bufs[0].data_buf, esalt_bufs[0].signature, ukey1, ukey2, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1)
+  {
+    if (atomic_inc (&hashes_shown[0]) == 0)
+    {
+      mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, 0, gid, 0, 0, 0);
+    }
+  }
+
   u32 ukey3[8];
 
   ukey3[0] = hc_swap32_S (h32_from_64_S (tmps[gid].out[ 8]));
@@ -563,14 +564,6 @@ KERNEL_FQ void m06223_comp (KERN_ATTR_TMPS_ESALT (tc64_tmp_t, tc_t))
   ukey4[6] = hc_swap32_S (h32_from_64_S (tmps[gid].out[15]));
   ukey4[7] = hc_swap32_S (l32_from_64_S (tmps[gid].out[15]));
 
-  if (verify_header_aes_twofish (esalt_bufs[0].data_buf, esalt_bufs[0].signature, ukey1, ukey2, ukey3, ukey4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1)
-  {
-    if (atomic_inc (&hashes_shown[0]) == 0)
-    {
-      mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, 0, gid, 0, 0, 0);
-    }
-  }
-
   if (verify_header_serpent_aes (esalt_bufs[0].data_buf, esalt_bufs[0].signature, ukey1, ukey2, ukey3, ukey4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1)
   {
     if (atomic_inc (&hashes_shown[0]) == 0)
@@ -587,6 +580,14 @@ KERNEL_FQ void m06223_comp (KERN_ATTR_TMPS_ESALT (tc64_tmp_t, tc_t))
     }
   }
 
+  if (verify_header_aes_twofish (esalt_bufs[0].data_buf, esalt_bufs[0].signature, ukey1, ukey2, ukey3, ukey4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1)
+  {
+    if (atomic_inc (&hashes_shown[0]) == 0)
+    {
+      mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, 0, gid, 0, 0, 0);
+    }
+  }
+
   u32 ukey5[8];
 
   ukey5[0] = hc_swap32_S (h32_from_64_S (tmps[gid].out[16]));
@@ -609,14 +610,6 @@ KERNEL_FQ void m06223_comp (KERN_ATTR_TMPS_ESALT (tc64_tmp_t, tc_t))
   ukey6[6] = hc_swap32_S (h32_from_64_S (tmps[gid].out[23]));
   ukey6[7] = hc_swap32_S (l32_from_64_S (tmps[gid].out[23]));
 
-  if (verify_header_aes_twofish_serpent (esalt_bufs[0].data_buf, esalt_bufs[0].signature, ukey1, ukey2, ukey3, ukey4, ukey5, ukey6, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1)
-  {
-    if (atomic_inc (&hashes_shown[0]) == 0)
-    {
-      mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, 0, gid, 0, 0, 0);
-    }
-  }
-
   if (verify_header_serpent_twofish_aes (esalt_bufs[0].data_buf, esalt_bufs[0].signature, ukey1, ukey2, ukey3, ukey4, ukey5, ukey6, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1)
   {
     if (atomic_inc (&hashes_shown[0]) == 0)
@@ -624,4 +617,12 @@ KERNEL_FQ void m06223_comp (KERN_ATTR_TMPS_ESALT (tc64_tmp_t, tc_t))
       mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, 0, gid, 0, 0, 0);
     }
   }
+
+  if (verify_header_aes_twofish_serpent (esalt_bufs[0].data_buf, esalt_bufs[0].signature, ukey1, ukey2, ukey3, ukey4, ukey5, ukey6, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1)
+  {
+    if (atomic_inc (&hashes_shown[0]) == 0)
+    {
+      mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, 0, gid, 0, 0, 0);
+    }
+  }
 }
diff --git a/OpenCL/m06231-pure.cl b/OpenCL/m06231-pure.cl
index 57fc6cdcc..a25207ee9 100644
--- a/OpenCL/m06231-pure.cl
+++ b/OpenCL/m06231-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_whirlpool.cl"
@@ -152,14 +153,14 @@ KERNEL_FQ void m06231_init (KERN_ATTR_TMPS_ESALT (tc_tmp_t, tc_t))
 
   const int keyboard_layout_mapping_cnt = esalt_bufs[digests_offset].keyboard_layout_mapping_cnt;
 
-  LOCAL_AS keyboard_layout_mapping_t s_keyboard_layout_mapping_buf[256];
+  LOCAL_VK keyboard_layout_mapping_t s_keyboard_layout_mapping_buf[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
     s_keyboard_layout_mapping_buf[i] = esalt_bufs[digests_offset].keyboard_layout_mapping_buf[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   /**
    * Whirlpool shared
@@ -167,8 +168,8 @@ KERNEL_FQ void m06231_init (KERN_ATTR_TMPS_ESALT (tc_tmp_t, tc_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_Ch[8][256];
-  LOCAL_AS u32 s_Cl[8][256];
+  LOCAL_VK u32 s_Ch[8][256];
+  LOCAL_VK u32 s_Cl[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -191,7 +192,7 @@ KERNEL_FQ void m06231_init (KERN_ATTR_TMPS_ESALT (tc_tmp_t, tc_t))
     s_Cl[7][i] = Cl[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -379,8 +380,8 @@ KERNEL_FQ void m06231_loop (KERN_ATTR_TMPS_ESALT (tc_tmp_t, tc_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_Ch[8][256];
-  LOCAL_AS u32 s_Cl[8][256];
+  LOCAL_VK u32 s_Ch[8][256];
+  LOCAL_VK u32 s_Cl[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -403,7 +404,7 @@ KERNEL_FQ void m06231_loop (KERN_ATTR_TMPS_ESALT (tc_tmp_t, tc_t))
     s_Cl[7][i] = Cl[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -582,17 +583,17 @@ KERNEL_FQ void m06231_comp (KERN_ATTR_TMPS_ESALT (tc_tmp_t, tc_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -609,7 +610,7 @@ KERNEL_FQ void m06231_comp (KERN_ATTR_TMPS_ESALT (tc_tmp_t, tc_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -633,8 +634,8 @@ KERNEL_FQ void m06231_comp (KERN_ATTR_TMPS_ESALT (tc_tmp_t, tc_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_Ch[8][256];
-  LOCAL_AS u32 s_Cl[8][256];
+  LOCAL_VK u32 s_Ch[8][256];
+  LOCAL_VK u32 s_Cl[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -657,7 +658,7 @@ KERNEL_FQ void m06231_comp (KERN_ATTR_TMPS_ESALT (tc_tmp_t, tc_t))
     s_Cl[7][i] = Cl[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -690,14 +691,6 @@ KERNEL_FQ void m06231_comp (KERN_ATTR_TMPS_ESALT (tc_tmp_t, tc_t))
   ukey2[6] = hc_swap32_S (tmps[gid].out[14]);
   ukey2[7] = hc_swap32_S (tmps[gid].out[15]);
 
-  if (verify_header_aes (esalt_bufs[0].data_buf, esalt_bufs[0].signature, ukey1, ukey2, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1)
-  {
-    if (atomic_inc (&hashes_shown[0]) == 0)
-    {
-      mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, 0, gid, 0, 0, 0);
-    }
-  }
-
   if (verify_header_serpent (esalt_bufs[0].data_buf, esalt_bufs[0].signature, ukey1, ukey2) == 1)
   {
     if (atomic_inc (&hashes_shown[0]) == 0)
@@ -713,4 +706,12 @@ KERNEL_FQ void m06231_comp (KERN_ATTR_TMPS_ESALT (tc_tmp_t, tc_t))
       mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, 0, gid, 0, 0, 0);
     }
   }
+
+  if (verify_header_aes (esalt_bufs[0].data_buf, esalt_bufs[0].signature, ukey1, ukey2, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1)
+  {
+    if (atomic_inc (&hashes_shown[0]) == 0)
+    {
+      mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, 0, gid, 0, 0, 0);
+    }
+  }
 }
diff --git a/OpenCL/m06232-pure.cl b/OpenCL/m06232-pure.cl
index 9634349d5..8569df4c1 100644
--- a/OpenCL/m06232-pure.cl
+++ b/OpenCL/m06232-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_whirlpool.cl"
@@ -152,14 +153,14 @@ KERNEL_FQ void m06232_init (KERN_ATTR_TMPS_ESALT (tc_tmp_t, tc_t))
 
   const int keyboard_layout_mapping_cnt = esalt_bufs[digests_offset].keyboard_layout_mapping_cnt;
 
-  LOCAL_AS keyboard_layout_mapping_t s_keyboard_layout_mapping_buf[256];
+  LOCAL_VK keyboard_layout_mapping_t s_keyboard_layout_mapping_buf[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
     s_keyboard_layout_mapping_buf[i] = esalt_bufs[digests_offset].keyboard_layout_mapping_buf[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   /**
    * Whirlpool shared
@@ -167,8 +168,8 @@ KERNEL_FQ void m06232_init (KERN_ATTR_TMPS_ESALT (tc_tmp_t, tc_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_Ch[8][256];
-  LOCAL_AS u32 s_Cl[8][256];
+  LOCAL_VK u32 s_Ch[8][256];
+  LOCAL_VK u32 s_Cl[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -191,7 +192,7 @@ KERNEL_FQ void m06232_init (KERN_ATTR_TMPS_ESALT (tc_tmp_t, tc_t))
     s_Cl[7][i] = Cl[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -379,8 +380,8 @@ KERNEL_FQ void m06232_loop (KERN_ATTR_TMPS_ESALT (tc_tmp_t, tc_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_Ch[8][256];
-  LOCAL_AS u32 s_Cl[8][256];
+  LOCAL_VK u32 s_Ch[8][256];
+  LOCAL_VK u32 s_Cl[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -403,7 +404,7 @@ KERNEL_FQ void m06232_loop (KERN_ATTR_TMPS_ESALT (tc_tmp_t, tc_t))
     s_Cl[7][i] = Cl[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -582,17 +583,17 @@ KERNEL_FQ void m06232_comp (KERN_ATTR_TMPS_ESALT (tc_tmp_t, tc_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -609,7 +610,7 @@ KERNEL_FQ void m06232_comp (KERN_ATTR_TMPS_ESALT (tc_tmp_t, tc_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -633,8 +634,8 @@ KERNEL_FQ void m06232_comp (KERN_ATTR_TMPS_ESALT (tc_tmp_t, tc_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_Ch[8][256];
-  LOCAL_AS u32 s_Cl[8][256];
+  LOCAL_VK u32 s_Ch[8][256];
+  LOCAL_VK u32 s_Cl[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -657,7 +658,7 @@ KERNEL_FQ void m06232_comp (KERN_ATTR_TMPS_ESALT (tc_tmp_t, tc_t))
     s_Cl[7][i] = Cl[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -690,14 +691,6 @@ KERNEL_FQ void m06232_comp (KERN_ATTR_TMPS_ESALT (tc_tmp_t, tc_t))
   ukey2[6] = hc_swap32_S (tmps[gid].out[14]);
   ukey2[7] = hc_swap32_S (tmps[gid].out[15]);
 
-  if (verify_header_aes (esalt_bufs[0].data_buf, esalt_bufs[0].signature, ukey1, ukey2, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1)
-  {
-    if (atomic_inc (&hashes_shown[0]) == 0)
-    {
-      mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, 0, gid, 0, 0, 0);
-    }
-  }
-
   if (verify_header_serpent (esalt_bufs[0].data_buf, esalt_bufs[0].signature, ukey1, ukey2) == 1)
   {
     if (atomic_inc (&hashes_shown[0]) == 0)
@@ -714,6 +707,14 @@ KERNEL_FQ void m06232_comp (KERN_ATTR_TMPS_ESALT (tc_tmp_t, tc_t))
     }
   }
 
+  if (verify_header_aes (esalt_bufs[0].data_buf, esalt_bufs[0].signature, ukey1, ukey2, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1)
+  {
+    if (atomic_inc (&hashes_shown[0]) == 0)
+    {
+      mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, 0, gid, 0, 0, 0);
+    }
+  }
+
   u32 ukey3[8];
 
   ukey3[0] = hc_swap32_S (tmps[gid].out[16]);
@@ -736,14 +737,6 @@ KERNEL_FQ void m06232_comp (KERN_ATTR_TMPS_ESALT (tc_tmp_t, tc_t))
   ukey4[6] = hc_swap32_S (tmps[gid].out[30]);
   ukey4[7] = hc_swap32_S (tmps[gid].out[31]);
 
-  if (verify_header_aes_twofish (esalt_bufs[0].data_buf, esalt_bufs[0].signature, ukey1, ukey2, ukey3, ukey4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1)
-  {
-    if (atomic_inc (&hashes_shown[0]) == 0)
-    {
-      mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, 0, gid, 0, 0, 0);
-    }
-  }
-
   if (verify_header_serpent_aes (esalt_bufs[0].data_buf, esalt_bufs[0].signature, ukey1, ukey2, ukey3, ukey4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1)
   {
     if (atomic_inc (&hashes_shown[0]) == 0)
@@ -759,4 +752,12 @@ KERNEL_FQ void m06232_comp (KERN_ATTR_TMPS_ESALT (tc_tmp_t, tc_t))
       mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, 0, gid, 0, 0, 0);
     }
   }
+
+  if (verify_header_aes_twofish (esalt_bufs[0].data_buf, esalt_bufs[0].signature, ukey1, ukey2, ukey3, ukey4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1)
+  {
+    if (atomic_inc (&hashes_shown[0]) == 0)
+    {
+      mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, 0, gid, 0, 0, 0);
+    }
+  }
 }
diff --git a/OpenCL/m06233-pure.cl b/OpenCL/m06233-pure.cl
index 7e4066daa..f996cf6f0 100644
--- a/OpenCL/m06233-pure.cl
+++ b/OpenCL/m06233-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_whirlpool.cl"
@@ -152,14 +153,14 @@ KERNEL_FQ void m06233_init (KERN_ATTR_TMPS_ESALT (tc_tmp_t, tc_t))
 
   const int keyboard_layout_mapping_cnt = esalt_bufs[digests_offset].keyboard_layout_mapping_cnt;
 
-  LOCAL_AS keyboard_layout_mapping_t s_keyboard_layout_mapping_buf[256];
+  LOCAL_VK keyboard_layout_mapping_t s_keyboard_layout_mapping_buf[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
     s_keyboard_layout_mapping_buf[i] = esalt_bufs[digests_offset].keyboard_layout_mapping_buf[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   /**
    * Whirlpool shared
@@ -167,8 +168,8 @@ KERNEL_FQ void m06233_init (KERN_ATTR_TMPS_ESALT (tc_tmp_t, tc_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_Ch[8][256];
-  LOCAL_AS u32 s_Cl[8][256];
+  LOCAL_VK u32 s_Ch[8][256];
+  LOCAL_VK u32 s_Cl[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -191,7 +192,7 @@ KERNEL_FQ void m06233_init (KERN_ATTR_TMPS_ESALT (tc_tmp_t, tc_t))
     s_Cl[7][i] = Cl[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -379,8 +380,8 @@ KERNEL_FQ void m06233_loop (KERN_ATTR_TMPS_ESALT (tc_tmp_t, tc_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_Ch[8][256];
-  LOCAL_AS u32 s_Cl[8][256];
+  LOCAL_VK u32 s_Ch[8][256];
+  LOCAL_VK u32 s_Cl[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -403,7 +404,7 @@ KERNEL_FQ void m06233_loop (KERN_ATTR_TMPS_ESALT (tc_tmp_t, tc_t))
     s_Cl[7][i] = Cl[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -582,17 +583,17 @@ KERNEL_FQ void m06233_comp (KERN_ATTR_TMPS_ESALT (tc_tmp_t, tc_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -609,7 +610,7 @@ KERNEL_FQ void m06233_comp (KERN_ATTR_TMPS_ESALT (tc_tmp_t, tc_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -633,8 +634,8 @@ KERNEL_FQ void m06233_comp (KERN_ATTR_TMPS_ESALT (tc_tmp_t, tc_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_Ch[8][256];
-  LOCAL_AS u32 s_Cl[8][256];
+  LOCAL_VK u32 s_Ch[8][256];
+  LOCAL_VK u32 s_Cl[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -657,7 +658,7 @@ KERNEL_FQ void m06233_comp (KERN_ATTR_TMPS_ESALT (tc_tmp_t, tc_t))
     s_Cl[7][i] = Cl[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -690,14 +691,6 @@ KERNEL_FQ void m06233_comp (KERN_ATTR_TMPS_ESALT (tc_tmp_t, tc_t))
   ukey2[6] = hc_swap32_S (tmps[gid].out[14]);
   ukey2[7] = hc_swap32_S (tmps[gid].out[15]);
 
-  if (verify_header_aes (esalt_bufs[0].data_buf, esalt_bufs[0].signature, ukey1, ukey2, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1)
-  {
-    if (atomic_inc (&hashes_shown[0]) == 0)
-    {
-      mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, 0, gid, 0, 0, 0);
-    }
-  }
-
   if (verify_header_serpent (esalt_bufs[0].data_buf, esalt_bufs[0].signature, ukey1, ukey2) == 1)
   {
     if (atomic_inc (&hashes_shown[0]) == 0)
@@ -714,6 +707,14 @@ KERNEL_FQ void m06233_comp (KERN_ATTR_TMPS_ESALT (tc_tmp_t, tc_t))
     }
   }
 
+  if (verify_header_aes (esalt_bufs[0].data_buf, esalt_bufs[0].signature, ukey1, ukey2, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1)
+  {
+    if (atomic_inc (&hashes_shown[0]) == 0)
+    {
+      mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, 0, gid, 0, 0, 0);
+    }
+  }
+
   u32 ukey3[8];
 
   ukey3[0] = hc_swap32_S (tmps[gid].out[16]);
@@ -736,14 +737,6 @@ KERNEL_FQ void m06233_comp (KERN_ATTR_TMPS_ESALT (tc_tmp_t, tc_t))
   ukey4[6] = hc_swap32_S (tmps[gid].out[30]);
   ukey4[7] = hc_swap32_S (tmps[gid].out[31]);
 
-  if (verify_header_aes_twofish (esalt_bufs[0].data_buf, esalt_bufs[0].signature, ukey1, ukey2, ukey3, ukey4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1)
-  {
-    if (atomic_inc (&hashes_shown[0]) == 0)
-    {
-      mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, 0, gid, 0, 0, 0);
-    }
-  }
-
   if (verify_header_serpent_aes (esalt_bufs[0].data_buf, esalt_bufs[0].signature, ukey1, ukey2, ukey3, ukey4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1)
   {
     if (atomic_inc (&hashes_shown[0]) == 0)
@@ -760,6 +753,14 @@ KERNEL_FQ void m06233_comp (KERN_ATTR_TMPS_ESALT (tc_tmp_t, tc_t))
     }
   }
 
+  if (verify_header_aes_twofish (esalt_bufs[0].data_buf, esalt_bufs[0].signature, ukey1, ukey2, ukey3, ukey4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1)
+  {
+    if (atomic_inc (&hashes_shown[0]) == 0)
+    {
+      mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, 0, gid, 0, 0, 0);
+    }
+  }
+
   u32 ukey5[8];
 
   ukey5[0] = hc_swap32_S (tmps[gid].out[32]);
@@ -782,14 +783,6 @@ KERNEL_FQ void m06233_comp (KERN_ATTR_TMPS_ESALT (tc_tmp_t, tc_t))
   ukey6[6] = hc_swap32_S (tmps[gid].out[46]);
   ukey6[7] = hc_swap32_S (tmps[gid].out[47]);
 
-  if (verify_header_aes_twofish_serpent (esalt_bufs[0].data_buf, esalt_bufs[0].signature, ukey1, ukey2, ukey3, ukey4, ukey5, ukey6, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1)
-  {
-    if (atomic_inc (&hashes_shown[0]) == 0)
-    {
-      mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, 0, gid, 0, 0, 0);
-    }
-  }
-
   if (verify_header_serpent_twofish_aes (esalt_bufs[0].data_buf, esalt_bufs[0].signature, ukey1, ukey2, ukey3, ukey4, ukey5, ukey6, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1)
   {
     if (atomic_inc (&hashes_shown[0]) == 0)
@@ -797,4 +790,12 @@ KERNEL_FQ void m06233_comp (KERN_ATTR_TMPS_ESALT (tc_tmp_t, tc_t))
       mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, 0, gid, 0, 0, 0);
     }
   }
+
+  if (verify_header_aes_twofish_serpent (esalt_bufs[0].data_buf, esalt_bufs[0].signature, ukey1, ukey2, ukey3, ukey4, ukey5, ukey6, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1)
+  {
+    if (atomic_inc (&hashes_shown[0]) == 0)
+    {
+      mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, 0, gid, 0, 0, 0);
+    }
+  }
 }
diff --git a/OpenCL/m06300-optimized.cl b/OpenCL/m06300-optimized.cl
index fb1169aea..fe57526b5 100644
--- a/OpenCL/m06300-optimized.cl
+++ b/OpenCL/m06300-optimized.cl
@@ -6,6 +6,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_hash_md5.cl"
 #endif
diff --git a/OpenCL/m06300-pure.cl b/OpenCL/m06300-pure.cl
index 360f2e4e9..ec654ab6d 100644
--- a/OpenCL/m06300-pure.cl
+++ b/OpenCL/m06300-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_hash_md5.cl"
 #endif
diff --git a/OpenCL/m06400-pure.cl b/OpenCL/m06400-pure.cl
index 0b7ba781e..82d30025c 100644
--- a/OpenCL/m06400-pure.cl
+++ b/OpenCL/m06400-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha256.cl"
diff --git a/OpenCL/m06500-pure.cl b/OpenCL/m06500-pure.cl
index 4a4a5b449..73a154a64 100644
--- a/OpenCL/m06500-pure.cl
+++ b/OpenCL/m06500-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha512.cl"
diff --git a/OpenCL/m06600-pure.cl b/OpenCL/m06600-pure.cl
index b84adff7b..db4906812 100644
--- a/OpenCL/m06600-pure.cl
+++ b/OpenCL/m06600-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
@@ -240,17 +241,17 @@ KERNEL_FQ void m06600_comp (KERN_ATTR_TMPS (agilekey_tmp_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -267,7 +268,7 @@ KERNEL_FQ void m06600_comp (KERN_ATTR_TMPS (agilekey_tmp_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m06700-pure.cl b/OpenCL/m06700-pure.cl
index 9c9edafc2..df9d16f1e 100644
--- a/OpenCL/m06700-pure.cl
+++ b/OpenCL/m06700-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m06800-pure.cl b/OpenCL/m06800-pure.cl
index 2bf55cd0d..a2e95556f 100644
--- a/OpenCL/m06800-pure.cl
+++ b/OpenCL/m06800-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha256.cl"
@@ -271,17 +272,17 @@ KERNEL_FQ void m06800_comp (KERN_ATTR_TMPS (lastpass_tmp_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -298,7 +299,7 @@ KERNEL_FQ void m06800_comp (KERN_ATTR_TMPS (lastpass_tmp_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m06900_a0-optimized.cl b/OpenCL/m06900_a0-optimized.cl
index d699856c0..5d42eb2ae 100644
--- a/OpenCL/m06900_a0-optimized.cl
+++ b/OpenCL/m06900_a0-optimized.cl
@@ -8,13 +8,14 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
 #include "inc_simd.cl"
 #endif
 
-CONSTANT_AS u32a c_tables[4][256] =
+CONSTANT_VK u32a c_tables[4][256] =
 {
   {
     0x00072000, 0x00075000, 0x00074800, 0x00071000,
@@ -285,13 +286,13 @@ CONSTANT_AS u32a c_tables[4][256] =
 #if   VECT_SIZE == 1
 #define BOX(i,n,S) (S)[(n)][(i)]
 #elif VECT_SIZE == 2
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1])
 #elif VECT_SIZE == 4
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3])
 #elif VECT_SIZE == 8
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7])
 #elif VECT_SIZE == 16
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf])
 #endif
 
 #define _round(k1,k2,tbl)                 \
@@ -711,7 +712,7 @@ KERNEL_FQ void m06900_m04 (KERN_ATTR_RULES ())
    * sbox
    */
 
-  LOCAL_AS u32 s_tables[4][256];
+  LOCAL_VK u32 s_tables[4][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -721,7 +722,7 @@ KERNEL_FQ void m06900_m04 (KERN_ATTR_RULES ())
     s_tables[3][i] = c_tables[3][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -927,7 +928,7 @@ KERNEL_FQ void m06900_s04 (KERN_ATTR_RULES ())
    * sbox
    */
 
-  LOCAL_AS u32 s_tables[4][256];
+  LOCAL_VK u32 s_tables[4][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -937,7 +938,7 @@ KERNEL_FQ void m06900_s04 (KERN_ATTR_RULES ())
     s_tables[3][i] = c_tables[3][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m06900_a1-optimized.cl b/OpenCL/m06900_a1-optimized.cl
index a6ca9bb5d..60fed6359 100644
--- a/OpenCL/m06900_a1-optimized.cl
+++ b/OpenCL/m06900_a1-optimized.cl
@@ -8,11 +8,12 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #endif
 
-CONSTANT_AS u32a c_tables[4][256] =
+CONSTANT_VK u32a c_tables[4][256] =
 {
   {
     0x00072000, 0x00075000, 0x00074800, 0x00071000,
@@ -283,13 +284,13 @@ CONSTANT_AS u32a c_tables[4][256] =
 #if   VECT_SIZE == 1
 #define BOX(i,n,S) (S)[(n)][(i)]
 #elif VECT_SIZE == 2
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1])
 #elif VECT_SIZE == 4
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3])
 #elif VECT_SIZE == 8
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7])
 #elif VECT_SIZE == 16
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf])
 #endif
 
 #define _round(k1,k2,tbl)                 \
@@ -709,7 +710,7 @@ KERNEL_FQ void m06900_m04 (KERN_ATTR_BASIC ())
    * sbox
    */
 
-  LOCAL_AS u32 s_tables[4][256];
+  LOCAL_VK u32 s_tables[4][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -719,7 +720,7 @@ KERNEL_FQ void m06900_m04 (KERN_ATTR_BASIC ())
     s_tables[3][i] = c_tables[3][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -979,7 +980,7 @@ KERNEL_FQ void m06900_s04 (KERN_ATTR_BASIC ())
    * sbox
    */
 
-  LOCAL_AS u32 s_tables[4][256];
+  LOCAL_VK u32 s_tables[4][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -989,7 +990,7 @@ KERNEL_FQ void m06900_s04 (KERN_ATTR_BASIC ())
     s_tables[3][i] = c_tables[3][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m06900_a3-optimized.cl b/OpenCL/m06900_a3-optimized.cl
index 5d9996777..a0f576783 100644
--- a/OpenCL/m06900_a3-optimized.cl
+++ b/OpenCL/m06900_a3-optimized.cl
@@ -8,11 +8,12 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #endif
 
-CONSTANT_AS u32a c_tables[4][256] =
+CONSTANT_VK u32a c_tables[4][256] =
 {
   {
     0x00072000, 0x00075000, 0x00074800, 0x00071000,
@@ -283,13 +284,13 @@ CONSTANT_AS u32a c_tables[4][256] =
 #if   VECT_SIZE == 1
 #define BOX(i,n,S) (S)[(n)][(i)]
 #elif VECT_SIZE == 2
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1])
 #elif VECT_SIZE == 4
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3])
 #elif VECT_SIZE == 8
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7])
 #elif VECT_SIZE == 16
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf])
 #endif
 
 #define _round(k1,k2,tbl)                 \
@@ -1069,7 +1070,7 @@ KERNEL_FQ void m06900_m04 (KERN_ATTR_BASIC ())
    * sbox
    */
 
-  LOCAL_AS u32 s_tables[4][256];
+  LOCAL_VK u32 s_tables[4][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -1079,7 +1080,7 @@ KERNEL_FQ void m06900_m04 (KERN_ATTR_BASIC ())
     s_tables[3][i] = c_tables[3][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -1138,7 +1139,7 @@ KERNEL_FQ void m06900_m08 (KERN_ATTR_BASIC ())
    * sbox
    */
 
-  LOCAL_AS u32 s_tables[4][256];
+  LOCAL_VK u32 s_tables[4][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -1148,7 +1149,7 @@ KERNEL_FQ void m06900_m08 (KERN_ATTR_BASIC ())
     s_tables[3][i] = c_tables[3][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -1211,7 +1212,7 @@ KERNEL_FQ void m06900_s04 (KERN_ATTR_BASIC ())
    * sbox
    */
 
-  LOCAL_AS u32 s_tables[4][256];
+  LOCAL_VK u32 s_tables[4][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -1221,7 +1222,7 @@ KERNEL_FQ void m06900_s04 (KERN_ATTR_BASIC ())
     s_tables[3][i] = c_tables[3][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -1280,7 +1281,7 @@ KERNEL_FQ void m06900_s08 (KERN_ATTR_BASIC ())
    * sbox
    */
 
-  LOCAL_AS u32 s_tables[4][256];
+  LOCAL_VK u32 s_tables[4][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -1290,7 +1291,7 @@ KERNEL_FQ void m06900_s08 (KERN_ATTR_BASIC ())
     s_tables[3][i] = c_tables[3][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m07000_a0-optimized.cl b/OpenCL/m07000_a0-optimized.cl
index f5d00ff09..b2f6f66a9 100644
--- a/OpenCL/m07000_a0-optimized.cl
+++ b/OpenCL/m07000_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
diff --git a/OpenCL/m07000_a0-pure.cl b/OpenCL/m07000_a0-pure.cl
index 05c507291..f3bb57959 100644
--- a/OpenCL/m07000_a0-pure.cl
+++ b/OpenCL/m07000_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
diff --git a/OpenCL/m07000_a1-optimized.cl b/OpenCL/m07000_a1-optimized.cl
index 943fcc6d1..a5c914433 100644
--- a/OpenCL/m07000_a1-optimized.cl
+++ b/OpenCL/m07000_a1-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m07000_a1-pure.cl b/OpenCL/m07000_a1-pure.cl
index 4e623ad2d..f6f43d72c 100644
--- a/OpenCL/m07000_a1-pure.cl
+++ b/OpenCL/m07000_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m07000_a3-optimized.cl b/OpenCL/m07000_a3-optimized.cl
index f3db19268..5757962da 100644
--- a/OpenCL/m07000_a3-optimized.cl
+++ b/OpenCL/m07000_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m07000_a3-pure.cl b/OpenCL/m07000_a3-pure.cl
index 52852d473..db6f00a5f 100644
--- a/OpenCL/m07000_a3-pure.cl
+++ b/OpenCL/m07000_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m07100-pure.cl b/OpenCL/m07100-pure.cl
index 9426ceb2f..2cee7e0b0 100644
--- a/OpenCL/m07100-pure.cl
+++ b/OpenCL/m07100-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha512.cl"
diff --git a/OpenCL/m07300_a0-optimized.cl b/OpenCL/m07300_a0-optimized.cl
index c1744096c..35f4a3639 100644
--- a/OpenCL/m07300_a0-optimized.cl
+++ b/OpenCL/m07300_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
diff --git a/OpenCL/m07300_a0-pure.cl b/OpenCL/m07300_a0-pure.cl
index 879fbaa5b..582e6e7c8 100644
--- a/OpenCL/m07300_a0-pure.cl
+++ b/OpenCL/m07300_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
diff --git a/OpenCL/m07300_a1-optimized.cl b/OpenCL/m07300_a1-optimized.cl
index cf142f29a..83fce7cf9 100644
--- a/OpenCL/m07300_a1-optimized.cl
+++ b/OpenCL/m07300_a1-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m07300_a1-pure.cl b/OpenCL/m07300_a1-pure.cl
index 26bf80185..8771d4d7a 100644
--- a/OpenCL/m07300_a1-pure.cl
+++ b/OpenCL/m07300_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m07300_a3-optimized.cl b/OpenCL/m07300_a3-optimized.cl
index 7b9e7a78d..b3b74c963 100644
--- a/OpenCL/m07300_a3-optimized.cl
+++ b/OpenCL/m07300_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m07300_a3-pure.cl b/OpenCL/m07300_a3-pure.cl
index bfdf6824c..d8ef2e43c 100644
--- a/OpenCL/m07300_a3-pure.cl
+++ b/OpenCL/m07300_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m07400-optimized.cl b/OpenCL/m07400-optimized.cl
index 0b56927cb..5d8cf6af0 100644
--- a/OpenCL/m07400-optimized.cl
+++ b/OpenCL/m07400-optimized.cl
@@ -6,6 +6,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_hash_sha256.cl"
 #endif
diff --git a/OpenCL/m07400-pure.cl b/OpenCL/m07400-pure.cl
index b822dbdad..08cfee875 100644
--- a/OpenCL/m07400-pure.cl
+++ b/OpenCL/m07400-pure.cl
@@ -6,6 +6,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_hash_sha256.cl"
 #endif
diff --git a/OpenCL/m07500_a0-optimized.cl b/OpenCL/m07500_a0-optimized.cl
index 028cca99b..60c658a0a 100644
--- a/OpenCL/m07500_a0-optimized.cl
+++ b/OpenCL/m07500_a0-optimized.cl
@@ -9,6 +9,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
@@ -450,7 +451,7 @@ KERNEL_FQ void m07500_m04 (KERN_ATTR_RULES_ESALT (krb5pa_t))
    * shared
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   LOCAL_AS RC4_KEY *rc4_key = &rc4_keys[lid];
 
@@ -556,7 +557,7 @@ KERNEL_FQ void m07500_s04 (KERN_ATTR_RULES_ESALT (krb5pa_t))
    * shared
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   LOCAL_AS RC4_KEY *rc4_key = &rc4_keys[lid];
 
diff --git a/OpenCL/m07500_a0-pure.cl b/OpenCL/m07500_a0-pure.cl
index 5540ccde5..a26f705c1 100644
--- a/OpenCL/m07500_a0-pure.cl
+++ b/OpenCL/m07500_a0-pure.cl
@@ -9,6 +9,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
@@ -295,7 +296,7 @@ KERNEL_FQ void m07500_mxx (KERN_ATTR_RULES_ESALT (krb5pa_t))
 
   COPY_PW (pws[gid]);
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   LOCAL_AS RC4_KEY *rc4_key = &rc4_keys[lid];
 
@@ -366,7 +367,7 @@ KERNEL_FQ void m07500_sxx (KERN_ATTR_RULES_ESALT (krb5pa_t))
 
   COPY_PW (pws[gid]);
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   LOCAL_AS RC4_KEY *rc4_key = &rc4_keys[lid];
 
diff --git a/OpenCL/m07500_a1-optimized.cl b/OpenCL/m07500_a1-optimized.cl
index 7b0746d91..ddcc7a51a 100644
--- a/OpenCL/m07500_a1-optimized.cl
+++ b/OpenCL/m07500_a1-optimized.cl
@@ -9,6 +9,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md4.cl"
@@ -448,7 +449,7 @@ KERNEL_FQ void m07500_m04 (KERN_ATTR_ESALT (krb5pa_t))
    * shared
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   LOCAL_AS RC4_KEY *rc4_key = &rc4_keys[lid];
 
@@ -604,7 +605,7 @@ KERNEL_FQ void m07500_s04 (KERN_ATTR_ESALT (krb5pa_t))
    * shared
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   LOCAL_AS RC4_KEY *rc4_key = &rc4_keys[lid];
 
diff --git a/OpenCL/m07500_a1-pure.cl b/OpenCL/m07500_a1-pure.cl
index 6ec806490..4a812ac9e 100644
--- a/OpenCL/m07500_a1-pure.cl
+++ b/OpenCL/m07500_a1-pure.cl
@@ -9,6 +9,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_hash_md4.cl"
 #include "inc_hash_md5.cl"
@@ -291,7 +292,7 @@ KERNEL_FQ void m07500_mxx (KERN_ATTR_ESALT (krb5pa_t))
    * base
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   LOCAL_AS RC4_KEY *rc4_key = &rc4_keys[lid];
 
@@ -360,7 +361,7 @@ KERNEL_FQ void m07500_sxx (KERN_ATTR_ESALT (krb5pa_t))
    * base
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   LOCAL_AS RC4_KEY *rc4_key = &rc4_keys[lid];
 
diff --git a/OpenCL/m07500_a3-optimized.cl b/OpenCL/m07500_a3-optimized.cl
index 5f51f887e..68e154486 100644
--- a/OpenCL/m07500_a3-optimized.cl
+++ b/OpenCL/m07500_a3-optimized.cl
@@ -9,6 +9,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md4.cl"
@@ -527,7 +528,7 @@ KERNEL_FQ void m07500_m04 (KERN_ATTR_ESALT (krb5pa_t))
    * main
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   LOCAL_AS RC4_KEY *rc4_key = &rc4_keys[lid];
 
@@ -579,7 +580,7 @@ KERNEL_FQ void m07500_m08 (KERN_ATTR_ESALT (krb5pa_t))
    * main
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   LOCAL_AS RC4_KEY *rc4_key = &rc4_keys[lid];
 
@@ -635,7 +636,7 @@ KERNEL_FQ void m07500_s04 (KERN_ATTR_ESALT (krb5pa_t))
    * main
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   LOCAL_AS RC4_KEY *rc4_key = &rc4_keys[lid];
 
@@ -687,7 +688,7 @@ KERNEL_FQ void m07500_s08 (KERN_ATTR_ESALT (krb5pa_t))
    * main
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   LOCAL_AS RC4_KEY *rc4_key = &rc4_keys[lid];
 
diff --git a/OpenCL/m07500_a3-pure.cl b/OpenCL/m07500_a3-pure.cl
index 02749c3e5..e6a6c4b01 100644
--- a/OpenCL/m07500_a3-pure.cl
+++ b/OpenCL/m07500_a3-pure.cl
@@ -9,6 +9,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_hash_md4.cl"
 #include "inc_hash_md5.cl"
@@ -312,7 +313,7 @@ KERNEL_FQ void m07500_mxx (KERN_ATTR_VECTOR_ESALT (krb5pa_t))
     w[idx] = pws[gid].i[idx];
   }
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   LOCAL_AS RC4_KEY *rc4_key = &rc4_keys[lid];
 
@@ -406,7 +407,7 @@ KERNEL_FQ void m07500_sxx (KERN_ATTR_VECTOR_ESALT (krb5pa_t))
     w[idx] = pws[gid].i[idx];
   }
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   LOCAL_AS RC4_KEY *rc4_key = &rc4_keys[lid];
 
diff --git a/OpenCL/m07700_a0-optimized.cl b/OpenCL/m07700_a0-optimized.cl
index 5a530f8db..2c633028b 100644
--- a/OpenCL/m07700_a0-optimized.cl
+++ b/OpenCL/m07700_a0-optimized.cl
@@ -9,6 +9,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
@@ -28,7 +29,7 @@
   (a)[((n)/4)+1]  = x >> 32;        \
 }
 
-CONSTANT_AS u32a sapb_trans_tbl[256] =
+CONSTANT_VK u32a sapb_trans_tbl[256] =
 {
   // first value hack for 0 byte as part of an optimization
   0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
@@ -49,7 +50,7 @@ CONSTANT_AS u32a sapb_trans_tbl[256] =
   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
 };
 
-CONSTANT_AS u32a bcodeArray[48] =
+CONSTANT_VK u32a bcodeArray[48] =
 {
   0x14, 0x77, 0xf3, 0xd4, 0xbb, 0x71, 0x23, 0xd0, 0x03, 0xff, 0x47, 0x93, 0x55, 0xaa, 0x66, 0x91,
   0xf2, 0x88, 0x6b, 0x99, 0xbf, 0xcb, 0x32, 0x1a, 0x19, 0xd9, 0xa7, 0x82, 0x22, 0x49, 0xa2, 0x51,
diff --git a/OpenCL/m07700_a1-optimized.cl b/OpenCL/m07700_a1-optimized.cl
index 86dd31267..525d9c257 100644
--- a/OpenCL/m07700_a1-optimized.cl
+++ b/OpenCL/m07700_a1-optimized.cl
@@ -9,6 +9,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
@@ -26,7 +27,7 @@
   (a)[((n)/4)+1]  = x >> 32;        \
 }
 
-CONSTANT_AS u32a sapb_trans_tbl[256] =
+CONSTANT_VK u32a sapb_trans_tbl[256] =
 {
   // first value hack for 0 byte as part of an optimization
   0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
@@ -47,7 +48,7 @@ CONSTANT_AS u32a sapb_trans_tbl[256] =
   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
 };
 
-CONSTANT_AS u32a bcodeArray[48] =
+CONSTANT_VK u32a bcodeArray[48] =
 {
   0x14, 0x77, 0xf3, 0xd4, 0xbb, 0x71, 0x23, 0xd0, 0x03, 0xff, 0x47, 0x93, 0x55, 0xaa, 0x66, 0x91,
   0xf2, 0x88, 0x6b, 0x99, 0xbf, 0xcb, 0x32, 0x1a, 0x19, 0xd9, 0xa7, 0x82, 0x22, 0x49, 0xa2, 0x51,
diff --git a/OpenCL/m07700_a3-optimized.cl b/OpenCL/m07700_a3-optimized.cl
index a3f3d7a62..78927041b 100644
--- a/OpenCL/m07700_a3-optimized.cl
+++ b/OpenCL/m07700_a3-optimized.cl
@@ -9,6 +9,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
@@ -17,7 +18,7 @@
 #define GETCHAR(a,p)  (((a)[(p) / 4] >> (((p) & 3) * 8)) & 0xff)
 #define PUTCHAR(a,p,c) ((a)[(p) / 4] = (((a)[(p) / 4] & ~(0xff << (((p) & 3) * 8))) | ((c) << (((p) & 3) * 8))))
 
-CONSTANT_AS u32a sapb_trans_tbl[256] =
+CONSTANT_VK u32a sapb_trans_tbl[256] =
 {
   // first value hack for 0 byte as part of an optimization
   0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
@@ -38,7 +39,7 @@ CONSTANT_AS u32a sapb_trans_tbl[256] =
   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
 };
 
-CONSTANT_AS u32a bcodeArray[48] =
+CONSTANT_VK u32a bcodeArray[48] =
 {
   0x14, 0x77, 0xf3, 0xd4, 0xbb, 0x71, 0x23, 0xd0, 0x03, 0xff, 0x47, 0x93, 0x55, 0xaa, 0x66, 0x91,
   0xf2, 0x88, 0x6b, 0x99, 0xbf, 0xcb, 0x32, 0x1a, 0x19, 0xd9, 0xa7, 0x82, 0x22, 0x49, 0xa2, 0x51,
diff --git a/OpenCL/m07701_a0-optimized.cl b/OpenCL/m07701_a0-optimized.cl
index 840842fb0..99fbb8183 100644
--- a/OpenCL/m07701_a0-optimized.cl
+++ b/OpenCL/m07701_a0-optimized.cl
@@ -9,6 +9,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
@@ -28,7 +29,7 @@
   (a)[((n)/4)+1]  = x >> 32;        \
 }
 
-CONSTANT_AS u32a sapb_trans_tbl[256] =
+CONSTANT_VK u32a sapb_trans_tbl[256] =
 {
   // first value hack for 0 byte as part of an optimization
   0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
@@ -49,7 +50,7 @@ CONSTANT_AS u32a sapb_trans_tbl[256] =
   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
 };
 
-CONSTANT_AS u32a bcodeArray[48] =
+CONSTANT_VK u32a bcodeArray[48] =
 {
   0x14, 0x77, 0xf3, 0xd4, 0xbb, 0x71, 0x23, 0xd0, 0x03, 0xff, 0x47, 0x93, 0x55, 0xaa, 0x66, 0x91,
   0xf2, 0x88, 0x6b, 0x99, 0xbf, 0xcb, 0x32, 0x1a, 0x19, 0xd9, 0xa7, 0x82, 0x22, 0x49, 0xa2, 0x51,
diff --git a/OpenCL/m07701_a1-optimized.cl b/OpenCL/m07701_a1-optimized.cl
index f73364987..04609d8e0 100644
--- a/OpenCL/m07701_a1-optimized.cl
+++ b/OpenCL/m07701_a1-optimized.cl
@@ -9,6 +9,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
@@ -26,7 +27,7 @@
   (a)[((n)/4)+1]  = x >> 32;        \
 }
 
-CONSTANT_AS u32a sapb_trans_tbl[256] =
+CONSTANT_VK u32a sapb_trans_tbl[256] =
 {
   // first value hack for 0 byte as part of an optimization
   0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
@@ -47,7 +48,7 @@ CONSTANT_AS u32a sapb_trans_tbl[256] =
   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
 };
 
-CONSTANT_AS u32a bcodeArray[48] =
+CONSTANT_VK u32a bcodeArray[48] =
 {
   0x14, 0x77, 0xf3, 0xd4, 0xbb, 0x71, 0x23, 0xd0, 0x03, 0xff, 0x47, 0x93, 0x55, 0xaa, 0x66, 0x91,
   0xf2, 0x88, 0x6b, 0x99, 0xbf, 0xcb, 0x32, 0x1a, 0x19, 0xd9, 0xa7, 0x82, 0x22, 0x49, 0xa2, 0x51,
diff --git a/OpenCL/m07701_a3-optimized.cl b/OpenCL/m07701_a3-optimized.cl
index b32c65f24..506ca491f 100644
--- a/OpenCL/m07701_a3-optimized.cl
+++ b/OpenCL/m07701_a3-optimized.cl
@@ -9,6 +9,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
@@ -17,7 +18,7 @@
 #define GETCHAR(a,p)  (((a)[(p) / 4] >> (((p) & 3) * 8)) & 0xff)
 #define PUTCHAR(a,p,c) ((a)[(p) / 4] = (((a)[(p) / 4] & ~(0xff << (((p) & 3) * 8))) | ((c) << (((p) & 3) * 8))))
 
-CONSTANT_AS u32a sapb_trans_tbl[256] =
+CONSTANT_VK u32a sapb_trans_tbl[256] =
 {
   // first value hack for 0 byte as part of an optimization
   0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
@@ -38,7 +39,7 @@ CONSTANT_AS u32a sapb_trans_tbl[256] =
   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
 };
 
-CONSTANT_AS u32a bcodeArray[48] =
+CONSTANT_VK u32a bcodeArray[48] =
 {
   0x14, 0x77, 0xf3, 0xd4, 0xbb, 0x71, 0x23, 0xd0, 0x03, 0xff, 0x47, 0x93, 0x55, 0xaa, 0x66, 0x91,
   0xf2, 0x88, 0x6b, 0x99, 0xbf, 0xcb, 0x32, 0x1a, 0x19, 0xd9, 0xa7, 0x82, 0x22, 0x49, 0xa2, 0x51,
diff --git a/OpenCL/m07800_a0-optimized.cl b/OpenCL/m07800_a0-optimized.cl
index 7c6951d9b..4d86b2858 100644
--- a/OpenCL/m07800_a0-optimized.cl
+++ b/OpenCL/m07800_a0-optimized.cl
@@ -9,6 +9,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
@@ -16,7 +17,7 @@
 #include "inc_hash_sha1.cl"
 #endif
 
-CONSTANT_AS u32a theMagicArray[64] =
+CONSTANT_VK u32a theMagicArray[64] =
 {
   0x91ac5114, 0x9f675443, 0x24e73be0, 0x28747bc2, 0x863313eb, 0x5a4fcb5c, 0x080a7337, 0x0e5d1c2f,
   0x338fe6e5, 0xf89baedd, 0x16f24b8d, 0x2ce1d4dc, 0xb0cbdf9d, 0xd4706d17, 0xf94d423f, 0x9b1b1194,
diff --git a/OpenCL/m07800_a1-optimized.cl b/OpenCL/m07800_a1-optimized.cl
index 856e32bdd..cc552143f 100644
--- a/OpenCL/m07800_a1-optimized.cl
+++ b/OpenCL/m07800_a1-optimized.cl
@@ -9,12 +9,13 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
 #endif
 
-CONSTANT_AS u32a theMagicArray[64] =
+CONSTANT_VK u32a theMagicArray[64] =
 {
   0x91ac5114, 0x9f675443, 0x24e73be0, 0x28747bc2, 0x863313eb, 0x5a4fcb5c, 0x080a7337, 0x0e5d1c2f,
   0x338fe6e5, 0xf89baedd, 0x16f24b8d, 0x2ce1d4dc, 0xb0cbdf9d, 0xd4706d17, 0xf94d423f, 0x9b1b1194,
diff --git a/OpenCL/m07800_a3-optimized.cl b/OpenCL/m07800_a3-optimized.cl
index 3e0363867..de22475f0 100644
--- a/OpenCL/m07800_a3-optimized.cl
+++ b/OpenCL/m07800_a3-optimized.cl
@@ -9,12 +9,13 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
 #endif
 
-CONSTANT_AS u32a theMagicArray[64] =
+CONSTANT_VK u32a theMagicArray[64] =
 {
   0x91ac5114, 0x9f675443, 0x24e73be0, 0x28747bc2, 0x863313eb, 0x5a4fcb5c, 0x080a7337, 0x0e5d1c2f,
   0x338fe6e5, 0xf89baedd, 0x16f24b8d, 0x2ce1d4dc, 0xb0cbdf9d, 0xd4706d17, 0xf94d423f, 0x9b1b1194,
diff --git a/OpenCL/m07801_a0-optimized.cl b/OpenCL/m07801_a0-optimized.cl
index 5a5b224a7..aeb9b95d0 100644
--- a/OpenCL/m07801_a0-optimized.cl
+++ b/OpenCL/m07801_a0-optimized.cl
@@ -9,6 +9,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
@@ -16,7 +17,7 @@
 #include "inc_hash_sha1.cl"
 #endif
 
-CONSTANT_AS u32a theMagicArray[64] =
+CONSTANT_VK u32a theMagicArray[64] =
 {
   0x91ac5114, 0x9f675443, 0x24e73be0, 0x28747bc2, 0x863313eb, 0x5a4fcb5c, 0x080a7337, 0x0e5d1c2f,
   0x338fe6e5, 0xf89baedd, 0x16f24b8d, 0x2ce1d4dc, 0xb0cbdf9d, 0xd4706d17, 0xf94d423f, 0x9b1b1194,
diff --git a/OpenCL/m07801_a1-optimized.cl b/OpenCL/m07801_a1-optimized.cl
index 52fbf44c8..8d055619c 100644
--- a/OpenCL/m07801_a1-optimized.cl
+++ b/OpenCL/m07801_a1-optimized.cl
@@ -9,12 +9,13 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
 #endif
 
-CONSTANT_AS u32a theMagicArray[64] =
+CONSTANT_VK u32a theMagicArray[64] =
 {
   0x91ac5114, 0x9f675443, 0x24e73be0, 0x28747bc2, 0x863313eb, 0x5a4fcb5c, 0x080a7337, 0x0e5d1c2f,
   0x338fe6e5, 0xf89baedd, 0x16f24b8d, 0x2ce1d4dc, 0xb0cbdf9d, 0xd4706d17, 0xf94d423f, 0x9b1b1194,
diff --git a/OpenCL/m07801_a3-optimized.cl b/OpenCL/m07801_a3-optimized.cl
index 47765cbbb..7de9f2a11 100644
--- a/OpenCL/m07801_a3-optimized.cl
+++ b/OpenCL/m07801_a3-optimized.cl
@@ -9,12 +9,13 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
 #endif
 
-CONSTANT_AS u32a theMagicArray[64] =
+CONSTANT_VK u32a theMagicArray[64] =
 {
   0x91ac5114, 0x9f675443, 0x24e73be0, 0x28747bc2, 0x863313eb, 0x5a4fcb5c, 0x080a7337, 0x0e5d1c2f,
   0x338fe6e5, 0xf89baedd, 0x16f24b8d, 0x2ce1d4dc, 0xb0cbdf9d, 0xd4706d17, 0xf94d423f, 0x9b1b1194,
diff --git a/OpenCL/m07900-pure.cl b/OpenCL/m07900-pure.cl
index a6eed5cb5..32b381d33 100644
--- a/OpenCL/m07900-pure.cl
+++ b/OpenCL/m07900-pure.cl
@@ -6,6 +6,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_hash_sha512.cl"
 #endif
diff --git a/OpenCL/m08000_a0-optimized.cl b/OpenCL/m08000_a0-optimized.cl
index c3f3e6edf..310bebbeb 100644
--- a/OpenCL/m08000_a0-optimized.cl
+++ b/OpenCL/m08000_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
@@ -85,6 +86,11 @@ DECLSPEC void sha256_transform_m (u32x *digest, const u32x *w)
 
   ROUND_STEP (0);
 
+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  #else
   #ifdef _unroll
   #pragma unroll
   #endif
@@ -92,6 +98,7 @@ DECLSPEC void sha256_transform_m (u32x *digest, const u32x *w)
   {
     ROUND_EXPAND (); ROUND_STEP (i);
   }
+  #endif
 
   digest[0] += a;
   digest[1] += b;
@@ -136,6 +143,11 @@ DECLSPEC void sha256_transform_z (u32x *digest)
 
   ROUND_STEP_Z (0);
 
+  #ifdef IS_CUDA
+  ROUND_STEP_Z (16);
+  ROUND_STEP_Z (32);
+  ROUND_STEP_Z (48);
+  #else
   #ifdef _unroll
   #pragma unroll
   #endif
@@ -143,6 +155,7 @@ DECLSPEC void sha256_transform_z (u32x *digest)
   {
     ROUND_STEP_Z (i);
   }
+  #endif
 
   digest[0] += a;
   digest[1] += b;
@@ -227,8 +240,8 @@ KERNEL_FQ void m08000_m04 (KERN_ATTR_RULES ())
    * precompute final msg blocks
    */
 
-  LOCAL_AS u32 w_s1[64];
-  LOCAL_AS u32 w_s2[64];
+  LOCAL_VK u32 w_s1[64];
+  LOCAL_VK u32 w_s2[64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
@@ -236,7 +249,7 @@ KERNEL_FQ void m08000_m04 (KERN_ATTR_RULES ())
     w_s2[i] = 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (lid == 0)
   {
@@ -264,7 +277,7 @@ KERNEL_FQ void m08000_m04 (KERN_ATTR_RULES ())
     }
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -398,8 +411,8 @@ KERNEL_FQ void m08000_s04 (KERN_ATTR_RULES ())
    * precompute final msg blocks
    */
 
-  LOCAL_AS u32 w_s1[64];
-  LOCAL_AS u32 w_s2[64];
+  LOCAL_VK u32 w_s1[64];
+  LOCAL_VK u32 w_s2[64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
@@ -407,7 +420,7 @@ KERNEL_FQ void m08000_s04 (KERN_ATTR_RULES ())
     w_s2[i] = 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (lid == 0)
   {
@@ -435,7 +448,7 @@ KERNEL_FQ void m08000_s04 (KERN_ATTR_RULES ())
     }
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m08000_a1-optimized.cl b/OpenCL/m08000_a1-optimized.cl
index defb4c2f0..89ea42a57 100644
--- a/OpenCL/m08000_a1-optimized.cl
+++ b/OpenCL/m08000_a1-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha256.cl"
@@ -83,6 +84,11 @@ DECLSPEC void sha256_transform_m (u32x *digest, const u32x *w)
 
   ROUND_STEP (0);
 
+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  #else
   #ifdef _unroll
   #pragma unroll
   #endif
@@ -90,6 +96,7 @@ DECLSPEC void sha256_transform_m (u32x *digest, const u32x *w)
   {
     ROUND_EXPAND (); ROUND_STEP (i);
   }
+  #endif
 
   digest[0] += a;
   digest[1] += b;
@@ -134,6 +141,11 @@ DECLSPEC void sha256_transform_z (u32x *digest)
 
   ROUND_STEP_Z (0);
 
+  #ifdef IS_CUDA
+  ROUND_STEP_Z (16);
+  ROUND_STEP_Z (32);
+  ROUND_STEP_Z (48);
+  #else
   #ifdef _unroll
   #pragma unroll
   #endif
@@ -141,6 +153,7 @@ DECLSPEC void sha256_transform_z (u32x *digest)
   {
     ROUND_STEP_Z (i);
   }
+  #endif
 
   digest[0] += a;
   digest[1] += b;
@@ -225,8 +238,8 @@ KERNEL_FQ void m08000_m04 (KERN_ATTR_BASIC ())
    * precompute final msg blocks
    */
 
-  LOCAL_AS u32 w_s1[64];
-  LOCAL_AS u32 w_s2[64];
+  LOCAL_VK u32 w_s1[64];
+  LOCAL_VK u32 w_s2[64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
@@ -234,7 +247,7 @@ KERNEL_FQ void m08000_m04 (KERN_ATTR_BASIC ())
     w_s2[i] = 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (lid == 0)
   {
@@ -262,7 +275,7 @@ KERNEL_FQ void m08000_m04 (KERN_ATTR_BASIC ())
     }
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -450,8 +463,8 @@ KERNEL_FQ void m08000_s04 (KERN_ATTR_BASIC ())
    * precompute final msg blocks
    */
 
-  LOCAL_AS u32 w_s1[64];
-  LOCAL_AS u32 w_s2[64];
+  LOCAL_VK u32 w_s1[64];
+  LOCAL_VK u32 w_s2[64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
@@ -459,7 +472,7 @@ KERNEL_FQ void m08000_s04 (KERN_ATTR_BASIC ())
     w_s2[i] = 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (lid == 0)
   {
@@ -487,7 +500,7 @@ KERNEL_FQ void m08000_s04 (KERN_ATTR_BASIC ())
     }
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m08000_a3-optimized.cl b/OpenCL/m08000_a3-optimized.cl
index 1439bce9e..fa76a3b72 100644
--- a/OpenCL/m08000_a3-optimized.cl
+++ b/OpenCL/m08000_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha256.cl"
@@ -83,6 +84,11 @@ DECLSPEC void sha256_transform_m (u32x *digest, const u32x *w)
 
   ROUND_STEP (0);
 
+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  #else
   #ifdef _unroll
   #pragma unroll
   #endif
@@ -90,6 +96,7 @@ DECLSPEC void sha256_transform_m (u32x *digest, const u32x *w)
   {
     ROUND_EXPAND (); ROUND_STEP (i);
   }
+  #endif
 
   digest[0] += a;
   digest[1] += b;
@@ -134,6 +141,11 @@ DECLSPEC void sha256_transform_z (u32x *digest)
 
   ROUND_STEP_Z (0);
 
+  #ifdef IS_CUDA
+  ROUND_STEP_Z (16);
+  ROUND_STEP_Z (32);
+  ROUND_STEP_Z (48);
+  #else
   #ifdef _unroll
   #pragma unroll
   #endif
@@ -141,6 +153,7 @@ DECLSPEC void sha256_transform_z (u32x *digest)
   {
     ROUND_STEP_Z (i);
   }
+  #endif
 
   digest[0] += a;
   digest[1] += b;
@@ -231,7 +244,7 @@ DECLSPEC void m08000m (LOCAL_AS u32 *w_s1, LOCAL_AS u32 *w_s2, u32 *w, const u32
     w_s2[i] = 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (lid == 0)
   {
@@ -259,7 +272,7 @@ DECLSPEC void m08000m (LOCAL_AS u32 *w_s1, LOCAL_AS u32 *w_s2, u32 *w, const u32
     }
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -353,7 +366,7 @@ DECLSPEC void m08000s (LOCAL_AS u32 *w_s1, LOCAL_AS u32 *w_s2, u32 *w, const u32
     w_s2[i] = 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (lid == 0)
   {
@@ -381,7 +394,7 @@ DECLSPEC void m08000s (LOCAL_AS u32 *w_s1, LOCAL_AS u32 *w_s2, u32 *w, const u32
     }
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -488,8 +501,8 @@ KERNEL_FQ void m08000_m04 (KERN_ATTR_VECTOR ())
 
   const u32 pw_len = pws[gid].pw_len & 63;
 
-  LOCAL_AS u32 w_s1[64];
-  LOCAL_AS u32 w_s2[64];
+  LOCAL_VK u32 w_s1[64];
+  LOCAL_VK u32 w_s2[64];
 
   /**
    * main
@@ -527,8 +540,8 @@ KERNEL_FQ void m08000_m08 (KERN_ATTR_VECTOR ())
 
   const u32 pw_len = pws[gid].pw_len & 63;
 
-  LOCAL_AS u32 w_s1[64];
-  LOCAL_AS u32 w_s2[64];
+  LOCAL_VK u32 w_s1[64];
+  LOCAL_VK u32 w_s2[64];
 
   /**
    * main
@@ -566,8 +579,8 @@ KERNEL_FQ void m08000_m16 (KERN_ATTR_VECTOR ())
 
   const u32 pw_len = pws[gid].pw_len & 63;
 
-  LOCAL_AS u32 w_s1[64];
-  LOCAL_AS u32 w_s2[64];
+  LOCAL_VK u32 w_s1[64];
+  LOCAL_VK u32 w_s2[64];
 
   /**
    * main
@@ -605,8 +618,8 @@ KERNEL_FQ void m08000_s04 (KERN_ATTR_VECTOR ())
 
   const u32 pw_len = pws[gid].pw_len & 63;
 
-  LOCAL_AS u32 w_s1[64];
-  LOCAL_AS u32 w_s2[64];
+  LOCAL_VK u32 w_s1[64];
+  LOCAL_VK u32 w_s2[64];
 
   /**
    * main
@@ -644,8 +657,8 @@ KERNEL_FQ void m08000_s08 (KERN_ATTR_VECTOR ())
 
   const u32 pw_len = pws[gid].pw_len & 63;
 
-  LOCAL_AS u32 w_s1[64];
-  LOCAL_AS u32 w_s2[64];
+  LOCAL_VK u32 w_s1[64];
+  LOCAL_VK u32 w_s2[64];
 
   /**
    * main
@@ -683,8 +696,8 @@ KERNEL_FQ void m08000_s16 (KERN_ATTR_VECTOR ())
 
   const u32 pw_len = pws[gid].pw_len & 63;
 
-  LOCAL_AS u32 w_s1[64];
-  LOCAL_AS u32 w_s2[64];
+  LOCAL_VK u32 w_s1[64];
+  LOCAL_VK u32 w_s2[64];
 
   /**
    * main
diff --git a/OpenCL/m08100_a0-optimized.cl b/OpenCL/m08100_a0-optimized.cl
index 5a3f1fb5b..95b650849 100644
--- a/OpenCL/m08100_a0-optimized.cl
+++ b/OpenCL/m08100_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
diff --git a/OpenCL/m08100_a0-pure.cl b/OpenCL/m08100_a0-pure.cl
index 09dfa8c41..d76790540 100644
--- a/OpenCL/m08100_a0-pure.cl
+++ b/OpenCL/m08100_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
diff --git a/OpenCL/m08100_a1-optimized.cl b/OpenCL/m08100_a1-optimized.cl
index 228e38c11..ccd25c064 100644
--- a/OpenCL/m08100_a1-optimized.cl
+++ b/OpenCL/m08100_a1-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m08100_a1-pure.cl b/OpenCL/m08100_a1-pure.cl
index 041171d16..f67f51120 100644
--- a/OpenCL/m08100_a1-pure.cl
+++ b/OpenCL/m08100_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m08100_a3-optimized.cl b/OpenCL/m08100_a3-optimized.cl
index 076a10dcf..4e2f5e8ad 100644
--- a/OpenCL/m08100_a3-optimized.cl
+++ b/OpenCL/m08100_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m08100_a3-pure.cl b/OpenCL/m08100_a3-pure.cl
index 2582bc358..3d5208ad3 100644
--- a/OpenCL/m08100_a3-pure.cl
+++ b/OpenCL/m08100_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m08200-pure.cl b/OpenCL/m08200-pure.cl
index 9106b0d7d..96429c223 100644
--- a/OpenCL/m08200-pure.cl
+++ b/OpenCL/m08200-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha256.cl"
diff --git a/OpenCL/m08300_a0-optimized.cl b/OpenCL/m08300_a0-optimized.cl
index 4591d7a24..d9461bd26 100644
--- a/OpenCL/m08300_a0-optimized.cl
+++ b/OpenCL/m08300_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
diff --git a/OpenCL/m08300_a0-pure.cl b/OpenCL/m08300_a0-pure.cl
index c0dfc084b..7bf1156fd 100644
--- a/OpenCL/m08300_a0-pure.cl
+++ b/OpenCL/m08300_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
diff --git a/OpenCL/m08300_a1-optimized.cl b/OpenCL/m08300_a1-optimized.cl
index 6f78de522..a669c739d 100644
--- a/OpenCL/m08300_a1-optimized.cl
+++ b/OpenCL/m08300_a1-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m08300_a1-pure.cl b/OpenCL/m08300_a1-pure.cl
index 9958fc4e8..42135dec1 100644
--- a/OpenCL/m08300_a1-pure.cl
+++ b/OpenCL/m08300_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m08300_a3-optimized.cl b/OpenCL/m08300_a3-optimized.cl
index 18faae59b..0e9ad838f 100644
--- a/OpenCL/m08300_a3-optimized.cl
+++ b/OpenCL/m08300_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m08300_a3-pure.cl b/OpenCL/m08300_a3-pure.cl
index cac746e6d..4215d42a0 100644
--- a/OpenCL/m08300_a3-pure.cl
+++ b/OpenCL/m08300_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m08400_a0-optimized.cl b/OpenCL/m08400_a0-optimized.cl
index 21a973218..ea97818be 100644
--- a/OpenCL/m08400_a0-optimized.cl
+++ b/OpenCL/m08400_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
@@ -16,15 +17,15 @@
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m08400_m04 (KERN_ATTR_RULES ())
@@ -41,7 +42,7 @@ KERNEL_FQ void m08400_m04 (KERN_ATTR_RULES ())
    * shared
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -52,7 +53,7 @@ KERNEL_FQ void m08400_m04 (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -295,7 +296,7 @@ KERNEL_FQ void m08400_s04 (KERN_ATTR_RULES ())
    * shared
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -306,7 +307,7 @@ KERNEL_FQ void m08400_s04 (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m08400_a0-pure.cl b/OpenCL/m08400_a0-pure.cl
index eb3ac3cae..b88e9aa8a 100644
--- a/OpenCL/m08400_a0-pure.cl
+++ b/OpenCL/m08400_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
@@ -16,15 +17,15 @@
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m08400_mxx (KERN_ATTR_RULES ())
@@ -41,7 +42,7 @@ KERNEL_FQ void m08400_mxx (KERN_ATTR_RULES ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -52,7 +53,7 @@ KERNEL_FQ void m08400_mxx (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -192,7 +193,7 @@ KERNEL_FQ void m08400_sxx (KERN_ATTR_RULES ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -203,7 +204,7 @@ KERNEL_FQ void m08400_sxx (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m08400_a1-optimized.cl b/OpenCL/m08400_a1-optimized.cl
index 77c9a78e9..5d1ef435c 100644
--- a/OpenCL/m08400_a1-optimized.cl
+++ b/OpenCL/m08400_a1-optimized.cl
@@ -8,21 +8,22 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m08400_m04 (KERN_ATTR_BASIC ())
@@ -39,7 +40,7 @@ KERNEL_FQ void m08400_m04 (KERN_ATTR_BASIC ())
    * shared
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -50,7 +51,7 @@ KERNEL_FQ void m08400_m04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -351,7 +352,7 @@ KERNEL_FQ void m08400_s04 (KERN_ATTR_BASIC ())
    * shared
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -362,7 +363,7 @@ KERNEL_FQ void m08400_s04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m08400_a1-pure.cl b/OpenCL/m08400_a1-pure.cl
index 67e3cb552..b799d8faf 100644
--- a/OpenCL/m08400_a1-pure.cl
+++ b/OpenCL/m08400_a1-pure.cl
@@ -8,21 +8,22 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_sha1.cl"
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m08400_mxx (KERN_ATTR_BASIC ())
@@ -39,7 +40,7 @@ KERNEL_FQ void m08400_mxx (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -50,7 +51,7 @@ KERNEL_FQ void m08400_mxx (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -188,7 +189,7 @@ KERNEL_FQ void m08400_sxx (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -199,7 +200,7 @@ KERNEL_FQ void m08400_sxx (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m08400_a3-optimized.cl b/OpenCL/m08400_a3-optimized.cl
index ba99799e7..96afc5f8a 100644
--- a/OpenCL/m08400_a3-optimized.cl
+++ b/OpenCL/m08400_a3-optimized.cl
@@ -8,21 +8,22 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 DECLSPEC void m08400m (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 pw_len, KERN_ATTR_BASIC (), LOCAL_AS u32 *l_bin2asc)
@@ -471,7 +472,7 @@ KERNEL_FQ void m08400_m04 (KERN_ATTR_BASIC ())
    * shared
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -482,7 +483,7 @@ KERNEL_FQ void m08400_m04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -541,7 +542,7 @@ KERNEL_FQ void m08400_m08 (KERN_ATTR_BASIC ())
    * shared
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -552,7 +553,7 @@ KERNEL_FQ void m08400_m08 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -611,7 +612,7 @@ KERNEL_FQ void m08400_m16 (KERN_ATTR_BASIC ())
    * shared
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -622,7 +623,7 @@ KERNEL_FQ void m08400_m16 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -681,7 +682,7 @@ KERNEL_FQ void m08400_s04 (KERN_ATTR_BASIC ())
    * shared
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -692,7 +693,7 @@ KERNEL_FQ void m08400_s04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -751,7 +752,7 @@ KERNEL_FQ void m08400_s08 (KERN_ATTR_BASIC ())
    * shared
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -762,7 +763,7 @@ KERNEL_FQ void m08400_s08 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -821,7 +822,7 @@ KERNEL_FQ void m08400_s16 (KERN_ATTR_BASIC ())
    * shared
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -832,7 +833,7 @@ KERNEL_FQ void m08400_s16 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m08400_a3-pure.cl b/OpenCL/m08400_a3-pure.cl
index 47847863c..0e224662d 100644
--- a/OpenCL/m08400_a3-pure.cl
+++ b/OpenCL/m08400_a3-pure.cl
@@ -8,21 +8,22 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m08400_mxx (KERN_ATTR_VECTOR ())
@@ -39,7 +40,7 @@ KERNEL_FQ void m08400_mxx (KERN_ATTR_VECTOR ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -50,7 +51,7 @@ KERNEL_FQ void m08400_mxx (KERN_ATTR_VECTOR ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -205,7 +206,7 @@ KERNEL_FQ void m08400_sxx (KERN_ATTR_VECTOR ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -216,7 +217,7 @@ KERNEL_FQ void m08400_sxx (KERN_ATTR_VECTOR ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m08500_a0-pure.cl b/OpenCL/m08500_a0-pure.cl
index 04cefa798..03f57c0ad 100644
--- a/OpenCL/m08500_a0-pure.cl
+++ b/OpenCL/m08500_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
@@ -52,7 +53,7 @@
   PERM_OP (l, r, tt,  4, 0x0f0f0f0f);  \
 }
 
-CONSTANT_AS u32a c_ascii_to_ebcdic_pc[256] =
+CONSTANT_VK u32a c_ascii_to_ebcdic_pc[256] =
 {
   // little hack, can't crack 0-bytes in password, but who cares
   //    0xab, 0xa8, 0xae, 0xad, 0xc4, 0xf1, 0xf7, 0xf4, 0x86, 0xa1, 0xe0, 0xbc, 0xb3, 0xb0, 0xb6, 0xb5,
@@ -74,7 +75,7 @@ CONSTANT_AS u32a c_ascii_to_ebcdic_pc[256] =
   0x13, 0x10, 0x16, 0x15, 0x7f, 0x7c, 0x73, 0x70, 0x76, 0x75, 0x5e, 0x5d, 0x52, 0x51, 0x57, 0x54,
 };
 
-CONSTANT_AS u32a c_SPtrans[8][64] =
+CONSTANT_VK u32a c_SPtrans[8][64] =
 {
   {
     0x02080800, 0x00080000, 0x02000002, 0x02080802,
@@ -222,7 +223,7 @@ CONSTANT_AS u32a c_SPtrans[8][64] =
   }
 };
 
-CONSTANT_AS u32a c_skb[8][64] =
+CONSTANT_VK u32a c_skb[8][64] =
 {
   {
     0x00000000, 0x00000010, 0x20000000, 0x20000010,
@@ -373,25 +374,25 @@ CONSTANT_AS u32a c_skb[8][64] =
 #if   VECT_SIZE == 1
 #define BOX(i,n,S) (S)[(n)][(i)]
 #elif VECT_SIZE == 2
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1])
 #elif VECT_SIZE == 4
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3])
 #elif VECT_SIZE == 8
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7])
 #elif VECT_SIZE == 16
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf])
 #endif
 
 #if   VECT_SIZE == 1
 #define BOX1(i,S) (S)[(i)]
 #elif VECT_SIZE == 2
-#define BOX1(i,S) (u32x) ((S)[(i).s0], (S)[(i).s1])
+#define BOX1(i,S) make_u32x ((S)[(i).s0], (S)[(i).s1])
 #elif VECT_SIZE == 4
-#define BOX1(i,S) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3])
+#define BOX1(i,S) make_u32x ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3])
 #elif VECT_SIZE == 8
-#define BOX1(i,S) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7])
+#define BOX1(i,S) make_u32x ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7])
 #elif VECT_SIZE == 16
-#define BOX1(i,S) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7], (S)[(i).s8], (S)[(i).s9], (S)[(i).sa], (S)[(i).sb], (S)[(i).sc], (S)[(i).sd], (S)[(i).se], (S)[(i).sf])
+#define BOX1(i,S) make_u32x ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7], (S)[(i).s8], (S)[(i).s9], (S)[(i).sa], (S)[(i).sb], (S)[(i).sc], (S)[(i).sd], (S)[(i).se], (S)[(i).sf])
 #endif
 
 DECLSPEC void _des_crypt_encrypt (u32 *iv, u32x *data, u32x *Kc, u32x *Kd, LOCAL_AS u32 (*s_SPtrans)[64])
@@ -535,8 +536,8 @@ KERNEL_FQ void m08500_mxx (KERN_ATTR_RULES ())
    * shared
    */
 
-  LOCAL_AS u32 s_SPtrans[8][64];
-  LOCAL_AS u32 s_skb[8][64];
+  LOCAL_VK u32 s_SPtrans[8][64];
+  LOCAL_VK u32 s_skb[8][64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
@@ -559,7 +560,7 @@ KERNEL_FQ void m08500_mxx (KERN_ATTR_RULES ())
     s_skb[7][i] = c_skb[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -633,8 +634,8 @@ KERNEL_FQ void m08500_sxx (KERN_ATTR_RULES ())
    * shared
    */
 
-  LOCAL_AS u32 s_SPtrans[8][64];
-  LOCAL_AS u32 s_skb[8][64];
+  LOCAL_VK u32 s_SPtrans[8][64];
+  LOCAL_VK u32 s_skb[8][64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
@@ -657,7 +658,7 @@ KERNEL_FQ void m08500_sxx (KERN_ATTR_RULES ())
     s_skb[7][i] = c_skb[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m08500_a1-pure.cl b/OpenCL/m08500_a1-pure.cl
index d7b02244b..2ca27bd1f 100644
--- a/OpenCL/m08500_a1-pure.cl
+++ b/OpenCL/m08500_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #endif
@@ -50,7 +51,7 @@
   PERM_OP (l, r, tt,  4, 0x0f0f0f0f);  \
 }
 
-CONSTANT_AS u32a c_ascii_to_ebcdic_pc[256] =
+CONSTANT_VK u32a c_ascii_to_ebcdic_pc[256] =
 {
   // little hack, can't crack 0-bytes in password, but who cares
   //    0xab, 0xa8, 0xae, 0xad, 0xc4, 0xf1, 0xf7, 0xf4, 0x86, 0xa1, 0xe0, 0xbc, 0xb3, 0xb0, 0xb6, 0xb5,
@@ -72,7 +73,7 @@ CONSTANT_AS u32a c_ascii_to_ebcdic_pc[256] =
   0x13, 0x10, 0x16, 0x15, 0x7f, 0x7c, 0x73, 0x70, 0x76, 0x75, 0x5e, 0x5d, 0x52, 0x51, 0x57, 0x54,
 };
 
-CONSTANT_AS u32a c_SPtrans[8][64] =
+CONSTANT_VK u32a c_SPtrans[8][64] =
 {
   {
     0x02080800, 0x00080000, 0x02000002, 0x02080802,
@@ -220,7 +221,7 @@ CONSTANT_AS u32a c_SPtrans[8][64] =
   }
 };
 
-CONSTANT_AS u32a c_skb[8][64] =
+CONSTANT_VK u32a c_skb[8][64] =
 {
   {
     0x00000000, 0x00000010, 0x20000000, 0x20000010,
@@ -371,25 +372,25 @@ CONSTANT_AS u32a c_skb[8][64] =
 #if   VECT_SIZE == 1
 #define BOX(i,n,S) (S)[(n)][(i)]
 #elif VECT_SIZE == 2
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1])
 #elif VECT_SIZE == 4
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3])
 #elif VECT_SIZE == 8
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7])
 #elif VECT_SIZE == 16
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf])
 #endif
 
 #if   VECT_SIZE == 1
 #define BOX1(i,S) (S)[(i)]
 #elif VECT_SIZE == 2
-#define BOX1(i,S) (u32x) ((S)[(i).s0], (S)[(i).s1])
+#define BOX1(i,S) make_u32x ((S)[(i).s0], (S)[(i).s1])
 #elif VECT_SIZE == 4
-#define BOX1(i,S) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3])
+#define BOX1(i,S) make_u32x ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3])
 #elif VECT_SIZE == 8
-#define BOX1(i,S) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7])
+#define BOX1(i,S) make_u32x ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7])
 #elif VECT_SIZE == 16
-#define BOX1(i,S) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7], (S)[(i).s8], (S)[(i).s9], (S)[(i).sa], (S)[(i).sb], (S)[(i).sc], (S)[(i).sd], (S)[(i).se], (S)[(i).sf])
+#define BOX1(i,S) make_u32x ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7], (S)[(i).s8], (S)[(i).s9], (S)[(i).sa], (S)[(i).sb], (S)[(i).sc], (S)[(i).sd], (S)[(i).se], (S)[(i).sf])
 #endif
 
 DECLSPEC void _des_crypt_encrypt (u32 *iv, u32x *data, u32x *Kc, u32x *Kd, LOCAL_AS u32 (*s_SPtrans)[64])
@@ -533,8 +534,8 @@ KERNEL_FQ void m08500_mxx (KERN_ATTR_BASIC ())
    * shared
    */
 
-  LOCAL_AS u32 s_SPtrans[8][64];
-  LOCAL_AS u32 s_skb[8][64];
+  LOCAL_VK u32 s_SPtrans[8][64];
+  LOCAL_VK u32 s_skb[8][64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
@@ -557,7 +558,7 @@ KERNEL_FQ void m08500_mxx (KERN_ATTR_BASIC ())
     s_skb[7][i] = c_skb[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -689,8 +690,8 @@ KERNEL_FQ void m08500_sxx (KERN_ATTR_BASIC ())
    * shared
    */
 
-  LOCAL_AS u32 s_SPtrans[8][64];
-  LOCAL_AS u32 s_skb[8][64];
+  LOCAL_VK u32 s_SPtrans[8][64];
+  LOCAL_VK u32 s_skb[8][64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
@@ -713,7 +714,7 @@ KERNEL_FQ void m08500_sxx (KERN_ATTR_BASIC ())
     s_skb[7][i] = c_skb[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m08500_a3-pure.cl b/OpenCL/m08500_a3-pure.cl
index 8ae9721c5..36f27f2fd 100644
--- a/OpenCL/m08500_a3-pure.cl
+++ b/OpenCL/m08500_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #endif
@@ -50,7 +51,7 @@
   PERM_OP (l, r, tt,  4, 0x0f0f0f0f);  \
 }
 
-CONSTANT_AS u32a c_ascii_to_ebcdic_pc[256] =
+CONSTANT_VK u32a c_ascii_to_ebcdic_pc[256] =
 {
   // little hack, can't crack 0-bytes in password, but who cares
   //    0xab, 0xa8, 0xae, 0xad, 0xc4, 0xf1, 0xf7, 0xf4, 0x86, 0xa1, 0xe0, 0xbc, 0xb3, 0xb0, 0xb6, 0xb5,
@@ -72,7 +73,7 @@ CONSTANT_AS u32a c_ascii_to_ebcdic_pc[256] =
   0x13, 0x10, 0x16, 0x15, 0x7f, 0x7c, 0x73, 0x70, 0x76, 0x75, 0x5e, 0x5d, 0x52, 0x51, 0x57, 0x54,
 };
 
-CONSTANT_AS u32a c_SPtrans[8][64] =
+CONSTANT_VK u32a c_SPtrans[8][64] =
 {
   {
     0x02080800, 0x00080000, 0x02000002, 0x02080802,
@@ -220,7 +221,7 @@ CONSTANT_AS u32a c_SPtrans[8][64] =
   }
 };
 
-CONSTANT_AS u32a c_skb[8][64] =
+CONSTANT_VK u32a c_skb[8][64] =
 {
   {
     0x00000000, 0x00000010, 0x20000000, 0x20000010,
@@ -371,25 +372,25 @@ CONSTANT_AS u32a c_skb[8][64] =
 #if   VECT_SIZE == 1
 #define BOX(i,n,S) (S)[(n)][(i)]
 #elif VECT_SIZE == 2
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1])
 #elif VECT_SIZE == 4
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3])
 #elif VECT_SIZE == 8
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7])
 #elif VECT_SIZE == 16
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf])
 #endif
 
 #if   VECT_SIZE == 1
 #define BOX1(i,S) (S)[(i)]
 #elif VECT_SIZE == 2
-#define BOX1(i,S) (u32x) ((S)[(i).s0], (S)[(i).s1])
+#define BOX1(i,S) make_u32x ((S)[(i).s0], (S)[(i).s1])
 #elif VECT_SIZE == 4
-#define BOX1(i,S) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3])
+#define BOX1(i,S) make_u32x ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3])
 #elif VECT_SIZE == 8
-#define BOX1(i,S) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7])
+#define BOX1(i,S) make_u32x ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7])
 #elif VECT_SIZE == 16
-#define BOX1(i,S) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7], (S)[(i).s8], (S)[(i).s9], (S)[(i).sa], (S)[(i).sb], (S)[(i).sc], (S)[(i).sd], (S)[(i).se], (S)[(i).sf])
+#define BOX1(i,S) make_u32x ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7], (S)[(i).s8], (S)[(i).s9], (S)[(i).sa], (S)[(i).sb], (S)[(i).sc], (S)[(i).sd], (S)[(i).se], (S)[(i).sf])
 #endif
 
 DECLSPEC void _des_crypt_encrypt (u32x *iv, u32x *data, u32x *Kc, u32x *Kd, LOCAL_AS u32 (*s_SPtrans)[64])
@@ -671,8 +672,8 @@ KERNEL_FQ void m08500_mxx (KERN_ATTR_VECTOR ())
    * shared
    */
 
-  LOCAL_AS u32 s_SPtrans[8][64];
-  LOCAL_AS u32 s_skb[8][64];
+  LOCAL_VK u32 s_SPtrans[8][64];
+  LOCAL_VK u32 s_skb[8][64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
@@ -695,7 +696,7 @@ KERNEL_FQ void m08500_mxx (KERN_ATTR_VECTOR ())
     s_skb[7][i] = c_skb[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -745,8 +746,8 @@ KERNEL_FQ void m08500_sxx (KERN_ATTR_VECTOR ())
    * shared
    */
 
-  LOCAL_AS u32 s_SPtrans[8][64];
-  LOCAL_AS u32 s_skb[8][64];
+  LOCAL_VK u32 s_SPtrans[8][64];
+  LOCAL_VK u32 s_skb[8][64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
@@ -769,7 +770,7 @@ KERNEL_FQ void m08500_sxx (KERN_ATTR_VECTOR ())
     s_skb[7][i] = c_skb[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m08600_a0-pure.cl b/OpenCL/m08600_a0-pure.cl
index c88b30787..486362174 100644
--- a/OpenCL/m08600_a0-pure.cl
+++ b/OpenCL/m08600_a0-pure.cl
@@ -9,13 +9,14 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
 #include "inc_simd.cl"
 #endif
 
-CONSTANT_AS u32a lotus_magic_table[256] =
+CONSTANT_VK u32a lotus_magic_table[256] =
 {
   0xbd, 0x56, 0xea, 0xf2, 0xa2, 0xf1, 0xac, 0x2a,
   0xb0, 0x93, 0xd1, 0x9c, 0x1b, 0x33, 0xfd, 0xd0,
@@ -54,13 +55,13 @@ CONSTANT_AS u32a lotus_magic_table[256] =
 #if   VECT_SIZE == 1
 #define BOX1(S,i) (S)[(i)]
 #elif VECT_SIZE == 2
-#define BOX1(S,i) (u32x) ((S)[(i).s0], (S)[(i).s1])
+#define BOX1(S,i) make_u32x ((S)[(i).s0], (S)[(i).s1])
 #elif VECT_SIZE == 4
-#define BOX1(S,i) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3])
+#define BOX1(S,i) make_u32x ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3])
 #elif VECT_SIZE == 8
-#define BOX1(S,i) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7])
+#define BOX1(S,i) make_u32x ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7])
 #elif VECT_SIZE == 16
-#define BOX1(S,i) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7], (S)[(i).s8], (S)[(i).s9], (S)[(i).sa], (S)[(i).sb], (S)[(i).sc], (S)[(i).sd], (S)[(i).se], (S)[(i).sf])
+#define BOX1(S,i) make_u32x ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7], (S)[(i).s8], (S)[(i).s9], (S)[(i).sa], (S)[(i).sb], (S)[(i).sc], (S)[(i).sd], (S)[(i).se], (S)[(i).sf])
 #endif
 
 DECLSPEC void lotus_mix (u32 *in, LOCAL_AS u32 *s_lotus_magic_table)
@@ -243,14 +244,14 @@ KERNEL_FQ void m08600_mxx (KERN_ATTR_RULES ())
    * sbox
    */
 
-  LOCAL_AS u32 s_lotus_magic_table[256];
+  LOCAL_VK u32 s_lotus_magic_table[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
     s_lotus_magic_table[i] = lotus_magic_table[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -307,14 +308,14 @@ KERNEL_FQ void m08600_sxx (KERN_ATTR_RULES ())
    * sbox
    */
 
-  LOCAL_AS u32 s_lotus_magic_table[256];
+  LOCAL_VK u32 s_lotus_magic_table[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
     s_lotus_magic_table[i] = lotus_magic_table[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m08600_a1-pure.cl b/OpenCL/m08600_a1-pure.cl
index 5253d21dd..1260a50cf 100644
--- a/OpenCL/m08600_a1-pure.cl
+++ b/OpenCL/m08600_a1-pure.cl
@@ -9,11 +9,12 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #endif
 
-CONSTANT_AS u32a lotus_magic_table[256] =
+CONSTANT_VK u32a lotus_magic_table[256] =
 {
   0xbd, 0x56, 0xea, 0xf2, 0xa2, 0xf1, 0xac, 0x2a,
   0xb0, 0x93, 0xd1, 0x9c, 0x1b, 0x33, 0xfd, 0xd0,
@@ -52,13 +53,13 @@ CONSTANT_AS u32a lotus_magic_table[256] =
 #if   VECT_SIZE == 1
 #define BOX1(S,i) (S)[(i)]
 #elif VECT_SIZE == 2
-#define BOX1(S,i) (u32x) ((S)[(i).s0], (S)[(i).s1])
+#define BOX1(S,i) make_u32x ((S)[(i).s0], (S)[(i).s1])
 #elif VECT_SIZE == 4
-#define BOX1(S,i) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3])
+#define BOX1(S,i) make_u32x ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3])
 #elif VECT_SIZE == 8
-#define BOX1(S,i) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7])
+#define BOX1(S,i) make_u32x ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7])
 #elif VECT_SIZE == 16
-#define BOX1(S,i) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7], (S)[(i).s8], (S)[(i).s9], (S)[(i).sa], (S)[(i).sb], (S)[(i).sc], (S)[(i).sd], (S)[(i).se], (S)[(i).sf])
+#define BOX1(S,i) make_u32x ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7], (S)[(i).s8], (S)[(i).s9], (S)[(i).sa], (S)[(i).sb], (S)[(i).sc], (S)[(i).sd], (S)[(i).se], (S)[(i).sf])
 #endif
 
 DECLSPEC void lotus_mix (u32 *in, LOCAL_AS u32 *s_lotus_magic_table)
@@ -241,14 +242,14 @@ KERNEL_FQ void m08600_mxx (KERN_ATTR_BASIC ())
    * sbox
    */
 
-  LOCAL_AS u32 s_lotus_magic_table[256];
+  LOCAL_VK u32 s_lotus_magic_table[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
     s_lotus_magic_table[i] = lotus_magic_table[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -365,14 +366,14 @@ KERNEL_FQ void m08600_sxx (KERN_ATTR_BASIC ())
    * sbox
    */
 
-  LOCAL_AS u32 s_lotus_magic_table[256];
+  LOCAL_VK u32 s_lotus_magic_table[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
     s_lotus_magic_table[i] = lotus_magic_table[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m08600_a3-pure.cl b/OpenCL/m08600_a3-pure.cl
index cd596b5c0..fc58fcc34 100644
--- a/OpenCL/m08600_a3-pure.cl
+++ b/OpenCL/m08600_a3-pure.cl
@@ -8,11 +8,12 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #endif
 
-CONSTANT_AS u32a lotus_magic_table[256] =
+CONSTANT_VK u32a lotus_magic_table[256] =
 {
   0xbd, 0x56, 0xea, 0xf2, 0xa2, 0xf1, 0xac, 0x2a,
   0xb0, 0x93, 0xd1, 0x9c, 0x1b, 0x33, 0xfd, 0xd0,
@@ -51,13 +52,13 @@ CONSTANT_AS u32a lotus_magic_table[256] =
 #if   VECT_SIZE == 1
 #define BOX1(S,i) (S)[(i)]
 #elif VECT_SIZE == 2
-#define BOX1(S,i) (u32x) ((S)[(i).s0], (S)[(i).s1])
+#define BOX1(S,i) make_u32x ((S)[(i).s0], (S)[(i).s1])
 #elif VECT_SIZE == 4
-#define BOX1(S,i) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3])
+#define BOX1(S,i) make_u32x ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3])
 #elif VECT_SIZE == 8
-#define BOX1(S,i) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7])
+#define BOX1(S,i) make_u32x ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7])
 #elif VECT_SIZE == 16
-#define BOX1(S,i) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7], (S)[(i).s8], (S)[(i).s9], (S)[(i).sa], (S)[(i).sb], (S)[(i).sc], (S)[(i).sd], (S)[(i).se], (S)[(i).sf])
+#define BOX1(S,i) make_u32x ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7], (S)[(i).s8], (S)[(i).s9], (S)[(i).sa], (S)[(i).sb], (S)[(i).sc], (S)[(i).sd], (S)[(i).se], (S)[(i).sf])
 #endif
 
 DECLSPEC void lotus_mix (u32x *in, LOCAL_AS u32 *s_lotus_magic_table)
@@ -346,14 +347,14 @@ KERNEL_FQ void m08600_mxx (KERN_ATTR_VECTOR ())
    * sbox
    */
 
-  LOCAL_AS u32 s_lotus_magic_table[256];
+  LOCAL_VK u32 s_lotus_magic_table[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
     s_lotus_magic_table[i] = lotus_magic_table[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -403,14 +404,14 @@ KERNEL_FQ void m08600_sxx (KERN_ATTR_VECTOR ())
    * sbox
    */
 
-  LOCAL_AS u32 s_lotus_magic_table[256];
+  LOCAL_VK u32 s_lotus_magic_table[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
     s_lotus_magic_table[i] = lotus_magic_table[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m08700_a0-optimized.cl b/OpenCL/m08700_a0-optimized.cl
index 6cb6a3b9f..2922cfb39 100644
--- a/OpenCL/m08700_a0-optimized.cl
+++ b/OpenCL/m08700_a0-optimized.cl
@@ -9,13 +9,14 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
 #include "inc_simd.cl"
 #endif
 
-CONSTANT_AS u32a lotus_magic_table[256] =
+CONSTANT_VK u32a lotus_magic_table[256] =
 {
   0xbd, 0x56, 0xea, 0xf2, 0xa2, 0xf1, 0xac, 0x2a,
   0xb0, 0x93, 0xd1, 0x9c, 0x1b, 0x33, 0xfd, 0xd0,
@@ -52,27 +53,27 @@ CONSTANT_AS u32a lotus_magic_table[256] =
 };
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_upper8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_upper8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_upper8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_upper8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_upper8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 #if   VECT_SIZE == 1
 #define BOX1(S,i) (S)[(i)]
 #elif VECT_SIZE == 2
-#define BOX1(S,i) (u32x) ((S)[(i).s0], (S)[(i).s1])
+#define BOX1(S,i) make_u32x ((S)[(i).s0], (S)[(i).s1])
 #elif VECT_SIZE == 4
-#define BOX1(S,i) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3])
+#define BOX1(S,i) make_u32x ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3])
 #elif VECT_SIZE == 8
-#define BOX1(S,i) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7])
+#define BOX1(S,i) make_u32x ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7])
 #elif VECT_SIZE == 16
-#define BOX1(S,i) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7], (S)[(i).s8], (S)[(i).s9], (S)[(i).sa], (S)[(i).sb], (S)[(i).sc], (S)[(i).sd], (S)[(i).se], (S)[(i).sf])
+#define BOX1(S,i) make_u32x ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7], (S)[(i).s8], (S)[(i).s9], (S)[(i).sa], (S)[(i).sb], (S)[(i).sc], (S)[(i).sd], (S)[(i).se], (S)[(i).sf])
 #endif
 
 DECLSPEC void lotus_mix (u32x *in, LOCAL_AS u32 *s_lotus_magic_table)
@@ -280,14 +281,14 @@ KERNEL_FQ void m08700_m04 (KERN_ATTR_RULES ())
    * sbox
    */
 
-  LOCAL_AS u32 s_lotus_magic_table[256];
+  LOCAL_VK u32 s_lotus_magic_table[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
     s_lotus_magic_table[i] = lotus_magic_table[i];
   }
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -298,7 +299,7 @@ KERNEL_FQ void m08700_m04 (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'A' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -467,14 +468,14 @@ KERNEL_FQ void m08700_s04 (KERN_ATTR_RULES ())
    * sbox
    */
 
-  LOCAL_AS u32 s_lotus_magic_table[256];
+  LOCAL_VK u32 s_lotus_magic_table[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
     s_lotus_magic_table[i] = lotus_magic_table[i];
   }
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -485,7 +486,7 @@ KERNEL_FQ void m08700_s04 (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'A' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m08700_a1-optimized.cl b/OpenCL/m08700_a1-optimized.cl
index 2041e3f5d..02d3873c3 100644
--- a/OpenCL/m08700_a1-optimized.cl
+++ b/OpenCL/m08700_a1-optimized.cl
@@ -9,11 +9,12 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #endif
 
-CONSTANT_AS u32a lotus_magic_table[256] =
+CONSTANT_VK u32a lotus_magic_table[256] =
 {
   0xbd, 0x56, 0xea, 0xf2, 0xa2, 0xf1, 0xac, 0x2a,
   0xb0, 0x93, 0xd1, 0x9c, 0x1b, 0x33, 0xfd, 0xd0,
@@ -50,27 +51,27 @@ CONSTANT_AS u32a lotus_magic_table[256] =
 };
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_upper8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_upper8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_upper8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_upper8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_upper8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 #if   VECT_SIZE == 1
 #define BOX1(S,i) (S)[(i)]
 #elif VECT_SIZE == 2
-#define BOX1(S,i) (u32x) ((S)[(i).s0], (S)[(i).s1])
+#define BOX1(S,i) make_u32x ((S)[(i).s0], (S)[(i).s1])
 #elif VECT_SIZE == 4
-#define BOX1(S,i) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3])
+#define BOX1(S,i) make_u32x ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3])
 #elif VECT_SIZE == 8
-#define BOX1(S,i) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7])
+#define BOX1(S,i) make_u32x ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7])
 #elif VECT_SIZE == 16
-#define BOX1(S,i) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7], (S)[(i).s8], (S)[(i).s9], (S)[(i).sa], (S)[(i).sb], (S)[(i).sc], (S)[(i).sd], (S)[(i).se], (S)[(i).sf])
+#define BOX1(S,i) make_u32x ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7], (S)[(i).s8], (S)[(i).s9], (S)[(i).sa], (S)[(i).sb], (S)[(i).sc], (S)[(i).sd], (S)[(i).se], (S)[(i).sf])
 #endif
 
 DECLSPEC void lotus_mix (u32x *in, LOCAL_AS u32 *s_lotus_magic_table)
@@ -278,14 +279,14 @@ KERNEL_FQ void m08700_m04 (KERN_ATTR_BASIC ())
    * sbox
    */
 
-  LOCAL_AS u32 s_lotus_magic_table[256];
+  LOCAL_VK u32 s_lotus_magic_table[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
     s_lotus_magic_table[i] = lotus_magic_table[i];
   }
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -296,7 +297,7 @@ KERNEL_FQ void m08700_m04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'A' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -525,14 +526,14 @@ KERNEL_FQ void m08700_s04 (KERN_ATTR_BASIC ())
    * sbox
    */
 
-  LOCAL_AS u32 s_lotus_magic_table[256];
+  LOCAL_VK u32 s_lotus_magic_table[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
     s_lotus_magic_table[i] = lotus_magic_table[i];
   }
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -543,7 +544,7 @@ KERNEL_FQ void m08700_s04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'A' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m08700_a3-optimized.cl b/OpenCL/m08700_a3-optimized.cl
index 897bc16c5..f15b3722d 100644
--- a/OpenCL/m08700_a3-optimized.cl
+++ b/OpenCL/m08700_a3-optimized.cl
@@ -8,11 +8,12 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #endif
 
-CONSTANT_AS u32a lotus_magic_table[256] =
+CONSTANT_VK u32a lotus_magic_table[256] =
 {
   0xbd, 0x56, 0xea, 0xf2, 0xa2, 0xf1, 0xac, 0x2a,
   0xb0, 0x93, 0xd1, 0x9c, 0x1b, 0x33, 0xfd, 0xd0,
@@ -51,27 +52,27 @@ CONSTANT_AS u32a lotus_magic_table[256] =
 #define BOX(S,i) (S)[(i)]
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_upper8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_upper8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_upper8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_upper8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_upper8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 #if   VECT_SIZE == 1
 #define BOX1(S,i) (S)[(i)]
 #elif VECT_SIZE == 2
-#define BOX1(S,i) (u32x) ((S)[(i).s0], (S)[(i).s1])
+#define BOX1(S,i) make_u32x ((S)[(i).s0], (S)[(i).s1])
 #elif VECT_SIZE == 4
-#define BOX1(S,i) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3])
+#define BOX1(S,i) make_u32x ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3])
 #elif VECT_SIZE == 8
-#define BOX1(S,i) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7])
+#define BOX1(S,i) make_u32x ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7])
 #elif VECT_SIZE == 16
-#define BOX1(S,i) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7], (S)[(i).s8], (S)[(i).s9], (S)[(i).sa], (S)[(i).sb], (S)[(i).sc], (S)[(i).sd], (S)[(i).se], (S)[(i).sf])
+#define BOX1(S,i) make_u32x ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7], (S)[(i).s8], (S)[(i).s9], (S)[(i).sa], (S)[(i).sb], (S)[(i).sc], (S)[(i).sd], (S)[(i).se], (S)[(i).sf])
 #endif
 
 DECLSPEC void lotus_mix (u32x *in, LOCAL_AS u32 *s_lotus_magic_table)
@@ -557,14 +558,14 @@ KERNEL_FQ void m08700_m04 (KERN_ATTR_VECTOR ())
    * sbox
    */
 
-  LOCAL_AS u32 s_lotus_magic_table[256];
+  LOCAL_VK u32 s_lotus_magic_table[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
     s_lotus_magic_table[i] = lotus_magic_table[i];
   }
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -575,7 +576,7 @@ KERNEL_FQ void m08700_m04 (KERN_ATTR_VECTOR ())
                  | ((i1 < 10) ? '0' + i1 : 'A' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -625,14 +626,14 @@ KERNEL_FQ void m08700_m08 (KERN_ATTR_VECTOR ())
    * sbox
    */
 
-  LOCAL_AS u32 s_lotus_magic_table[256];
+  LOCAL_VK u32 s_lotus_magic_table[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
     s_lotus_magic_table[i] = lotus_magic_table[i];
   }
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -643,7 +644,7 @@ KERNEL_FQ void m08700_m08 (KERN_ATTR_VECTOR ())
                  | ((i1 < 10) ? '0' + i1 : 'A' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -693,14 +694,14 @@ KERNEL_FQ void m08700_m16 (KERN_ATTR_VECTOR ())
    * sbox
    */
 
-  LOCAL_AS u32 s_lotus_magic_table[256];
+  LOCAL_VK u32 s_lotus_magic_table[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
     s_lotus_magic_table[i] = lotus_magic_table[i];
   }
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -711,7 +712,7 @@ KERNEL_FQ void m08700_m16 (KERN_ATTR_VECTOR ())
                  | ((i1 < 10) ? '0' + i1 : 'A' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -761,14 +762,14 @@ KERNEL_FQ void m08700_s04 (KERN_ATTR_VECTOR ())
    * sbox
    */
 
-  LOCAL_AS u32 s_lotus_magic_table[256];
+  LOCAL_VK u32 s_lotus_magic_table[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
     s_lotus_magic_table[i] = lotus_magic_table[i];
   }
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -779,7 +780,7 @@ KERNEL_FQ void m08700_s04 (KERN_ATTR_VECTOR ())
                  | ((i1 < 10) ? '0' + i1 : 'A' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -829,14 +830,14 @@ KERNEL_FQ void m08700_s08 (KERN_ATTR_VECTOR ())
    * sbox
    */
 
-  LOCAL_AS u32 s_lotus_magic_table[256];
+  LOCAL_VK u32 s_lotus_magic_table[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
     s_lotus_magic_table[i] = lotus_magic_table[i];
   }
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -847,7 +848,7 @@ KERNEL_FQ void m08700_s08 (KERN_ATTR_VECTOR ())
                  | ((i1 < 10) ? '0' + i1 : 'A' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -897,14 +898,14 @@ KERNEL_FQ void m08700_s16 (KERN_ATTR_VECTOR ())
    * sbox
    */
 
-  LOCAL_AS u32 s_lotus_magic_table[256];
+  LOCAL_VK u32 s_lotus_magic_table[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
     s_lotus_magic_table[i] = lotus_magic_table[i];
   }
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -915,7 +916,7 @@ KERNEL_FQ void m08700_s16 (KERN_ATTR_VECTOR ())
                  | ((i1 < 10) ? '0' + i1 : 'A' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m08800-pure.cl b/OpenCL/m08800-pure.cl
index c8e5381e6..faf8d729c 100644
--- a/OpenCL/m08800-pure.cl
+++ b/OpenCL/m08800-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
@@ -236,17 +237,17 @@ KERNEL_FQ void m08800_comp (KERN_ATTR_TMPS_ESALT (androidfde_tmp_t, androidfde_t
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -263,7 +264,7 @@ KERNEL_FQ void m08800_comp (KERN_ATTR_TMPS_ESALT (androidfde_tmp_t, androidfde_t
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m08900-pure.cl b/OpenCL/m08900-pure.cl
index dfec8b535..c5242d18e 100644
--- a/OpenCL/m08900-pure.cl
+++ b/OpenCL/m08900-pure.cl
@@ -6,6 +6,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_hash_sha256.cl"
 #endif
@@ -23,6 +24,23 @@ typedef struct
 
 } scrypt_tmp_t;
 
+#ifdef IS_CUDA
+
+inline __device__ uint4 operator &  (const uint4  a, const u32   b) { return make_uint4 ((a.x &  b  ), (a.y &  b  ), (a.z &  b  ), (a.w &  b  ));  }
+inline __device__ uint4 operator << (const uint4  a, const u32   b) { return make_uint4 ((a.x << b  ), (a.y << b  ), (a.z << b  ), (a.w << b  ));  }
+inline __device__ uint4 operator >> (const uint4  a, const u32   b) { return make_uint4 ((a.x >> b  ), (a.y >> b  ), (a.z >> b  ), (a.w >> b  ));  }
+inline __device__ uint4 operator +  (const uint4  a, const uint4 b) { return make_uint4 ((a.x +  b.x), (a.y +  b.y), (a.z +  b.z), (a.w +  b.w));  }
+inline __device__ uint4 operator ^  (const uint4  a, const uint4 b) { return make_uint4 ((a.x ^  b.x), (a.y ^  b.y), (a.z ^  b.z), (a.w ^  b.w));  }
+inline __device__ uint4 operator |  (const uint4  a, const uint4 b) { return make_uint4 ((a.x |  b.x), (a.y |  b.y), (a.z |  b.z), (a.w |  b.w));  }
+inline __device__ uint4 operator ^= (      uint4 &a, const uint4 b) {                     a.x ^= b.x;   a.y ^= b.y;   a.z ^= b.z;   a.w ^= b.w;    }
+
+inline __device__ uint4 rotate (const uint4 a, const int n)
+{
+  return ((a << n) | ((a >> (32 - n))));
+}
+
+#endif
+
 DECLSPEC uint4 hc_swap32_4 (uint4 v)
 {
   return (rotate ((v & 0x00FF00FF), 24u) | rotate ((v & 0xFF00FF00),  8u));
@@ -39,26 +57,50 @@ DECLSPEC uint4 hc_swap32_4 (uint4 v)
 
 #define ADD_ROTATE_XOR(r,i1,i2,s) (r) ^= rotate ((i1) + (i2), (s));
 
-#define SALSA20_2R()                \
-{                                   \
-  ADD_ROTATE_XOR (X1, X0, X3,  7);  \
-  ADD_ROTATE_XOR (X2, X1, X0,  9);  \
-  ADD_ROTATE_XOR (X3, X2, X1, 13);  \
-  ADD_ROTATE_XOR (X0, X3, X2, 18);  \
-                                    \
-  X1 = X1.s3012;                    \
-  X2 = X2.s2301;                    \
-  X3 = X3.s1230;                    \
-                                    \
-  ADD_ROTATE_XOR (X3, X0, X1,  7);  \
-  ADD_ROTATE_XOR (X2, X3, X0,  9);  \
-  ADD_ROTATE_XOR (X1, X2, X3, 13);  \
-  ADD_ROTATE_XOR (X0, X1, X2, 18);  \
-                                    \
-  X1 = X1.s1230;                    \
-  X2 = X2.s2301;                    \
-  X3 = X3.s3012;                    \
+#ifdef IS_CUDA
+
+#define SALSA20_2R()                        \
+{                                           \
+  ADD_ROTATE_XOR (X1, X0, X3,  7);          \
+  ADD_ROTATE_XOR (X2, X1, X0,  9);          \
+  ADD_ROTATE_XOR (X3, X2, X1, 13);          \
+  ADD_ROTATE_XOR (X0, X3, X2, 18);          \
+                                            \
+  X1 = make_uint4 (X1.w, X1.x, X1.y, X1.z); \
+  X2 = make_uint4 (X2.z, X2.w, X2.x, X2.y); \
+  X3 = make_uint4 (X3.y, X3.z, X3.w, X3.x); \
+                                            \
+  ADD_ROTATE_XOR (X3, X0, X1,  7);          \
+  ADD_ROTATE_XOR (X2, X3, X0,  9);          \
+  ADD_ROTATE_XOR (X1, X2, X3, 13);          \
+  ADD_ROTATE_XOR (X0, X1, X2, 18);          \
+                                            \
+  X1 = make_uint4 (X1.y, X1.z, X1.w, X1.x); \
+  X2 = make_uint4 (X2.z, X2.w, X2.x, X2.y); \
+  X3 = make_uint4 (X3.w, X3.x, X3.y, X3.z); \
 }
+#else
+#define SALSA20_2R()                        \
+{                                           \
+  ADD_ROTATE_XOR (X1, X0, X3,  7);          \
+  ADD_ROTATE_XOR (X2, X1, X0,  9);          \
+  ADD_ROTATE_XOR (X3, X2, X1, 13);          \
+  ADD_ROTATE_XOR (X0, X3, X2, 18);          \
+                                            \
+  X1 = X1.s3012;                            \
+  X2 = X2.s2301;                            \
+  X3 = X3.s1230;                            \
+                                            \
+  ADD_ROTATE_XOR (X3, X0, X1,  7);          \
+  ADD_ROTATE_XOR (X2, X3, X0,  9);          \
+  ADD_ROTATE_XOR (X1, X2, X3, 13);          \
+  ADD_ROTATE_XOR (X0, X1, X2, 18);          \
+                                            \
+  X1 = X1.s1230;                            \
+  X2 = X2.s2301;                            \
+  X3 = X3.s3012;                            \
+}
+#endif
 
 #define SALSA20_8_XOR() \
 {                       \
@@ -163,10 +205,17 @@ DECLSPEC void scrypt_smix (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS ui
   #endif
   for (u32 i = 0; i < STATE_CNT4; i += 4)
   {
+    #ifdef IS_CUDA
+    T[0] = make_uint4 (X[i + 0].x, X[i + 1].y, X[i + 2].z, X[i + 3].w);
+    T[1] = make_uint4 (X[i + 1].x, X[i + 2].y, X[i + 3].z, X[i + 0].w);
+    T[2] = make_uint4 (X[i + 2].x, X[i + 3].y, X[i + 0].z, X[i + 1].w);
+    T[3] = make_uint4 (X[i + 3].x, X[i + 0].y, X[i + 1].z, X[i + 2].w);
+    #else
     T[0] = (uint4) (X[i + 0].x, X[i + 1].y, X[i + 2].z, X[i + 3].w);
     T[1] = (uint4) (X[i + 1].x, X[i + 2].y, X[i + 3].z, X[i + 0].w);
     T[2] = (uint4) (X[i + 2].x, X[i + 3].y, X[i + 0].z, X[i + 1].w);
     T[3] = (uint4) (X[i + 3].x, X[i + 0].y, X[i + 1].z, X[i + 2].w);
+    #endif
 
     X[i + 0] = T[0];
     X[i + 1] = T[1];
@@ -203,10 +252,17 @@ DECLSPEC void scrypt_smix (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS ui
   #endif
   for (u32 i = 0; i < STATE_CNT4; i += 4)
   {
+    #ifdef IS_CUDA
+    T[0] = make_uint4 (X[i + 0].x, X[i + 3].y, X[i + 2].z, X[i + 1].w);
+    T[1] = make_uint4 (X[i + 1].x, X[i + 0].y, X[i + 3].z, X[i + 2].w);
+    T[2] = make_uint4 (X[i + 2].x, X[i + 1].y, X[i + 0].z, X[i + 3].w);
+    T[3] = make_uint4 (X[i + 3].x, X[i + 2].y, X[i + 1].z, X[i + 0].w);
+    #else
     T[0] = (uint4) (X[i + 0].x, X[i + 3].y, X[i + 2].z, X[i + 1].w);
     T[1] = (uint4) (X[i + 1].x, X[i + 0].y, X[i + 3].z, X[i + 2].w);
     T[2] = (uint4) (X[i + 2].x, X[i + 1].y, X[i + 0].z, X[i + 3].w);
     T[3] = (uint4) (X[i + 3].x, X[i + 2].y, X[i + 1].z, X[i + 0].w);
+    #endif
 
     X[i + 0] = T[0];
     X[i + 1] = T[1];
@@ -272,8 +328,13 @@ KERNEL_FQ void m08900_init (KERN_ATTR_TMPS (scrypt_tmp_t))
     digest[6] = sha256_hmac_ctx2.opad.h[6];
     digest[7] = sha256_hmac_ctx2.opad.h[7];
 
+    #ifdef IS_CUDA
+    const uint4 tmp0 = make_uint4 (digest[0], digest[1], digest[2], digest[3]);
+    const uint4 tmp1 = make_uint4 (digest[4], digest[5], digest[6], digest[7]);
+    #else
     const uint4 tmp0 = (uint4) (digest[0], digest[1], digest[2], digest[3]);
     const uint4 tmp1 = (uint4) (digest[4], digest[5], digest[6], digest[7]);
+    #endif
 
     tmps[gid].P[k + 0] = tmp0;
     tmps[gid].P[k + 1] = tmp1;
@@ -286,10 +347,10 @@ KERNEL_FQ void m08900_loop (KERN_ATTR_TMPS (scrypt_tmp_t))
 
   if (gid >= gid_max) return;
 
-  GLOBAL_AS uint4 *d_scrypt0_buf = d_extra0_buf;
-  GLOBAL_AS uint4 *d_scrypt1_buf = d_extra1_buf;
-  GLOBAL_AS uint4 *d_scrypt2_buf = d_extra2_buf;
-  GLOBAL_AS uint4 *d_scrypt3_buf = d_extra3_buf;
+  GLOBAL_AS uint4 *d_scrypt0_buf = (GLOBAL_AS uint4 *) d_extra0_buf;
+  GLOBAL_AS uint4 *d_scrypt1_buf = (GLOBAL_AS uint4 *) d_extra1_buf;
+  GLOBAL_AS uint4 *d_scrypt2_buf = (GLOBAL_AS uint4 *) d_extra2_buf;
+  GLOBAL_AS uint4 *d_scrypt3_buf = (GLOBAL_AS uint4 *) d_extra3_buf;
 
   uint4 X[STATE_CNT4];
   uint4 T[STATE_CNT4];
@@ -348,31 +409,31 @@ KERNEL_FQ void m08900_comp (KERN_ATTR_TMPS (scrypt_tmp_t))
 
     tmp = tmps[gid].P[l + 0];
 
-    w0[0] = tmp.s0;
-    w0[1] = tmp.s1;
-    w0[2] = tmp.s2;
-    w0[3] = tmp.s3;
+    w0[0] = tmp.x;
+    w0[1] = tmp.y;
+    w0[2] = tmp.z;
+    w0[3] = tmp.w;
 
     tmp = tmps[gid].P[l + 1];
 
-    w1[0] = tmp.s0;
-    w1[1] = tmp.s1;
-    w1[2] = tmp.s2;
-    w1[3] = tmp.s3;
+    w1[0] = tmp.x;
+    w1[1] = tmp.y;
+    w1[2] = tmp.z;
+    w1[3] = tmp.w;
 
     tmp = tmps[gid].P[l + 2];
 
-    w2[0] = tmp.s0;
-    w2[1] = tmp.s1;
-    w2[2] = tmp.s2;
-    w2[3] = tmp.s3;
+    w2[0] = tmp.x;
+    w2[1] = tmp.y;
+    w2[2] = tmp.z;
+    w2[3] = tmp.w;
 
     tmp = tmps[gid].P[l + 3];
 
-    w3[0] = tmp.s0;
-    w3[1] = tmp.s1;
-    w3[2] = tmp.s2;
-    w3[3] = tmp.s3;
+    w3[0] = tmp.x;
+    w3[1] = tmp.y;
+    w3[2] = tmp.z;
+    w3[3] = tmp.w;
 
     sha256_hmac_update_64 (&ctx, w0, w1, w2, w3, 64);
   }
diff --git a/OpenCL/m09000-pure.cl b/OpenCL/m09000-pure.cl
index e9bce68b7..591cdc0cf 100644
--- a/OpenCL/m09000-pure.cl
+++ b/OpenCL/m09000-pure.cl
@@ -6,6 +6,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_hash_sha1.cl"
 #endif
@@ -28,7 +29,7 @@ typedef struct pwsafe2_tmp
 
 // http://www.schneier.com/code/constants.txt
 
-CONSTANT_AS u32a c_sbox0[256] =
+CONSTANT_VK u32a c_sbox0[256] =
 {
   0xd1310ba6, 0x98dfb5ac, 0x2ffd72db, 0xd01adfb7,
   0xb8e1afed, 0x6a267e96, 0xba7c9045, 0xf12c7f99,
@@ -96,7 +97,7 @@ CONSTANT_AS u32a c_sbox0[256] =
   0x53b02d5d, 0xa99f8fa1, 0x08ba4799, 0x6e85076a
 };
 
-CONSTANT_AS u32a c_sbox1[256] =
+CONSTANT_VK u32a c_sbox1[256] =
 {
   0x4b7a70e9, 0xb5b32944, 0xdb75092e, 0xc4192623,
   0xad6ea6b0, 0x49a7df7d, 0x9cee60b8, 0x8fedb266,
@@ -164,7 +165,7 @@ CONSTANT_AS u32a c_sbox1[256] =
   0x153e21e7, 0x8fb03d4a, 0xe6e39f2b, 0xdb83adf7
 };
 
-CONSTANT_AS u32a c_sbox2[256] =
+CONSTANT_VK u32a c_sbox2[256] =
 {
   0xe93d5a68, 0x948140f7, 0xf64c261c, 0x94692934,
   0x411520f7, 0x7602d4f7, 0xbcf46b2e, 0xd4a20068,
@@ -232,7 +233,7 @@ CONSTANT_AS u32a c_sbox2[256] =
   0xd79a3234, 0x92638212, 0x670efa8e, 0x406000e0
 };
 
-CONSTANT_AS u32a c_sbox3[256] =
+CONSTANT_VK u32a c_sbox3[256] =
 {
   0x3a39ce37, 0xd3faf5cf, 0xabc27737, 0x5ac52d1b,
   0x5cb0679e, 0x4fa33742, 0xd3822740, 0x99bc9bbe,
@@ -300,7 +301,7 @@ CONSTANT_AS u32a c_sbox3[256] =
   0xb74e6132, 0xce77e25b, 0x578fdfe3, 0x3ac372e6
 };
 
-CONSTANT_AS u32a c_pbox[18] =
+CONSTANT_VK u32a c_pbox[18] =
 {
   0x243f6a88, 0x85a308d3, 0x13198a2e, 0x03707344,
   0xa4093822, 0x299f31d0, 0x082efa98, 0xec4e6c89,
@@ -470,10 +471,10 @@ KERNEL_FQ void __attribute__((reqd_work_group_size(FIXED_LOCAL_SIZE, 1, 1))) m09
     P[i] = c_pbox[i];
   }
 
-  LOCAL_AS u32 S0_all[FIXED_LOCAL_SIZE][256];
-  LOCAL_AS u32 S1_all[FIXED_LOCAL_SIZE][256];
-  LOCAL_AS u32 S2_all[FIXED_LOCAL_SIZE][256];
-  LOCAL_AS u32 S3_all[FIXED_LOCAL_SIZE][256];
+  LOCAL_VK u32 S0_all[FIXED_LOCAL_SIZE][256];
+  LOCAL_VK u32 S1_all[FIXED_LOCAL_SIZE][256];
+  LOCAL_VK u32 S2_all[FIXED_LOCAL_SIZE][256];
+  LOCAL_VK u32 S3_all[FIXED_LOCAL_SIZE][256];
 
   LOCAL_AS u32 *S0 = S0_all[lid];
   LOCAL_AS u32 *S1 = S1_all[lid];
@@ -601,10 +602,10 @@ KERNEL_FQ void __attribute__((reqd_work_group_size(FIXED_LOCAL_SIZE, 1, 1))) m09
     P[i] = tmps[gid].P[i];
   }
 
-  LOCAL_AS u32 S0_all[FIXED_LOCAL_SIZE][256];
-  LOCAL_AS u32 S1_all[FIXED_LOCAL_SIZE][256];
-  LOCAL_AS u32 S2_all[FIXED_LOCAL_SIZE][256];
-  LOCAL_AS u32 S3_all[FIXED_LOCAL_SIZE][256];
+  LOCAL_VK u32 S0_all[FIXED_LOCAL_SIZE][256];
+  LOCAL_VK u32 S1_all[FIXED_LOCAL_SIZE][256];
+  LOCAL_VK u32 S2_all[FIXED_LOCAL_SIZE][256];
+  LOCAL_VK u32 S3_all[FIXED_LOCAL_SIZE][256];
 
   LOCAL_AS u32 *S0 = S0_all[lid];
   LOCAL_AS u32 *S1 = S1_all[lid];
diff --git a/OpenCL/m09100-pure.cl b/OpenCL/m09100-pure.cl
index a0a3cd7d4..8ccc65020 100644
--- a/OpenCL/m09100-pure.cl
+++ b/OpenCL/m09100-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
@@ -26,7 +27,7 @@ typedef struct lotus8_tmp
 
 } lotus8_tmp_t;
 
-CONSTANT_AS u32a lotus64_table[64] =
+CONSTANT_VK u32a lotus64_table[64] =
 {
   '0', '1', '2', '3', '4', '5', '6', '7',
   '8', '9', 'A', 'B', 'C', 'D', 'E', 'F',
@@ -38,7 +39,7 @@ CONSTANT_AS u32a lotus64_table[64] =
   'u', 'v', 'w', 'x', 'y', 'z', '+', '/',
 };
 
-CONSTANT_AS u32a lotus_magic_table[256] =
+CONSTANT_VK u32a lotus_magic_table[256] =
 {
   0xbd, 0x56, 0xea, 0xf2, 0xa2, 0xf1, 0xac, 0x2a,
   0xb0, 0x93, 0xd1, 0x9c, 0x1b, 0x33, 0xfd, 0xd0,
@@ -295,13 +296,6 @@ DECLSPEC void base64_encode (u8 *base64_hash, const u32 len, const u8 *base64_pl
 
 DECLSPEC void lotus6_base64_encode (u8 *base64_hash, const u32 salt0, const u32 salt1, const u32 a, const u32 b, const u32 c)
 {
-  const uchar4 salt0c = as_uchar4 (salt0);
-  const uchar4 salt1c = as_uchar4 (salt1);
-
-  const uchar4 ac = as_uchar4 (a);
-  const uchar4 bc = as_uchar4 (b);
-  const uchar4 cc = as_uchar4 (c);
-
   u8 tmp[24]; // size 22 (=pw_len) is needed but base64 needs size divisible by 4
 
   /*
@@ -310,23 +304,23 @@ DECLSPEC void lotus6_base64_encode (u8 *base64_hash, const u32 salt0, const u32
 
   u8 base64_plain[16];
 
-  base64_plain[ 0] = salt0c.s0;
-  base64_plain[ 1] = salt0c.s1;
-  base64_plain[ 2] = salt0c.s2;
-  base64_plain[ 3] = salt0c.s3;
+  base64_plain[ 0] = unpack_v8a_from_v32_S (salt0);
+  base64_plain[ 1] = unpack_v8b_from_v32_S (salt0);
+  base64_plain[ 2] = unpack_v8c_from_v32_S (salt0);
+  base64_plain[ 3] = unpack_v8d_from_v32_S (salt0);
   base64_plain[ 3] -= -4; // dont ask!
-  base64_plain[ 4] = salt1c.s0;
-  base64_plain[ 5] = ac.s0;
-  base64_plain[ 6] = ac.s1;
-  base64_plain[ 7] = ac.s2;
-  base64_plain[ 8] = ac.s3;
-  base64_plain[ 9] = bc.s0;
-  base64_plain[10] = bc.s1;
-  base64_plain[11] = bc.s2;
-  base64_plain[12] = bc.s3;
-  base64_plain[13] = cc.s0;
-  base64_plain[14] = cc.s1;
-  base64_plain[15] = cc.s2;
+  base64_plain[ 4] = unpack_v8a_from_v32_S (salt1);
+  base64_plain[ 5] = unpack_v8a_from_v32_S (a);
+  base64_plain[ 6] = unpack_v8b_from_v32_S (a);
+  base64_plain[ 7] = unpack_v8c_from_v32_S (a);
+  base64_plain[ 8] = unpack_v8d_from_v32_S (a);
+  base64_plain[ 9] = unpack_v8a_from_v32_S (b);
+  base64_plain[10] = unpack_v8b_from_v32_S (b);
+  base64_plain[11] = unpack_v8c_from_v32_S (b);
+  base64_plain[12] = unpack_v8d_from_v32_S (b);
+  base64_plain[13] = unpack_v8a_from_v32_S (c);
+  base64_plain[14] = unpack_v8b_from_v32_S (c);
+  base64_plain[15] = unpack_v8c_from_v32_S (c);
 
   /*
    * base64 encode the $salt.$digest string
@@ -408,14 +402,14 @@ KERNEL_FQ void m09100_init (KERN_ATTR_TMPS (lotus8_tmp_t))
    * sbox
    */
 
-  LOCAL_AS u32 s_lotus_magic_table[256];
+  LOCAL_VK u32 s_lotus_magic_table[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
     s_lotus_magic_table[i] = lotus_magic_table[i];
   }
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -426,7 +420,7 @@ KERNEL_FQ void m09100_init (KERN_ATTR_TMPS (lotus8_tmp_t))
                  | ((i1 < 10) ? '0' + i1 : 'A' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m09400-pure.cl b/OpenCL/m09400-pure.cl
index 99747349f..2ee3a6844 100644
--- a/OpenCL/m09400-pure.cl
+++ b/OpenCL/m09400-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
@@ -138,17 +139,17 @@ KERNEL_FQ void m09400_comp (KERN_ATTR_TMPS_ESALT (office2007_tmp_t, office2007_t
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -165,7 +166,7 @@ KERNEL_FQ void m09400_comp (KERN_ATTR_TMPS_ESALT (office2007_tmp_t, office2007_t
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m09500-pure.cl b/OpenCL/m09500-pure.cl
index ddcbc5cb9..b2ebe70d9 100644
--- a/OpenCL/m09500-pure.cl
+++ b/OpenCL/m09500-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
@@ -136,17 +137,17 @@ KERNEL_FQ void m09500_comp (KERN_ATTR_TMPS_ESALT (office2010_tmp_t, office2010_t
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -163,7 +164,7 @@ KERNEL_FQ void m09500_comp (KERN_ATTR_TMPS_ESALT (office2010_tmp_t, office2010_t
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m09600-pure.cl b/OpenCL/m09600-pure.cl
index c4d0332ee..dd1520040 100644
--- a/OpenCL/m09600-pure.cl
+++ b/OpenCL/m09600-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha512.cl"
@@ -182,17 +183,17 @@ KERNEL_FQ void m09600_comp (KERN_ATTR_TMPS_ESALT (office2013_tmp_t, office2013_t
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -209,7 +210,7 @@ KERNEL_FQ void m09600_comp (KERN_ATTR_TMPS_ESALT (office2013_tmp_t, office2013_t
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m09700_a0-optimized.cl b/OpenCL/m09700_a0-optimized.cl
index 7a5adff36..dbbbf4b55 100644
--- a/OpenCL/m09700_a0-optimized.cl
+++ b/OpenCL/m09700_a0-optimized.cl
@@ -9,6 +9,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
@@ -526,7 +527,7 @@ KERNEL_FQ void m09700_m04 (KERN_ATTR_RULES_ESALT (oldoffice01_t))
    * shared
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   LOCAL_AS RC4_KEY *rc4_key = &rc4_keys[lid];
 
@@ -713,7 +714,7 @@ KERNEL_FQ void m09700_s04 (KERN_ATTR_RULES_ESALT (oldoffice01_t))
    * shared
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   LOCAL_AS RC4_KEY *rc4_key = &rc4_keys[lid];
 
diff --git a/OpenCL/m09700_a1-optimized.cl b/OpenCL/m09700_a1-optimized.cl
index cfef5f69f..1c3a59809 100644
--- a/OpenCL/m09700_a1-optimized.cl
+++ b/OpenCL/m09700_a1-optimized.cl
@@ -9,6 +9,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
@@ -524,7 +525,7 @@ KERNEL_FQ void m09700_m04 (KERN_ATTR_ESALT (oldoffice01_t))
    * shared
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   LOCAL_AS RC4_KEY *rc4_key = &rc4_keys[lid];
 
@@ -769,7 +770,7 @@ KERNEL_FQ void m09700_s04 (KERN_ATTR_ESALT (oldoffice01_t))
    * shared
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   LOCAL_AS RC4_KEY *rc4_key = &rc4_keys[lid];
 
diff --git a/OpenCL/m09700_a3-optimized.cl b/OpenCL/m09700_a3-optimized.cl
index 1d75f0d44..032b6f5eb 100644
--- a/OpenCL/m09700_a3-optimized.cl
+++ b/OpenCL/m09700_a3-optimized.cl
@@ -6,6 +6,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
@@ -1011,7 +1012,7 @@ KERNEL_FQ void m09700_m04 (KERN_ATTR_ESALT (oldoffice01_t))
    * main
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   m09700m (rc4_keys, w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, combs_mode, gid_max);
 }
@@ -1060,7 +1061,7 @@ KERNEL_FQ void m09700_m08 (KERN_ATTR_ESALT (oldoffice01_t))
    * main
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   m09700m (rc4_keys, w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, combs_mode, gid_max);
 }
@@ -1109,7 +1110,7 @@ KERNEL_FQ void m09700_m16 (KERN_ATTR_ESALT (oldoffice01_t))
    * main
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   m09700m (rc4_keys, w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, combs_mode, gid_max);
 }
@@ -1158,7 +1159,7 @@ KERNEL_FQ void m09700_s04 (KERN_ATTR_ESALT (oldoffice01_t))
    * main
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   m09700s (rc4_keys, w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, combs_mode, gid_max);
 }
@@ -1207,7 +1208,7 @@ KERNEL_FQ void m09700_s08 (KERN_ATTR_ESALT (oldoffice01_t))
    * main
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   m09700s (rc4_keys, w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, combs_mode, gid_max);
 }
@@ -1256,7 +1257,7 @@ KERNEL_FQ void m09700_s16 (KERN_ATTR_ESALT (oldoffice01_t))
    * main
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   m09700s (rc4_keys, w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, combs_mode, gid_max);
 }
diff --git a/OpenCL/m09710_a0-optimized.cl b/OpenCL/m09710_a0-optimized.cl
index 6caaee208..4359393f5 100644
--- a/OpenCL/m09710_a0-optimized.cl
+++ b/OpenCL/m09710_a0-optimized.cl
@@ -9,6 +9,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
@@ -182,7 +183,7 @@ KERNEL_FQ void m09710_m04 (KERN_ATTR_RULES_ESALT (oldoffice01_t))
    * shared
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   LOCAL_AS RC4_KEY *rc4_key = &rc4_keys[lid];
 
@@ -320,7 +321,7 @@ KERNEL_FQ void m09710_s04 (KERN_ATTR_RULES_ESALT (oldoffice01_t))
    * shared
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   LOCAL_AS RC4_KEY *rc4_key = &rc4_keys[lid];
 
diff --git a/OpenCL/m09710_a1-optimized.cl b/OpenCL/m09710_a1-optimized.cl
index 7d55a52a1..fa905ed7c 100644
--- a/OpenCL/m09710_a1-optimized.cl
+++ b/OpenCL/m09710_a1-optimized.cl
@@ -9,6 +9,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
@@ -180,7 +181,7 @@ KERNEL_FQ void m09710_m04 (KERN_ATTR_ESALT (oldoffice01_t))
    * shared
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   LOCAL_AS RC4_KEY *rc4_key = &rc4_keys[lid];
 
@@ -364,7 +365,7 @@ KERNEL_FQ void m09710_s04 (KERN_ATTR_ESALT (oldoffice01_t))
    * shared
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   LOCAL_AS RC4_KEY *rc4_key = &rc4_keys[lid];
 
diff --git a/OpenCL/m09710_a3-optimized.cl b/OpenCL/m09710_a3-optimized.cl
index bc3b3127a..ae9bd0383 100644
--- a/OpenCL/m09710_a3-optimized.cl
+++ b/OpenCL/m09710_a3-optimized.cl
@@ -9,6 +9,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
@@ -420,7 +421,7 @@ KERNEL_FQ void m09710_m04 (KERN_ATTR_ESALT (oldoffice01_t))
    * main
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   m09710m (rc4_keys, w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, combs_mode, gid_max);
 }
@@ -477,7 +478,7 @@ KERNEL_FQ void m09710_s04 (KERN_ATTR_ESALT (oldoffice01_t))
    * main
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   m09710s (rc4_keys, w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, combs_mode, gid_max);
 }
diff --git a/OpenCL/m09720_a0-optimized.cl b/OpenCL/m09720_a0-optimized.cl
index f304a0dfb..e661620af 100644
--- a/OpenCL/m09720_a0-optimized.cl
+++ b/OpenCL/m09720_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
diff --git a/OpenCL/m09720_a1-optimized.cl b/OpenCL/m09720_a1-optimized.cl
index a1a58e9a5..d1eabb20c 100644
--- a/OpenCL/m09720_a1-optimized.cl
+++ b/OpenCL/m09720_a1-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
diff --git a/OpenCL/m09720_a3-optimized.cl b/OpenCL/m09720_a3-optimized.cl
index 57c39df18..1a8663d33 100644
--- a/OpenCL/m09720_a3-optimized.cl
+++ b/OpenCL/m09720_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
diff --git a/OpenCL/m09800_a0-optimized.cl b/OpenCL/m09800_a0-optimized.cl
index 0ab96faab..ffa5c1388 100644
--- a/OpenCL/m09800_a0-optimized.cl
+++ b/OpenCL/m09800_a0-optimized.cl
@@ -9,6 +9,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
@@ -182,7 +183,7 @@ KERNEL_FQ void m09800_m04 (KERN_ATTR_RULES_ESALT (oldoffice34_t))
    * shared
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   LOCAL_AS RC4_KEY *rc4_key = &rc4_keys[lid];
 
@@ -382,7 +383,7 @@ KERNEL_FQ void m09800_s04 (KERN_ATTR_RULES_ESALT (oldoffice34_t))
    * shared
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   LOCAL_AS RC4_KEY *rc4_key = &rc4_keys[lid];
 
diff --git a/OpenCL/m09800_a1-optimized.cl b/OpenCL/m09800_a1-optimized.cl
index 278684d0a..07bfb15ca 100644
--- a/OpenCL/m09800_a1-optimized.cl
+++ b/OpenCL/m09800_a1-optimized.cl
@@ -9,6 +9,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
@@ -180,7 +181,7 @@ KERNEL_FQ void m09800_m04 (KERN_ATTR_ESALT (oldoffice34_t))
    * shared
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   LOCAL_AS RC4_KEY *rc4_key = &rc4_keys[lid];
 
@@ -430,7 +431,7 @@ KERNEL_FQ void m09800_s04 (KERN_ATTR_ESALT (oldoffice34_t))
    * shared
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   LOCAL_AS RC4_KEY *rc4_key = &rc4_keys[lid];
 
diff --git a/OpenCL/m09800_a3-optimized.cl b/OpenCL/m09800_a3-optimized.cl
index 1ab9a6d9a..303b177a8 100644
--- a/OpenCL/m09800_a3-optimized.cl
+++ b/OpenCL/m09800_a3-optimized.cl
@@ -6,6 +6,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
@@ -535,7 +536,7 @@ KERNEL_FQ void m09800_m04 (KERN_ATTR_ESALT (oldoffice34_t))
    * main
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   m09800m (rc4_keys, w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, combs_mode, gid_max);
 }
@@ -584,7 +585,7 @@ KERNEL_FQ void m09800_m08 (KERN_ATTR_ESALT (oldoffice34_t))
    * main
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   m09800m (rc4_keys, w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, combs_mode, gid_max);
 }
@@ -633,7 +634,7 @@ KERNEL_FQ void m09800_m16 (KERN_ATTR_ESALT (oldoffice34_t))
    * main
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   m09800m (rc4_keys, w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, combs_mode, gid_max);
 }
@@ -682,7 +683,7 @@ KERNEL_FQ void m09800_s04 (KERN_ATTR_ESALT (oldoffice34_t))
    * main
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   m09800s (rc4_keys, w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, combs_mode, gid_max);
 }
@@ -731,7 +732,7 @@ KERNEL_FQ void m09800_s08 (KERN_ATTR_ESALT (oldoffice34_t))
    * main
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   m09800s (rc4_keys, w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, combs_mode, gid_max);
 }
@@ -780,7 +781,7 @@ KERNEL_FQ void m09800_s16 (KERN_ATTR_ESALT (oldoffice34_t))
    * main
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   m09800s (rc4_keys, w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, combs_mode, gid_max);
 }
diff --git a/OpenCL/m09810_a0-optimized.cl b/OpenCL/m09810_a0-optimized.cl
index 69e637264..5db2264a6 100644
--- a/OpenCL/m09810_a0-optimized.cl
+++ b/OpenCL/m09810_a0-optimized.cl
@@ -9,6 +9,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
@@ -182,7 +183,7 @@ KERNEL_FQ void m09810_m04 (KERN_ATTR_RULES_ESALT (oldoffice34_t))
    * shared
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   LOCAL_AS RC4_KEY *rc4_key = &rc4_keys[lid];
 
@@ -307,7 +308,7 @@ KERNEL_FQ void m09810_s04 (KERN_ATTR_RULES_ESALT (oldoffice34_t))
    * shared
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   LOCAL_AS RC4_KEY *rc4_key = &rc4_keys[lid];
 
diff --git a/OpenCL/m09810_a1-optimized.cl b/OpenCL/m09810_a1-optimized.cl
index 685395dad..69ecab5c3 100644
--- a/OpenCL/m09810_a1-optimized.cl
+++ b/OpenCL/m09810_a1-optimized.cl
@@ -9,6 +9,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
@@ -180,7 +181,7 @@ KERNEL_FQ void m09810_m04 (KERN_ATTR_ESALT (oldoffice34_t))
    * shared
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   LOCAL_AS RC4_KEY *rc4_key = &rc4_keys[lid];
 
@@ -351,7 +352,7 @@ KERNEL_FQ void m09810_s04 (KERN_ATTR_ESALT (oldoffice34_t))
    * shared
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   LOCAL_AS RC4_KEY *rc4_key = &rc4_keys[lid];
 
diff --git a/OpenCL/m09810_a3-optimized.cl b/OpenCL/m09810_a3-optimized.cl
index 0eff00807..495adc734 100644
--- a/OpenCL/m09810_a3-optimized.cl
+++ b/OpenCL/m09810_a3-optimized.cl
@@ -9,6 +9,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
@@ -390,7 +391,7 @@ KERNEL_FQ void m09810_m04 (KERN_ATTR_ESALT (oldoffice34_t))
    * main
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   m09810m (rc4_keys, w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, combs_mode, gid_max);
 }
@@ -439,7 +440,7 @@ KERNEL_FQ void m09810_m08 (KERN_ATTR_ESALT (oldoffice34_t))
    * main
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   m09810m (rc4_keys, w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, combs_mode, gid_max);
 }
@@ -488,7 +489,7 @@ KERNEL_FQ void m09810_m16 (KERN_ATTR_ESALT (oldoffice34_t))
    * main
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   m09810m (rc4_keys, w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, combs_mode, gid_max);
 }
@@ -537,7 +538,7 @@ KERNEL_FQ void m09810_s04 (KERN_ATTR_ESALT (oldoffice34_t))
    * main
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   m09810s (rc4_keys, w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, combs_mode, gid_max);
 }
@@ -586,7 +587,7 @@ KERNEL_FQ void m09810_s08 (KERN_ATTR_ESALT (oldoffice34_t))
    * main
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   m09810s (rc4_keys, w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, combs_mode, gid_max);
 }
@@ -635,7 +636,7 @@ KERNEL_FQ void m09810_s16 (KERN_ATTR_ESALT (oldoffice34_t))
    * main
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   m09810s (rc4_keys, w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, combs_mode, gid_max);
 }
diff --git a/OpenCL/m09820_a0-optimized.cl b/OpenCL/m09820_a0-optimized.cl
index cfb6bdf6c..0e9a95ca6 100644
--- a/OpenCL/m09820_a0-optimized.cl
+++ b/OpenCL/m09820_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
diff --git a/OpenCL/m09820_a1-optimized.cl b/OpenCL/m09820_a1-optimized.cl
index 584491816..028b3d28f 100644
--- a/OpenCL/m09820_a1-optimized.cl
+++ b/OpenCL/m09820_a1-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m09820_a3-optimized.cl b/OpenCL/m09820_a3-optimized.cl
index c17b11e46..cde918d0e 100644
--- a/OpenCL/m09820_a3-optimized.cl
+++ b/OpenCL/m09820_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m09900_a0-optimized.cl b/OpenCL/m09900_a0-optimized.cl
index 31db1fa4b..3c4cf10ff 100644
--- a/OpenCL/m09900_a0-optimized.cl
+++ b/OpenCL/m09900_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
diff --git a/OpenCL/m09900_a0-pure.cl b/OpenCL/m09900_a0-pure.cl
index 64976e835..04347dfc9 100644
--- a/OpenCL/m09900_a0-pure.cl
+++ b/OpenCL/m09900_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
diff --git a/OpenCL/m09900_a1-optimized.cl b/OpenCL/m09900_a1-optimized.cl
index bdac4c177..07f62d547 100644
--- a/OpenCL/m09900_a1-optimized.cl
+++ b/OpenCL/m09900_a1-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
diff --git a/OpenCL/m09900_a1-pure.cl b/OpenCL/m09900_a1-pure.cl
index 2313b9bf9..367aa5755 100644
--- a/OpenCL/m09900_a1-pure.cl
+++ b/OpenCL/m09900_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_md5.cl"
diff --git a/OpenCL/m09900_a3-optimized.cl b/OpenCL/m09900_a3-optimized.cl
index 6f56f1703..e0145b106 100644
--- a/OpenCL/m09900_a3-optimized.cl
+++ b/OpenCL/m09900_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
diff --git a/OpenCL/m09900_a3-pure.cl b/OpenCL/m09900_a3-pure.cl
index 4cad825f4..df6e1f55f 100644
--- a/OpenCL/m09900_a3-pure.cl
+++ b/OpenCL/m09900_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
diff --git a/OpenCL/m10100_a0-optimized.cl b/OpenCL/m10100_a0-optimized.cl
index 8ed585765..593fb98fe 100644
--- a/OpenCL/m10100_a0-optimized.cl
+++ b/OpenCL/m10100_a0-optimized.cl
@@ -9,6 +9,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
diff --git a/OpenCL/m10100_a1-optimized.cl b/OpenCL/m10100_a1-optimized.cl
index 61d40e2b3..b18e584ca 100644
--- a/OpenCL/m10100_a1-optimized.cl
+++ b/OpenCL/m10100_a1-optimized.cl
@@ -9,6 +9,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #endif
diff --git a/OpenCL/m10100_a3-optimized.cl b/OpenCL/m10100_a3-optimized.cl
index e45ac8f51..c3269014b 100644
--- a/OpenCL/m10100_a3-optimized.cl
+++ b/OpenCL/m10100_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #endif
diff --git a/OpenCL/m10300-pure.cl b/OpenCL/m10300-pure.cl
index 6c3b254ee..dc9b31e26 100644
--- a/OpenCL/m10300-pure.cl
+++ b/OpenCL/m10300-pure.cl
@@ -6,6 +6,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_hash_sha1.cl"
 #endif
diff --git a/OpenCL/m10400_a0-optimized.cl b/OpenCL/m10400_a0-optimized.cl
index b91746073..55332396a 100644
--- a/OpenCL/m10400_a0-optimized.cl
+++ b/OpenCL/m10400_a0-optimized.cl
@@ -9,6 +9,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
@@ -16,7 +17,7 @@
 #include "inc_hash_md5.cl"
 #endif
 
-CONSTANT_AS u32a padding[8] =
+CONSTANT_VK u32a padding[8] =
 {
   0x5e4ebf28,
   0x418a754e,
@@ -191,7 +192,7 @@ KERNEL_FQ void m10400_m04 (KERN_ATTR_RULES_ESALT (pdf_t))
    * shared
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   LOCAL_AS RC4_KEY *rc4_key = &rc4_keys[lid];
 
@@ -367,7 +368,7 @@ KERNEL_FQ void m10400_s04 (KERN_ATTR_RULES_ESALT (pdf_t))
    * shared
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   LOCAL_AS RC4_KEY *rc4_key = &rc4_keys[lid];
 
diff --git a/OpenCL/m10400_a1-optimized.cl b/OpenCL/m10400_a1-optimized.cl
index 22e1a116f..c10d184ca 100644
--- a/OpenCL/m10400_a1-optimized.cl
+++ b/OpenCL/m10400_a1-optimized.cl
@@ -9,12 +9,13 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
 #endif
 
-CONSTANT_AS u32a padding[8] =
+CONSTANT_VK u32a padding[8] =
 {
   0x5e4ebf28,
   0x418a754e,
@@ -189,7 +190,7 @@ KERNEL_FQ void m10400_m04 (KERN_ATTR_ESALT (pdf_t))
    * shared
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   LOCAL_AS RC4_KEY *rc4_key = &rc4_keys[lid];
 
@@ -425,7 +426,7 @@ KERNEL_FQ void m10400_s04 (KERN_ATTR_ESALT (pdf_t))
    * shared
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   LOCAL_AS RC4_KEY *rc4_key = &rc4_keys[lid];
 
diff --git a/OpenCL/m10400_a3-optimized.cl b/OpenCL/m10400_a3-optimized.cl
index eb6020fc4..49eaac162 100644
--- a/OpenCL/m10400_a3-optimized.cl
+++ b/OpenCL/m10400_a3-optimized.cl
@@ -9,12 +9,13 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
 #endif
 
-CONSTANT_AS u32a padding[8] =
+CONSTANT_VK u32a padding[8] =
 {
   0x5e4ebf28,
   0x418a754e,
@@ -543,7 +544,7 @@ KERNEL_FQ void m10400_m04 (KERN_ATTR_ESALT (pdf_t))
    * main
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   m10400m (rc4_keys, w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, combs_mode, gid_max);
 }
@@ -592,7 +593,7 @@ KERNEL_FQ void m10400_m08 (KERN_ATTR_ESALT (pdf_t))
    * main
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   m10400m (rc4_keys, w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, combs_mode, gid_max);
 }
@@ -641,7 +642,7 @@ KERNEL_FQ void m10400_m16 (KERN_ATTR_ESALT (pdf_t))
    * main
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   m10400m (rc4_keys, w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, combs_mode, gid_max);
 }
@@ -690,7 +691,7 @@ KERNEL_FQ void m10400_s04 (KERN_ATTR_ESALT (pdf_t))
    * main
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   m10400s (rc4_keys, w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, combs_mode, gid_max);
 }
@@ -739,7 +740,7 @@ KERNEL_FQ void m10400_s08 (KERN_ATTR_ESALT (pdf_t))
    * main
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   m10400s (rc4_keys, w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, combs_mode, gid_max);
 }
@@ -788,7 +789,7 @@ KERNEL_FQ void m10400_s16 (KERN_ATTR_ESALT (pdf_t))
    * main
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   m10400s (rc4_keys, w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, combs_mode, gid_max);
 }
diff --git a/OpenCL/m10410_a0-optimized.cl b/OpenCL/m10410_a0-optimized.cl
index 5f3212a29..d91c72621 100644
--- a/OpenCL/m10410_a0-optimized.cl
+++ b/OpenCL/m10410_a0-optimized.cl
@@ -9,6 +9,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
@@ -16,7 +17,7 @@
 #include "inc_hash_md5.cl"
 #endif
 
-CONSTANT_AS u32a padding[8] =
+CONSTANT_VK u32a padding[8] =
 {
   0x5e4ebf28,
   0x418a754e,
@@ -191,7 +192,7 @@ KERNEL_FQ void m10410_m04 (KERN_ATTR_RULES_ESALT (pdf_t))
    * shared
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
   LOCAL_AS RC4_KEY *rc4_key = &rc4_keys[lid];
 
   /**
@@ -263,7 +264,7 @@ KERNEL_FQ void m10410_s04 (KERN_ATTR_RULES_ESALT (pdf_t))
    * shared
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
   LOCAL_AS RC4_KEY *rc4_key = &rc4_keys[lid];
 
   /**
diff --git a/OpenCL/m10410_a1-optimized.cl b/OpenCL/m10410_a1-optimized.cl
index b0229a349..8e9c1fef6 100644
--- a/OpenCL/m10410_a1-optimized.cl
+++ b/OpenCL/m10410_a1-optimized.cl
@@ -9,12 +9,13 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
 #endif
 
-CONSTANT_AS u32a padding[8] =
+CONSTANT_VK u32a padding[8] =
 {
   0x5e4ebf28,
   0x418a754e,
@@ -189,7 +190,7 @@ KERNEL_FQ void m10410_m04 (KERN_ATTR_ESALT (pdf_t))
    * shared
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   LOCAL_AS RC4_KEY *rc4_key = &rc4_keys[lid];
 
@@ -331,7 +332,7 @@ KERNEL_FQ void m10410_s04 (KERN_ATTR_ESALT (pdf_t))
    * shared
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   LOCAL_AS RC4_KEY *rc4_key = &rc4_keys[lid];
 
diff --git a/OpenCL/m10410_a3-optimized.cl b/OpenCL/m10410_a3-optimized.cl
index 399672cb3..cef243a5c 100644
--- a/OpenCL/m10410_a3-optimized.cl
+++ b/OpenCL/m10410_a3-optimized.cl
@@ -9,12 +9,13 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
 #endif
 
-CONSTANT_AS u32a padding[8] =
+CONSTANT_VK u32a padding[8] =
 {
   0x5e4ebf28,
   0x418a754e,
@@ -297,7 +298,7 @@ KERNEL_FQ void m10410_m04 (KERN_ATTR_ESALT (pdf_t))
    * main
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   m10410m (rc4_keys, w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, combs_mode, gid_max);
 }
@@ -346,7 +347,7 @@ KERNEL_FQ void m10410_m08 (KERN_ATTR_ESALT (pdf_t))
    * main
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   m10410m (rc4_keys, w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, combs_mode, gid_max);
 }
@@ -395,7 +396,7 @@ KERNEL_FQ void m10410_m16 (KERN_ATTR_ESALT (pdf_t))
    * main
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   m10410m (rc4_keys, w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, combs_mode, gid_max);
 }
@@ -444,7 +445,7 @@ KERNEL_FQ void m10410_s04 (KERN_ATTR_ESALT (pdf_t))
    * main
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   m10410s (rc4_keys, w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, combs_mode, gid_max);
 }
@@ -493,7 +494,7 @@ KERNEL_FQ void m10410_s08 (KERN_ATTR_ESALT (pdf_t))
    * main
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   m10410s (rc4_keys, w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, combs_mode, gid_max);
 }
@@ -542,7 +543,7 @@ KERNEL_FQ void m10410_s16 (KERN_ATTR_ESALT (pdf_t))
    * main
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   m10410s (rc4_keys, w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, combs_mode, gid_max);
 }
diff --git a/OpenCL/m10420_a0-optimized.cl b/OpenCL/m10420_a0-optimized.cl
index d02749446..799be19d9 100644
--- a/OpenCL/m10420_a0-optimized.cl
+++ b/OpenCL/m10420_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
@@ -15,7 +16,7 @@
 #include "inc_hash_md5.cl"
 #endif
 
-CONSTANT_AS u32a padding[8] =
+CONSTANT_VK u32a padding[8] =
 {
   0x5e4ebf28,
   0x418a754e,
diff --git a/OpenCL/m10420_a1-optimized.cl b/OpenCL/m10420_a1-optimized.cl
index cbde1f143..9f9f26a7f 100644
--- a/OpenCL/m10420_a1-optimized.cl
+++ b/OpenCL/m10420_a1-optimized.cl
@@ -8,12 +8,13 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
 #endif
 
-CONSTANT_AS u32a padding[8] =
+CONSTANT_VK u32a padding[8] =
 {
   0x5e4ebf28,
   0x418a754e,
diff --git a/OpenCL/m10420_a3-optimized.cl b/OpenCL/m10420_a3-optimized.cl
index 815e59a15..7162048c0 100644
--- a/OpenCL/m10420_a3-optimized.cl
+++ b/OpenCL/m10420_a3-optimized.cl
@@ -8,12 +8,13 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
 #endif
 
-CONSTANT_AS u32a padding[8] =
+CONSTANT_VK u32a padding[8] =
 {
   0x5e4ebf28,
   0x418a754e,
diff --git a/OpenCL/m10500-pure.cl b/OpenCL/m10500-pure.cl
index 3ba4e85a4..51ee73127 100644
--- a/OpenCL/m10500-pure.cl
+++ b/OpenCL/m10500-pure.cl
@@ -6,6 +6,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_hash_md5.cl"
 #endif
@@ -13,7 +14,7 @@
 #define COMPARE_S "inc_comp_single.cl"
 #define COMPARE_M "inc_comp_multi.cl"
 
-CONSTANT_AS u32a padding[8] =
+CONSTANT_VK u32a padding[8] =
 {
   0x5e4ebf28,
   0x418a754e,
@@ -376,7 +377,7 @@ KERNEL_FQ void m10500_loop (KERN_ATTR_TMPS_ESALT (pdf14_tmp_t, pdf_t))
    * shared
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   LOCAL_AS RC4_KEY *rc4_key = &rc4_keys[lid];
 
diff --git a/OpenCL/m10700-optimized.cl b/OpenCL/m10700-optimized.cl
index 290761db2..a44924c28 100644
--- a/OpenCL/m10700-optimized.cl
+++ b/OpenCL/m10700-optimized.cl
@@ -6,6 +6,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_hash_sha256.cl"
 #include "inc_hash_sha384.cl"
@@ -593,11 +594,11 @@ KERNEL_FQ void m10700_loop (KERN_ATTR_TMPS_ESALT (pdf17l8_tmp_t, pdf_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -608,7 +609,7 @@ KERNEL_FQ void m10700_loop (KERN_ATTR_TMPS_ESALT (pdf17l8_tmp_t, pdf_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m10700-pure.cl b/OpenCL/m10700-pure.cl
index 6b9dfc686..f43c024cf 100644
--- a/OpenCL/m10700-pure.cl
+++ b/OpenCL/m10700-pure.cl
@@ -6,6 +6,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_hash_sha256.cl"
 #include "inc_hash_sha384.cl"
@@ -1194,11 +1195,11 @@ KERNEL_FQ void m10700_loop (KERN_ATTR_TMPS_ESALT (pdf17l8_tmp_t, pdf_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -1209,7 +1210,7 @@ KERNEL_FQ void m10700_loop (KERN_ATTR_TMPS_ESALT (pdf17l8_tmp_t, pdf_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m10800_a0-optimized.cl b/OpenCL/m10800_a0-optimized.cl
index aaf1a8bd1..396b389a6 100644
--- a/OpenCL/m10800_a0-optimized.cl
+++ b/OpenCL/m10800_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
@@ -85,6 +86,12 @@ DECLSPEC void sha384_transform_intern (const u32x *w0, const u32x *w1, const u32
 
   ROUND_STEP (0);
 
+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  ROUND_EXPAND (); ROUND_STEP (64);
+  #else
   #ifdef _unroll
   #pragma unroll
   #endif
@@ -92,6 +99,7 @@ DECLSPEC void sha384_transform_intern (const u32x *w0, const u32x *w1, const u32
   {
     ROUND_EXPAND (); ROUND_STEP (i);
   }
+  #endif
 
   /* rev
   digest[0] += a;
diff --git a/OpenCL/m10800_a0-pure.cl b/OpenCL/m10800_a0-pure.cl
index 023bb9b50..ec46261db 100644
--- a/OpenCL/m10800_a0-pure.cl
+++ b/OpenCL/m10800_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
diff --git a/OpenCL/m10800_a1-optimized.cl b/OpenCL/m10800_a1-optimized.cl
index 9bc1b42bd..11aa95dbd 100644
--- a/OpenCL/m10800_a1-optimized.cl
+++ b/OpenCL/m10800_a1-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha384.cl"
@@ -83,6 +84,12 @@ DECLSPEC void sha384_transform_intern (const u32x *w0, const u32x *w1, const u32
 
   ROUND_STEP (0);
 
+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  ROUND_EXPAND (); ROUND_STEP (64);
+  #else
   #ifdef _unroll
   #pragma unroll
   #endif
@@ -90,6 +97,7 @@ DECLSPEC void sha384_transform_intern (const u32x *w0, const u32x *w1, const u32
   {
     ROUND_EXPAND (); ROUND_STEP (i);
   }
+  #endif
 
   /* rev
   digest[0] += a;
diff --git a/OpenCL/m10800_a1-pure.cl b/OpenCL/m10800_a1-pure.cl
index 5a8a1c2a2..0f5b7abeb 100644
--- a/OpenCL/m10800_a1-pure.cl
+++ b/OpenCL/m10800_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_sha384.cl"
diff --git a/OpenCL/m10800_a3-optimized.cl b/OpenCL/m10800_a3-optimized.cl
index e2291000c..cef22d51f 100644
--- a/OpenCL/m10800_a3-optimized.cl
+++ b/OpenCL/m10800_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha384.cl"
@@ -83,6 +84,12 @@ DECLSPEC void sha384_transform_intern (const u32x *w0, const u32x *w1, const u32
 
   ROUND_STEP (0);
 
+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  ROUND_EXPAND (); ROUND_STEP (64);
+  #else
   #ifdef _unroll
   #pragma unroll
   #endif
@@ -90,6 +97,7 @@ DECLSPEC void sha384_transform_intern (const u32x *w0, const u32x *w1, const u32
   {
     ROUND_EXPAND (); ROUND_STEP (i);
   }
+  #endif
 
   /* rev
   digest[0] += a;
diff --git a/OpenCL/m10800_a3-pure.cl b/OpenCL/m10800_a3-pure.cl
index 940e5a61c..8c2fe7138 100644
--- a/OpenCL/m10800_a3-pure.cl
+++ b/OpenCL/m10800_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha384.cl"
diff --git a/OpenCL/m10900-pure.cl b/OpenCL/m10900-pure.cl
index 19613c2ca..b2fb8bacc 100644
--- a/OpenCL/m10900-pure.cl
+++ b/OpenCL/m10900-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha256.cl"
diff --git a/OpenCL/m11000_a0-optimized.cl b/OpenCL/m11000_a0-optimized.cl
index ba14d4fd0..b569c506b 100644
--- a/OpenCL/m11000_a0-optimized.cl
+++ b/OpenCL/m11000_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
diff --git a/OpenCL/m11000_a0-pure.cl b/OpenCL/m11000_a0-pure.cl
index 89af491eb..9e44e7173 100644
--- a/OpenCL/m11000_a0-pure.cl
+++ b/OpenCL/m11000_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
diff --git a/OpenCL/m11000_a1-optimized.cl b/OpenCL/m11000_a1-optimized.cl
index e0672c0be..d41899d96 100644
--- a/OpenCL/m11000_a1-optimized.cl
+++ b/OpenCL/m11000_a1-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
diff --git a/OpenCL/m11000_a1-pure.cl b/OpenCL/m11000_a1-pure.cl
index e5c5edb2e..c51d23374 100644
--- a/OpenCL/m11000_a1-pure.cl
+++ b/OpenCL/m11000_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_md5.cl"
diff --git a/OpenCL/m11000_a3-optimized.cl b/OpenCL/m11000_a3-optimized.cl
index 9ac1f844d..357af7a59 100644
--- a/OpenCL/m11000_a3-optimized.cl
+++ b/OpenCL/m11000_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
diff --git a/OpenCL/m11000_a3-pure.cl b/OpenCL/m11000_a3-pure.cl
index 63c36625c..00f90f8b7 100644
--- a/OpenCL/m11000_a3-pure.cl
+++ b/OpenCL/m11000_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
diff --git a/OpenCL/m11100_a0-optimized.cl b/OpenCL/m11100_a0-optimized.cl
index 7b2319f41..c0e12f986 100644
--- a/OpenCL/m11100_a0-optimized.cl
+++ b/OpenCL/m11100_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
@@ -16,15 +17,15 @@
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m11100_m04 (KERN_ATTR_RULES ())
@@ -41,7 +42,7 @@ KERNEL_FQ void m11100_m04 (KERN_ATTR_RULES ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -52,7 +53,7 @@ KERNEL_FQ void m11100_m04 (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -375,7 +376,7 @@ KERNEL_FQ void m11100_s04 (KERN_ATTR_RULES ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -386,7 +387,7 @@ KERNEL_FQ void m11100_s04 (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m11100_a0-pure.cl b/OpenCL/m11100_a0-pure.cl
index 821f39c07..c01fe3983 100644
--- a/OpenCL/m11100_a0-pure.cl
+++ b/OpenCL/m11100_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
@@ -16,15 +17,15 @@
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m11100_mxx (KERN_ATTR_RULES ())
@@ -41,7 +42,7 @@ KERNEL_FQ void m11100_mxx (KERN_ATTR_RULES ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -52,7 +53,7 @@ KERNEL_FQ void m11100_mxx (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -187,7 +188,7 @@ KERNEL_FQ void m11100_sxx (KERN_ATTR_RULES ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -198,7 +199,7 @@ KERNEL_FQ void m11100_sxx (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m11100_a1-optimized.cl b/OpenCL/m11100_a1-optimized.cl
index d378ff025..1ee64d6d7 100644
--- a/OpenCL/m11100_a1-optimized.cl
+++ b/OpenCL/m11100_a1-optimized.cl
@@ -8,21 +8,22 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m11100_m04 (KERN_ATTR_BASIC ())
@@ -39,7 +40,7 @@ KERNEL_FQ void m11100_m04 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -50,7 +51,7 @@ KERNEL_FQ void m11100_m04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -433,7 +434,7 @@ KERNEL_FQ void m11100_s04 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -444,7 +445,7 @@ KERNEL_FQ void m11100_s04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m11100_a1-pure.cl b/OpenCL/m11100_a1-pure.cl
index 92bdbac02..29b085050 100644
--- a/OpenCL/m11100_a1-pure.cl
+++ b/OpenCL/m11100_a1-pure.cl
@@ -8,21 +8,22 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_md5.cl"
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m11100_mxx (KERN_ATTR_BASIC ())
@@ -39,7 +40,7 @@ KERNEL_FQ void m11100_mxx (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -50,7 +51,7 @@ KERNEL_FQ void m11100_mxx (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -183,7 +184,7 @@ KERNEL_FQ void m11100_sxx (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -194,7 +195,7 @@ KERNEL_FQ void m11100_sxx (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m11100_a3-optimized.cl b/OpenCL/m11100_a3-optimized.cl
index 76f4dab57..58a49c9aa 100644
--- a/OpenCL/m11100_a3-optimized.cl
+++ b/OpenCL/m11100_a3-optimized.cl
@@ -8,21 +8,22 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 DECLSPEC void m11100m (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 pw_len, KERN_ATTR_BASIC (), LOCAL_AS u32 *l_bin2asc)
@@ -664,7 +665,7 @@ KERNEL_FQ void m11100_m04 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -675,7 +676,7 @@ KERNEL_FQ void m11100_m04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -734,7 +735,7 @@ KERNEL_FQ void m11100_m08 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -745,7 +746,7 @@ KERNEL_FQ void m11100_m08 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -804,7 +805,7 @@ KERNEL_FQ void m11100_m16 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -815,7 +816,7 @@ KERNEL_FQ void m11100_m16 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -874,7 +875,7 @@ KERNEL_FQ void m11100_s04 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -885,7 +886,7 @@ KERNEL_FQ void m11100_s04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -944,7 +945,7 @@ KERNEL_FQ void m11100_s08 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -955,7 +956,7 @@ KERNEL_FQ void m11100_s08 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -1014,7 +1015,7 @@ KERNEL_FQ void m11100_s16 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -1025,7 +1026,7 @@ KERNEL_FQ void m11100_s16 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m11100_a3-pure.cl b/OpenCL/m11100_a3-pure.cl
index a9e172740..de5089ebc 100644
--- a/OpenCL/m11100_a3-pure.cl
+++ b/OpenCL/m11100_a3-pure.cl
@@ -8,21 +8,22 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m11100_mxx (KERN_ATTR_VECTOR ())
@@ -39,7 +40,7 @@ KERNEL_FQ void m11100_mxx (KERN_ATTR_VECTOR ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -50,7 +51,7 @@ KERNEL_FQ void m11100_mxx (KERN_ATTR_VECTOR ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -226,7 +227,7 @@ KERNEL_FQ void m11100_sxx (KERN_ATTR_VECTOR ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -237,7 +238,7 @@ KERNEL_FQ void m11100_sxx (KERN_ATTR_VECTOR ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m11200_a0-optimized.cl b/OpenCL/m11200_a0-optimized.cl
index 2081da572..f3f09585a 100644
--- a/OpenCL/m11200_a0-optimized.cl
+++ b/OpenCL/m11200_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
diff --git a/OpenCL/m11200_a0-pure.cl b/OpenCL/m11200_a0-pure.cl
index 71e247607..c89b16362 100644
--- a/OpenCL/m11200_a0-pure.cl
+++ b/OpenCL/m11200_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
diff --git a/OpenCL/m11200_a1-optimized.cl b/OpenCL/m11200_a1-optimized.cl
index 0e6e3e2cd..62dda9028 100644
--- a/OpenCL/m11200_a1-optimized.cl
+++ b/OpenCL/m11200_a1-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m11200_a1-pure.cl b/OpenCL/m11200_a1-pure.cl
index 12ac6b1ff..173c3a127 100644
--- a/OpenCL/m11200_a1-pure.cl
+++ b/OpenCL/m11200_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m11200_a3-optimized.cl b/OpenCL/m11200_a3-optimized.cl
index 34d62f10c..7f012eaa7 100644
--- a/OpenCL/m11200_a3-optimized.cl
+++ b/OpenCL/m11200_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m11200_a3-pure.cl b/OpenCL/m11200_a3-pure.cl
index 7d9a898b8..76bb6a866 100644
--- a/OpenCL/m11200_a3-pure.cl
+++ b/OpenCL/m11200_a3-pure.cl
@@ -8,21 +8,22 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m11200_mxx (KERN_ATTR_VECTOR ())
diff --git a/OpenCL/m11300-pure.cl b/OpenCL/m11300-pure.cl
index cf0b06e7b..c734b102c 100644
--- a/OpenCL/m11300-pure.cl
+++ b/OpenCL/m11300-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha512.cl"
@@ -219,17 +220,17 @@ KERNEL_FQ void m11300_comp (KERN_ATTR_TMPS_ESALT (bitcoin_wallet_tmp_t, bitcoin_
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -246,7 +247,7 @@ KERNEL_FQ void m11300_comp (KERN_ATTR_TMPS_ESALT (bitcoin_wallet_tmp_t, bitcoin_
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m11400_a0-pure.cl b/OpenCL/m11400_a0-pure.cl
index 28d917d9e..ef1074fe1 100644
--- a/OpenCL/m11400_a0-pure.cl
+++ b/OpenCL/m11400_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
@@ -26,15 +27,15 @@ typedef struct sip
 } sip_t;
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m11400_mxx (KERN_ATTR_RULES_ESALT (sip_t))
@@ -51,7 +52,7 @@ KERNEL_FQ void m11400_mxx (KERN_ATTR_RULES_ESALT (sip_t))
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -62,7 +63,7 @@ KERNEL_FQ void m11400_mxx (KERN_ATTR_RULES_ESALT (sip_t))
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -149,7 +150,7 @@ KERNEL_FQ void m11400_sxx (KERN_ATTR_RULES_ESALT (sip_t))
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -160,7 +161,7 @@ KERNEL_FQ void m11400_sxx (KERN_ATTR_RULES_ESALT (sip_t))
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m11400_a1-pure.cl b/OpenCL/m11400_a1-pure.cl
index 92ac1c8fd..85cdc5ee6 100644
--- a/OpenCL/m11400_a1-pure.cl
+++ b/OpenCL/m11400_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_md5.cl"
@@ -24,15 +25,15 @@ typedef struct sip
 } sip_t;
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m11400_mxx (KERN_ATTR_ESALT (sip_t))
@@ -49,7 +50,7 @@ KERNEL_FQ void m11400_mxx (KERN_ATTR_ESALT (sip_t))
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -60,7 +61,7 @@ KERNEL_FQ void m11400_mxx (KERN_ATTR_ESALT (sip_t))
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -143,7 +144,7 @@ KERNEL_FQ void m11400_sxx (KERN_ATTR_ESALT (sip_t))
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -154,7 +155,7 @@ KERNEL_FQ void m11400_sxx (KERN_ATTR_ESALT (sip_t))
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m11400_a3-pure.cl b/OpenCL/m11400_a3-pure.cl
index a98a3f6ce..b933c43d9 100644
--- a/OpenCL/m11400_a3-pure.cl
+++ b/OpenCL/m11400_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
@@ -24,15 +25,15 @@ typedef struct sip
 } sip_t;
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m11400_mxx (KERN_ATTR_VECTOR_ESALT (sip_t))
@@ -49,7 +50,7 @@ KERNEL_FQ void m11400_mxx (KERN_ATTR_VECTOR_ESALT (sip_t))
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -60,7 +61,7 @@ KERNEL_FQ void m11400_mxx (KERN_ATTR_VECTOR_ESALT (sip_t))
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -169,7 +170,7 @@ KERNEL_FQ void m11400_sxx (KERN_ATTR_VECTOR_ESALT (sip_t))
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -180,7 +181,7 @@ KERNEL_FQ void m11400_sxx (KERN_ATTR_VECTOR_ESALT (sip_t))
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m11500_a0-optimized.cl b/OpenCL/m11500_a0-optimized.cl
index bcea197f0..1ea2ec7e4 100644
--- a/OpenCL/m11500_a0-optimized.cl
+++ b/OpenCL/m11500_a0-optimized.cl
@@ -9,13 +9,14 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
 #include "inc_simd.cl"
 #endif
 
-CONSTANT_AS u32a crc32tab[0x100] =
+CONSTANT_VK u32a crc32tab[0x100] =
 {
   0x00000000, 0x77073096, 0xee0e612c, 0x990951ba,
   0x076dc419, 0x706af48f, 0xe963a535, 0x9e6495a3,
@@ -90,15 +91,15 @@ DECLSPEC u32x round_crc32 (u32x a, const u32x v)
   const u32x s = a >> 8;
 
   #if   VECT_SIZE == 1
-  a = (u32x) crc32tab[k];
+  a = make_u32x crc32tab[k];
   #elif VECT_SIZE == 2
-  a = (u32x) (crc32tab[k.s0], crc32tab[k.s1]);
+  a = make_u32x (crc32tab[k.s0], crc32tab[k.s1]);
   #elif VECT_SIZE == 4
-  a = (u32x) (crc32tab[k.s0], crc32tab[k.s1], crc32tab[k.s2], crc32tab[k.s3]);
+  a = make_u32x (crc32tab[k.s0], crc32tab[k.s1], crc32tab[k.s2], crc32tab[k.s3]);
   #elif VECT_SIZE == 8
-  a = (u32x) (crc32tab[k.s0], crc32tab[k.s1], crc32tab[k.s2], crc32tab[k.s3], crc32tab[k.s4], crc32tab[k.s5], crc32tab[k.s6], crc32tab[k.s7]);
+  a = make_u32x (crc32tab[k.s0], crc32tab[k.s1], crc32tab[k.s2], crc32tab[k.s3], crc32tab[k.s4], crc32tab[k.s5], crc32tab[k.s6], crc32tab[k.s7]);
   #elif VECT_SIZE == 16
-  a = (u32x) (crc32tab[k.s0], crc32tab[k.s1], crc32tab[k.s2], crc32tab[k.s3], crc32tab[k.s4], crc32tab[k.s5], crc32tab[k.s6], crc32tab[k.s7], crc32tab[k.s8], crc32tab[k.s9], crc32tab[k.sa], crc32tab[k.sb], crc32tab[k.sc], crc32tab[k.sd], crc32tab[k.se], crc32tab[k.sf]);
+  a = make_u32x (crc32tab[k.s0], crc32tab[k.s1], crc32tab[k.s2], crc32tab[k.s3], crc32tab[k.s4], crc32tab[k.s5], crc32tab[k.s6], crc32tab[k.s7], crc32tab[k.s8], crc32tab[k.s9], crc32tab[k.sa], crc32tab[k.sb], crc32tab[k.sc], crc32tab[k.sd], crc32tab[k.se], crc32tab[k.sf]);
   #endif
 
   a ^= s;
diff --git a/OpenCL/m11500_a1-optimized.cl b/OpenCL/m11500_a1-optimized.cl
index bede0d4f1..b3390f961 100644
--- a/OpenCL/m11500_a1-optimized.cl
+++ b/OpenCL/m11500_a1-optimized.cl
@@ -9,11 +9,12 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #endif
 
-CONSTANT_AS u32a crc32tab[0x100] =
+CONSTANT_VK u32a crc32tab[0x100] =
 {
   0x00000000, 0x77073096, 0xee0e612c, 0x990951ba,
   0x076dc419, 0x706af48f, 0xe963a535, 0x9e6495a3,
@@ -88,15 +89,15 @@ DECLSPEC u32x round_crc32 (u32x a, const u32x v)
   const u32x s = a >> 8;
 
   #if   VECT_SIZE == 1
-  a = (u32x) crc32tab[k];
+  a = make_u32x crc32tab[k];
   #elif VECT_SIZE == 2
-  a = (u32x) (crc32tab[k.s0], crc32tab[k.s1]);
+  a = make_u32x (crc32tab[k.s0], crc32tab[k.s1]);
   #elif VECT_SIZE == 4
-  a = (u32x) (crc32tab[k.s0], crc32tab[k.s1], crc32tab[k.s2], crc32tab[k.s3]);
+  a = make_u32x (crc32tab[k.s0], crc32tab[k.s1], crc32tab[k.s2], crc32tab[k.s3]);
   #elif VECT_SIZE == 8
-  a = (u32x) (crc32tab[k.s0], crc32tab[k.s1], crc32tab[k.s2], crc32tab[k.s3], crc32tab[k.s4], crc32tab[k.s5], crc32tab[k.s6], crc32tab[k.s7]);
+  a = make_u32x (crc32tab[k.s0], crc32tab[k.s1], crc32tab[k.s2], crc32tab[k.s3], crc32tab[k.s4], crc32tab[k.s5], crc32tab[k.s6], crc32tab[k.s7]);
   #elif VECT_SIZE == 16
-  a = (u32x) (crc32tab[k.s0], crc32tab[k.s1], crc32tab[k.s2], crc32tab[k.s3], crc32tab[k.s4], crc32tab[k.s5], crc32tab[k.s6], crc32tab[k.s7], crc32tab[k.s8], crc32tab[k.s9], crc32tab[k.sa], crc32tab[k.sb], crc32tab[k.sc], crc32tab[k.sd], crc32tab[k.se], crc32tab[k.sf]);
+  a = make_u32x (crc32tab[k.s0], crc32tab[k.s1], crc32tab[k.s2], crc32tab[k.s3], crc32tab[k.s4], crc32tab[k.s5], crc32tab[k.s6], crc32tab[k.s7], crc32tab[k.s8], crc32tab[k.s9], crc32tab[k.sa], crc32tab[k.sb], crc32tab[k.sc], crc32tab[k.sd], crc32tab[k.se], crc32tab[k.sf]);
   #endif
 
   a ^= s;
diff --git a/OpenCL/m11500_a3-optimized.cl b/OpenCL/m11500_a3-optimized.cl
index 70af7777a..d10b09401 100644
--- a/OpenCL/m11500_a3-optimized.cl
+++ b/OpenCL/m11500_a3-optimized.cl
@@ -9,11 +9,12 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #endif
 
-CONSTANT_AS u32a crc32tab[0x100] =
+CONSTANT_VK u32a crc32tab[0x100] =
 {
   0x00000000, 0x77073096, 0xee0e612c, 0x990951ba,
   0x076dc419, 0x706af48f, 0xe963a535, 0x9e6495a3,
@@ -88,15 +89,15 @@ DECLSPEC u32x round_crc32 (u32x a, const u32x v)
   const u32x s = a >> 8;
 
   #if   VECT_SIZE == 1
-  a = (u32x) crc32tab[k];
+  a = make_u32x crc32tab[k];
   #elif VECT_SIZE == 2
-  a = (u32x) (crc32tab[k.s0], crc32tab[k.s1]);
+  a = make_u32x (crc32tab[k.s0], crc32tab[k.s1]);
   #elif VECT_SIZE == 4
-  a = (u32x) (crc32tab[k.s0], crc32tab[k.s1], crc32tab[k.s2], crc32tab[k.s3]);
+  a = make_u32x (crc32tab[k.s0], crc32tab[k.s1], crc32tab[k.s2], crc32tab[k.s3]);
   #elif VECT_SIZE == 8
-  a = (u32x) (crc32tab[k.s0], crc32tab[k.s1], crc32tab[k.s2], crc32tab[k.s3], crc32tab[k.s4], crc32tab[k.s5], crc32tab[k.s6], crc32tab[k.s7]);
+  a = make_u32x (crc32tab[k.s0], crc32tab[k.s1], crc32tab[k.s2], crc32tab[k.s3], crc32tab[k.s4], crc32tab[k.s5], crc32tab[k.s6], crc32tab[k.s7]);
   #elif VECT_SIZE == 16
-  a = (u32x) (crc32tab[k.s0], crc32tab[k.s1], crc32tab[k.s2], crc32tab[k.s3], crc32tab[k.s4], crc32tab[k.s5], crc32tab[k.s6], crc32tab[k.s7], crc32tab[k.s8], crc32tab[k.s9], crc32tab[k.sa], crc32tab[k.sb], crc32tab[k.sc], crc32tab[k.sd], crc32tab[k.se], crc32tab[k.sf]);
+  a = make_u32x (crc32tab[k.s0], crc32tab[k.s1], crc32tab[k.s2], crc32tab[k.s3], crc32tab[k.s4], crc32tab[k.s5], crc32tab[k.s6], crc32tab[k.s7], crc32tab[k.s8], crc32tab[k.s9], crc32tab[k.sa], crc32tab[k.sb], crc32tab[k.sc], crc32tab[k.sd], crc32tab[k.se], crc32tab[k.sf]);
   #endif
 
   a ^= s;
diff --git a/OpenCL/m11600-pure.cl b/OpenCL/m11600-pure.cl
index ea19df0cb..545a0545c 100644
--- a/OpenCL/m11600-pure.cl
+++ b/OpenCL/m11600-pure.cl
@@ -6,6 +6,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_hash_sha256.cl"
 #endif
diff --git a/OpenCL/m11700_a0-optimized.cl b/OpenCL/m11700_a0-optimized.cl
index e42cc57ac..8a474ec3a 100644
--- a/OpenCL/m11700_a0-optimized.cl
+++ b/OpenCL/m11700_a0-optimized.cl
@@ -9,6 +9,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
@@ -100,7 +101,7 @@ KERNEL_FQ void m11700_m04 (KERN_ATTR_RULES ())
    * shared lookup table
    */
 
-  LOCAL_AS u64 s_sbob_sl64[8][256];
+  LOCAL_VK u64 s_sbob_sl64[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -114,7 +115,7 @@ KERNEL_FQ void m11700_m04 (KERN_ATTR_RULES ())
     s_sbob_sl64[7][i] = sbob256_sl64[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -258,7 +259,7 @@ KERNEL_FQ void m11700_s04 (KERN_ATTR_RULES ())
    * shared lookup table
    */
 
-  LOCAL_AS u64 s_sbob_sl64[8][256];
+  LOCAL_VK u64 s_sbob_sl64[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -272,7 +273,7 @@ KERNEL_FQ void m11700_s04 (KERN_ATTR_RULES ())
     s_sbob_sl64[7][i] = sbob256_sl64[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m11700_a0-pure.cl b/OpenCL/m11700_a0-pure.cl
index d651caa92..30476e4de 100644
--- a/OpenCL/m11700_a0-pure.cl
+++ b/OpenCL/m11700_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
@@ -31,7 +32,7 @@ KERNEL_FQ void m11700_mxx (KERN_ATTR_RULES ())
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u64a s_sbob_sl64[8][256];
+  LOCAL_VK u64a s_sbob_sl64[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -45,7 +46,7 @@ KERNEL_FQ void m11700_mxx (KERN_ATTR_RULES ())
     s_sbob_sl64[7][i] = sbob256_sl64[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -104,7 +105,7 @@ KERNEL_FQ void m11700_sxx (KERN_ATTR_RULES ())
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u64a s_sbob_sl64[8][256];
+  LOCAL_VK u64a s_sbob_sl64[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -118,7 +119,7 @@ KERNEL_FQ void m11700_sxx (KERN_ATTR_RULES ())
     s_sbob_sl64[7][i] = sbob256_sl64[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m11700_a1-optimized.cl b/OpenCL/m11700_a1-optimized.cl
index 982e53128..3df9793a0 100644
--- a/OpenCL/m11700_a1-optimized.cl
+++ b/OpenCL/m11700_a1-optimized.cl
@@ -9,6 +9,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_streebog256.cl"
@@ -98,7 +99,7 @@ KERNEL_FQ void m11700_m04 (KERN_ATTR_BASIC ())
    * shared lookup table
    */
 
-  LOCAL_AS u64 s_sbob_sl64[8][256];
+  LOCAL_VK u64 s_sbob_sl64[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -112,7 +113,7 @@ KERNEL_FQ void m11700_m04 (KERN_ATTR_BASIC ())
     s_sbob_sl64[7][i] = sbob256_sl64[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -314,7 +315,7 @@ KERNEL_FQ void m11700_s04 (KERN_ATTR_BASIC ())
    * shared lookup table
    */
 
-  LOCAL_AS u64 s_sbob_sl64[8][256];
+  LOCAL_VK u64 s_sbob_sl64[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -328,7 +329,7 @@ KERNEL_FQ void m11700_s04 (KERN_ATTR_BASIC ())
     s_sbob_sl64[7][i] = sbob256_sl64[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m11700_a1-pure.cl b/OpenCL/m11700_a1-pure.cl
index 4046115f1..703f3a2b8 100644
--- a/OpenCL/m11700_a1-pure.cl
+++ b/OpenCL/m11700_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_streebog256.cl"
@@ -29,7 +30,7 @@ KERNEL_FQ void m11700_mxx (KERN_ATTR_BASIC ())
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u64a s_sbob_sl64[8][256];
+  LOCAL_VK u64a s_sbob_sl64[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -43,7 +44,7 @@ KERNEL_FQ void m11700_mxx (KERN_ATTR_BASIC ())
     s_sbob_sl64[7][i] = sbob256_sl64[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -100,7 +101,7 @@ KERNEL_FQ void m11700_sxx (KERN_ATTR_BASIC ())
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u64a s_sbob_sl64[8][256];
+  LOCAL_VK u64a s_sbob_sl64[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -114,7 +115,7 @@ KERNEL_FQ void m11700_sxx (KERN_ATTR_BASIC ())
     s_sbob_sl64[7][i] = sbob256_sl64[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m11700_a3-optimized.cl b/OpenCL/m11700_a3-optimized.cl
index 7a13ed762..490d882ee 100644
--- a/OpenCL/m11700_a3-optimized.cl
+++ b/OpenCL/m11700_a3-optimized.cl
@@ -9,6 +9,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_streebog256.cl"
@@ -267,7 +268,7 @@ KERNEL_FQ void m11700_m04 (KERN_ATTR_BASIC ())
    * shared lookup table
    */
 
-  LOCAL_AS u64 s_sbob_sl64[8][256];
+  LOCAL_VK u64 s_sbob_sl64[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -281,7 +282,7 @@ KERNEL_FQ void m11700_m04 (KERN_ATTR_BASIC ())
     s_sbob_sl64[7][i] = sbob256_sl64[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -331,7 +332,7 @@ KERNEL_FQ void m11700_m08 (KERN_ATTR_BASIC ())
    * shared lookup table
    */
 
-  LOCAL_AS u64 s_sbob_sl64[8][256];
+  LOCAL_VK u64 s_sbob_sl64[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -345,7 +346,7 @@ KERNEL_FQ void m11700_m08 (KERN_ATTR_BASIC ())
     s_sbob_sl64[7][i] = sbob256_sl64[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -395,7 +396,7 @@ KERNEL_FQ void m11700_m16 (KERN_ATTR_BASIC ())
    * shared lookup table
    */
 
-  LOCAL_AS u64 s_sbob_sl64[8][256];
+  LOCAL_VK u64 s_sbob_sl64[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -409,7 +410,7 @@ KERNEL_FQ void m11700_m16 (KERN_ATTR_BASIC ())
     s_sbob_sl64[7][i] = sbob256_sl64[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -459,7 +460,7 @@ KERNEL_FQ void m11700_s04 (KERN_ATTR_BASIC ())
    * shared lookup table
    */
 
-  LOCAL_AS u64 s_sbob_sl64[8][256];
+  LOCAL_VK u64 s_sbob_sl64[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -473,7 +474,7 @@ KERNEL_FQ void m11700_s04 (KERN_ATTR_BASIC ())
     s_sbob_sl64[7][i] = sbob256_sl64[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -523,7 +524,7 @@ KERNEL_FQ void m11700_s08 (KERN_ATTR_BASIC ())
    * shared lookup table
    */
 
-  LOCAL_AS u64 s_sbob_sl64[8][256];
+  LOCAL_VK u64 s_sbob_sl64[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -537,7 +538,7 @@ KERNEL_FQ void m11700_s08 (KERN_ATTR_BASIC ())
     s_sbob_sl64[7][i] = sbob256_sl64[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -587,7 +588,7 @@ KERNEL_FQ void m11700_s16 (KERN_ATTR_BASIC ())
    * shared lookup table
    */
 
-  LOCAL_AS u64 s_sbob_sl64[8][256];
+  LOCAL_VK u64 s_sbob_sl64[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -601,7 +602,7 @@ KERNEL_FQ void m11700_s16 (KERN_ATTR_BASIC ())
     s_sbob_sl64[7][i] = sbob256_sl64[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m11700_a3-pure.cl b/OpenCL/m11700_a3-pure.cl
index 46988ca46..e37e72a1b 100644
--- a/OpenCL/m11700_a3-pure.cl
+++ b/OpenCL/m11700_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_streebog256.cl"
@@ -29,7 +30,7 @@ KERNEL_FQ void m11700_mxx (KERN_ATTR_VECTOR ())
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u64a s_sbob_sl64[8][256];
+  LOCAL_VK u64a s_sbob_sl64[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -43,7 +44,7 @@ KERNEL_FQ void m11700_mxx (KERN_ATTR_VECTOR ())
     s_sbob_sl64[7][i] = sbob256_sl64[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -113,7 +114,7 @@ KERNEL_FQ void m11700_sxx (KERN_ATTR_VECTOR ())
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u64a s_sbob_sl64[8][256];
+  LOCAL_VK u64a s_sbob_sl64[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -127,7 +128,7 @@ KERNEL_FQ void m11700_sxx (KERN_ATTR_VECTOR ())
     s_sbob_sl64[7][i] = sbob256_sl64[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m11750_a0-pure.cl b/OpenCL/m11750_a0-pure.cl
index 8f1a4f03e..d7286cf90 100644
--- a/OpenCL/m11750_a0-pure.cl
+++ b/OpenCL/m11750_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
@@ -31,7 +32,7 @@ KERNEL_FQ void m11750_mxx (KERN_ATTR_RULES ())
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u64a s_sbob_sl64[8][256];
+  LOCAL_VK u64a s_sbob_sl64[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -45,7 +46,7 @@ KERNEL_FQ void m11750_mxx (KERN_ATTR_RULES ())
     s_sbob_sl64[7][i] = sbob256_sl64[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -113,7 +114,7 @@ KERNEL_FQ void m11750_sxx (KERN_ATTR_RULES ())
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u64a s_sbob_sl64[8][256];
+  LOCAL_VK u64a s_sbob_sl64[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -127,7 +128,7 @@ KERNEL_FQ void m11750_sxx (KERN_ATTR_RULES ())
     s_sbob_sl64[7][i] = sbob256_sl64[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m11750_a1-pure.cl b/OpenCL/m11750_a1-pure.cl
index fef704bb5..c2499b6fe 100644
--- a/OpenCL/m11750_a1-pure.cl
+++ b/OpenCL/m11750_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_streebog256.cl"
@@ -29,7 +30,7 @@ KERNEL_FQ void m11750_mxx (KERN_ATTR_BASIC ())
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u64a s_sbob_sl64[8][256];
+  LOCAL_VK u64a s_sbob_sl64[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -43,7 +44,7 @@ KERNEL_FQ void m11750_mxx (KERN_ATTR_BASIC ())
     s_sbob_sl64[7][i] = sbob256_sl64[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -136,7 +137,7 @@ KERNEL_FQ void m11750_sxx (KERN_ATTR_BASIC ())
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u64a s_sbob_sl64[8][256];
+  LOCAL_VK u64a s_sbob_sl64[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -150,7 +151,7 @@ KERNEL_FQ void m11750_sxx (KERN_ATTR_BASIC ())
     s_sbob_sl64[7][i] = sbob256_sl64[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m11750_a3-pure.cl b/OpenCL/m11750_a3-pure.cl
index 389818a4f..6d0d5eeae 100644
--- a/OpenCL/m11750_a3-pure.cl
+++ b/OpenCL/m11750_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_streebog256.cl"
@@ -29,7 +30,7 @@ KERNEL_FQ void m11750_mxx (KERN_ATTR_VECTOR ())
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u64a s_sbob_sl64[8][256];
+  LOCAL_VK u64a s_sbob_sl64[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -43,7 +44,7 @@ KERNEL_FQ void m11750_mxx (KERN_ATTR_VECTOR ())
     s_sbob_sl64[7][i] = sbob256_sl64[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -122,7 +123,7 @@ KERNEL_FQ void m11750_sxx (KERN_ATTR_VECTOR ())
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u64a s_sbob_sl64[8][256];
+  LOCAL_VK u64a s_sbob_sl64[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -136,7 +137,7 @@ KERNEL_FQ void m11750_sxx (KERN_ATTR_VECTOR ())
     s_sbob_sl64[7][i] = sbob256_sl64[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m11760_a0-pure.cl b/OpenCL/m11760_a0-pure.cl
index 23784cd05..b7759a991 100644
--- a/OpenCL/m11760_a0-pure.cl
+++ b/OpenCL/m11760_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
@@ -31,7 +32,7 @@ KERNEL_FQ void m11760_mxx (KERN_ATTR_RULES ())
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u64a s_sbob_sl64[8][256];
+  LOCAL_VK u64a s_sbob_sl64[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -45,7 +46,7 @@ KERNEL_FQ void m11760_mxx (KERN_ATTR_RULES ())
     s_sbob_sl64[7][i] = sbob256_sl64[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -115,7 +116,7 @@ KERNEL_FQ void m11760_sxx (KERN_ATTR_RULES ())
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u64a s_sbob_sl64[8][256];
+  LOCAL_VK u64a s_sbob_sl64[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -129,7 +130,7 @@ KERNEL_FQ void m11760_sxx (KERN_ATTR_RULES ())
     s_sbob_sl64[7][i] = sbob256_sl64[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m11760_a1-pure.cl b/OpenCL/m11760_a1-pure.cl
index 1f2f7fa41..d123d394c 100644
--- a/OpenCL/m11760_a1-pure.cl
+++ b/OpenCL/m11760_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_streebog256.cl"
@@ -29,7 +30,7 @@ KERNEL_FQ void m11760_mxx (KERN_ATTR_BASIC ())
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u64a s_sbob_sl64[8][256];
+  LOCAL_VK u64a s_sbob_sl64[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -43,7 +44,7 @@ KERNEL_FQ void m11760_mxx (KERN_ATTR_BASIC ())
     s_sbob_sl64[7][i] = sbob256_sl64[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -138,7 +139,7 @@ KERNEL_FQ void m11760_sxx (KERN_ATTR_BASIC ())
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u64a s_sbob_sl64[8][256];
+  LOCAL_VK u64a s_sbob_sl64[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -152,7 +153,7 @@ KERNEL_FQ void m11760_sxx (KERN_ATTR_BASIC ())
     s_sbob_sl64[7][i] = sbob256_sl64[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m11760_a3-pure.cl b/OpenCL/m11760_a3-pure.cl
index 7096f4616..3738364d3 100644
--- a/OpenCL/m11760_a3-pure.cl
+++ b/OpenCL/m11760_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_streebog256.cl"
@@ -29,7 +30,7 @@ KERNEL_FQ void m11760_mxx (KERN_ATTR_VECTOR ())
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u64a s_sbob_sl64[8][256];
+  LOCAL_VK u64a s_sbob_sl64[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -43,7 +44,7 @@ KERNEL_FQ void m11760_mxx (KERN_ATTR_VECTOR ())
     s_sbob_sl64[7][i] = sbob256_sl64[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -124,7 +125,7 @@ KERNEL_FQ void m11760_sxx (KERN_ATTR_VECTOR ())
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u64a s_sbob_sl64[8][256];
+  LOCAL_VK u64a s_sbob_sl64[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -138,7 +139,7 @@ KERNEL_FQ void m11760_sxx (KERN_ATTR_VECTOR ())
     s_sbob_sl64[7][i] = sbob256_sl64[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m11800_a0-optimized.cl b/OpenCL/m11800_a0-optimized.cl
index 30b8ffc46..233a39593 100644
--- a/OpenCL/m11800_a0-optimized.cl
+++ b/OpenCL/m11800_a0-optimized.cl
@@ -9,6 +9,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
@@ -100,7 +101,7 @@ KERNEL_FQ void m11800_m04 (KERN_ATTR_RULES ())
    * shared lookup table
    */
 
-  LOCAL_AS u64 s_sbob_sl64[8][256];
+  LOCAL_VK u64 s_sbob_sl64[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -114,7 +115,7 @@ KERNEL_FQ void m11800_m04 (KERN_ATTR_RULES ())
     s_sbob_sl64[7][i] = sbob512_sl64[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -258,7 +259,7 @@ KERNEL_FQ void m11800_s04 (KERN_ATTR_RULES ())
    * shared lookup table
    */
 
-  LOCAL_AS u64 s_sbob_sl64[8][256];
+  LOCAL_VK u64 s_sbob_sl64[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -272,7 +273,7 @@ KERNEL_FQ void m11800_s04 (KERN_ATTR_RULES ())
     s_sbob_sl64[7][i] = sbob512_sl64[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m11800_a0-pure.cl b/OpenCL/m11800_a0-pure.cl
index e21d9eae6..724ccac23 100644
--- a/OpenCL/m11800_a0-pure.cl
+++ b/OpenCL/m11800_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
@@ -31,7 +32,7 @@ KERNEL_FQ void m11800_mxx (KERN_ATTR_RULES ())
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u64a s_sbob_sl64[8][256];
+  LOCAL_VK u64a s_sbob_sl64[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -45,7 +46,7 @@ KERNEL_FQ void m11800_mxx (KERN_ATTR_RULES ())
     s_sbob_sl64[7][i] = sbob512_sl64[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -104,7 +105,7 @@ KERNEL_FQ void m11800_sxx (KERN_ATTR_RULES ())
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u64a s_sbob_sl64[8][256];
+  LOCAL_VK u64a s_sbob_sl64[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -118,7 +119,7 @@ KERNEL_FQ void m11800_sxx (KERN_ATTR_RULES ())
     s_sbob_sl64[7][i] = sbob512_sl64[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m11800_a1-optimized.cl b/OpenCL/m11800_a1-optimized.cl
index 34846963d..4e082982c 100644
--- a/OpenCL/m11800_a1-optimized.cl
+++ b/OpenCL/m11800_a1-optimized.cl
@@ -9,6 +9,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_streebog512.cl"
@@ -98,7 +99,7 @@ KERNEL_FQ void m11800_m04 (KERN_ATTR_BASIC ())
    * shared lookup table
    */
 
-  LOCAL_AS u64 s_sbob_sl64[8][256];
+  LOCAL_VK u64 s_sbob_sl64[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -112,7 +113,7 @@ KERNEL_FQ void m11800_m04 (KERN_ATTR_BASIC ())
     s_sbob_sl64[7][i] = sbob512_sl64[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -314,7 +315,7 @@ KERNEL_FQ void m11800_s04 (KERN_ATTR_BASIC ())
    * shared lookup table
    */
 
-  LOCAL_AS u64 s_sbob_sl64[8][256];
+  LOCAL_VK u64 s_sbob_sl64[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -328,7 +329,7 @@ KERNEL_FQ void m11800_s04 (KERN_ATTR_BASIC ())
     s_sbob_sl64[7][i] = sbob512_sl64[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m11800_a1-pure.cl b/OpenCL/m11800_a1-pure.cl
index 52cb7bd02..449d8d6e7 100644
--- a/OpenCL/m11800_a1-pure.cl
+++ b/OpenCL/m11800_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_streebog512.cl"
@@ -29,7 +30,7 @@ KERNEL_FQ void m11800_mxx (KERN_ATTR_BASIC ())
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u64a s_sbob_sl64[8][256];
+  LOCAL_VK u64a s_sbob_sl64[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -43,7 +44,7 @@ KERNEL_FQ void m11800_mxx (KERN_ATTR_BASIC ())
     s_sbob_sl64[7][i] = sbob512_sl64[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -100,7 +101,7 @@ KERNEL_FQ void m11800_sxx (KERN_ATTR_BASIC ())
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u64a s_sbob_sl64[8][256];
+  LOCAL_VK u64a s_sbob_sl64[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -114,7 +115,7 @@ KERNEL_FQ void m11800_sxx (KERN_ATTR_BASIC ())
     s_sbob_sl64[7][i] = sbob512_sl64[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m11800_a3-optimized.cl b/OpenCL/m11800_a3-optimized.cl
index 32a2746ce..77d8a99aa 100644
--- a/OpenCL/m11800_a3-optimized.cl
+++ b/OpenCL/m11800_a3-optimized.cl
@@ -9,6 +9,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_streebog512.cl"
@@ -267,7 +268,7 @@ KERNEL_FQ void m11800_m04 (KERN_ATTR_BASIC ())
    * shared lookup table
    */
 
-  LOCAL_AS u64 s_sbob_sl64[8][256];
+  LOCAL_VK u64 s_sbob_sl64[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -281,7 +282,7 @@ KERNEL_FQ void m11800_m04 (KERN_ATTR_BASIC ())
     s_sbob_sl64[7][i] = sbob512_sl64[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -331,7 +332,7 @@ KERNEL_FQ void m11800_m08 (KERN_ATTR_BASIC ())
    * shared lookup table
    */
 
-  LOCAL_AS u64 s_sbob_sl64[8][256];
+  LOCAL_VK u64 s_sbob_sl64[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -345,7 +346,7 @@ KERNEL_FQ void m11800_m08 (KERN_ATTR_BASIC ())
     s_sbob_sl64[7][i] = sbob512_sl64[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -395,7 +396,7 @@ KERNEL_FQ void m11800_m16 (KERN_ATTR_BASIC ())
    * shared lookup table
    */
 
-  LOCAL_AS u64 s_sbob_sl64[8][256];
+  LOCAL_VK u64 s_sbob_sl64[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -409,7 +410,7 @@ KERNEL_FQ void m11800_m16 (KERN_ATTR_BASIC ())
     s_sbob_sl64[7][i] = sbob512_sl64[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -459,7 +460,7 @@ KERNEL_FQ void m11800_s04 (KERN_ATTR_BASIC ())
    * shared lookup table
    */
 
-  LOCAL_AS u64 s_sbob_sl64[8][256];
+  LOCAL_VK u64 s_sbob_sl64[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -473,7 +474,7 @@ KERNEL_FQ void m11800_s04 (KERN_ATTR_BASIC ())
     s_sbob_sl64[7][i] = sbob512_sl64[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -523,7 +524,7 @@ KERNEL_FQ void m11800_s08 (KERN_ATTR_BASIC ())
    * shared lookup table
    */
 
-  LOCAL_AS u64 s_sbob_sl64[8][256];
+  LOCAL_VK u64 s_sbob_sl64[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -537,7 +538,7 @@ KERNEL_FQ void m11800_s08 (KERN_ATTR_BASIC ())
     s_sbob_sl64[7][i] = sbob512_sl64[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -587,7 +588,7 @@ KERNEL_FQ void m11800_s16 (KERN_ATTR_BASIC ())
    * shared lookup table
    */
 
-  LOCAL_AS u64 s_sbob_sl64[8][256];
+  LOCAL_VK u64 s_sbob_sl64[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -601,7 +602,7 @@ KERNEL_FQ void m11800_s16 (KERN_ATTR_BASIC ())
     s_sbob_sl64[7][i] = sbob512_sl64[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m11800_a3-pure.cl b/OpenCL/m11800_a3-pure.cl
index 12d1899c7..101d13daa 100644
--- a/OpenCL/m11800_a3-pure.cl
+++ b/OpenCL/m11800_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_streebog512.cl"
@@ -29,7 +30,7 @@ KERNEL_FQ void m11800_mxx (KERN_ATTR_VECTOR ())
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u64a s_sbob_sl64[8][256];
+  LOCAL_VK u64a s_sbob_sl64[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -43,7 +44,7 @@ KERNEL_FQ void m11800_mxx (KERN_ATTR_VECTOR ())
     s_sbob_sl64[7][i] = sbob512_sl64[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -113,7 +114,7 @@ KERNEL_FQ void m11800_sxx (KERN_ATTR_VECTOR ())
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u64a s_sbob_sl64[8][256];
+  LOCAL_VK u64a s_sbob_sl64[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -127,7 +128,7 @@ KERNEL_FQ void m11800_sxx (KERN_ATTR_VECTOR ())
     s_sbob_sl64[7][i] = sbob512_sl64[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m11850_a0-pure.cl b/OpenCL/m11850_a0-pure.cl
index 703ad9ac9..8182969d4 100644
--- a/OpenCL/m11850_a0-pure.cl
+++ b/OpenCL/m11850_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
@@ -31,7 +32,7 @@ KERNEL_FQ void m11850_mxx (KERN_ATTR_RULES ())
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u64a s_sbob_sl64[8][256];
+  LOCAL_VK u64a s_sbob_sl64[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -45,7 +46,7 @@ KERNEL_FQ void m11850_mxx (KERN_ATTR_RULES ())
     s_sbob_sl64[7][i] = sbob512_sl64[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -113,7 +114,7 @@ KERNEL_FQ void m11850_sxx (KERN_ATTR_RULES ())
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u64a s_sbob_sl64[8][256];
+  LOCAL_VK u64a s_sbob_sl64[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -127,7 +128,7 @@ KERNEL_FQ void m11850_sxx (KERN_ATTR_RULES ())
     s_sbob_sl64[7][i] = sbob512_sl64[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m11850_a1-pure.cl b/OpenCL/m11850_a1-pure.cl
index 6f8c93e1b..6fd6ad1d9 100644
--- a/OpenCL/m11850_a1-pure.cl
+++ b/OpenCL/m11850_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_streebog512.cl"
@@ -29,7 +30,7 @@ KERNEL_FQ void m11850_mxx (KERN_ATTR_BASIC ())
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u64a s_sbob_sl64[8][256];
+  LOCAL_VK u64a s_sbob_sl64[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -43,7 +44,7 @@ KERNEL_FQ void m11850_mxx (KERN_ATTR_BASIC ())
     s_sbob_sl64[7][i] = sbob512_sl64[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -136,7 +137,7 @@ KERNEL_FQ void m11850_sxx (KERN_ATTR_BASIC ())
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u64a s_sbob_sl64[8][256];
+  LOCAL_VK u64a s_sbob_sl64[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -150,7 +151,7 @@ KERNEL_FQ void m11850_sxx (KERN_ATTR_BASIC ())
     s_sbob_sl64[7][i] = sbob512_sl64[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m11850_a3-pure.cl b/OpenCL/m11850_a3-pure.cl
index b273a6ef4..e17e32cb6 100644
--- a/OpenCL/m11850_a3-pure.cl
+++ b/OpenCL/m11850_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_streebog512.cl"
@@ -29,7 +30,7 @@ KERNEL_FQ void m11850_mxx (KERN_ATTR_VECTOR ())
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u64a s_sbob_sl64[8][256];
+  LOCAL_VK u64a s_sbob_sl64[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -43,7 +44,7 @@ KERNEL_FQ void m11850_mxx (KERN_ATTR_VECTOR ())
     s_sbob_sl64[7][i] = sbob512_sl64[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -122,7 +123,7 @@ KERNEL_FQ void m11850_sxx (KERN_ATTR_VECTOR ())
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u64a s_sbob_sl64[8][256];
+  LOCAL_VK u64a s_sbob_sl64[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -136,7 +137,7 @@ KERNEL_FQ void m11850_sxx (KERN_ATTR_VECTOR ())
     s_sbob_sl64[7][i] = sbob512_sl64[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m11860_a0-pure.cl b/OpenCL/m11860_a0-pure.cl
index 0f20573ea..5f89f576f 100644
--- a/OpenCL/m11860_a0-pure.cl
+++ b/OpenCL/m11860_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
@@ -31,7 +32,7 @@ KERNEL_FQ void m11860_mxx (KERN_ATTR_RULES ())
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u64a s_sbob_sl64[8][256];
+  LOCAL_VK u64a s_sbob_sl64[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -45,7 +46,7 @@ KERNEL_FQ void m11860_mxx (KERN_ATTR_RULES ())
     s_sbob_sl64[7][i] = sbob512_sl64[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -115,7 +116,7 @@ KERNEL_FQ void m11860_sxx (KERN_ATTR_RULES ())
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u64a s_sbob_sl64[8][256];
+  LOCAL_VK u64a s_sbob_sl64[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -129,7 +130,7 @@ KERNEL_FQ void m11860_sxx (KERN_ATTR_RULES ())
     s_sbob_sl64[7][i] = sbob512_sl64[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m11860_a1-pure.cl b/OpenCL/m11860_a1-pure.cl
index 272f8a61c..8a14bfa14 100644
--- a/OpenCL/m11860_a1-pure.cl
+++ b/OpenCL/m11860_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_streebog512.cl"
@@ -29,7 +30,7 @@ KERNEL_FQ void m11860_mxx (KERN_ATTR_BASIC ())
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u64a s_sbob_sl64[8][256];
+  LOCAL_VK u64a s_sbob_sl64[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -43,7 +44,7 @@ KERNEL_FQ void m11860_mxx (KERN_ATTR_BASIC ())
     s_sbob_sl64[7][i] = sbob512_sl64[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -138,7 +139,7 @@ KERNEL_FQ void m11860_sxx (KERN_ATTR_BASIC ())
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u64a s_sbob_sl64[8][256];
+  LOCAL_VK u64a s_sbob_sl64[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -152,7 +153,7 @@ KERNEL_FQ void m11860_sxx (KERN_ATTR_BASIC ())
     s_sbob_sl64[7][i] = sbob512_sl64[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m11860_a3-pure.cl b/OpenCL/m11860_a3-pure.cl
index 7c54204be..e8d57aff9 100644
--- a/OpenCL/m11860_a3-pure.cl
+++ b/OpenCL/m11860_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_streebog512.cl"
@@ -29,7 +30,7 @@ KERNEL_FQ void m11860_mxx (KERN_ATTR_VECTOR ())
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u64a s_sbob_sl64[8][256];
+  LOCAL_VK u64a s_sbob_sl64[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -43,7 +44,7 @@ KERNEL_FQ void m11860_mxx (KERN_ATTR_VECTOR ())
     s_sbob_sl64[7][i] = sbob512_sl64[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -124,7 +125,7 @@ KERNEL_FQ void m11860_sxx (KERN_ATTR_VECTOR ())
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u64a s_sbob_sl64[8][256];
+  LOCAL_VK u64a s_sbob_sl64[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -138,7 +139,7 @@ KERNEL_FQ void m11860_sxx (KERN_ATTR_VECTOR ())
     s_sbob_sl64[7][i] = sbob512_sl64[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m11900-pure.cl b/OpenCL/m11900-pure.cl
index b46cca1cb..4fdac642d 100644
--- a/OpenCL/m11900-pure.cl
+++ b/OpenCL/m11900-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
diff --git a/OpenCL/m12000-pure.cl b/OpenCL/m12000-pure.cl
index 43fc2ac54..4a2d4f176 100644
--- a/OpenCL/m12000-pure.cl
+++ b/OpenCL/m12000-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m12200-pure.cl b/OpenCL/m12200-pure.cl
index 3aee89aa0..2e73aa0db 100644
--- a/OpenCL/m12200-pure.cl
+++ b/OpenCL/m12200-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha512.cl"
diff --git a/OpenCL/m12300-pure.cl b/OpenCL/m12300-pure.cl
index 4b84dcab0..ae92645e7 100644
--- a/OpenCL/m12300-pure.cl
+++ b/OpenCL/m12300-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha512.cl"
diff --git a/OpenCL/m12400-pure.cl b/OpenCL/m12400-pure.cl
index d0116b0ac..7e913a518 100644
--- a/OpenCL/m12400-pure.cl
+++ b/OpenCL/m12400-pure.cl
@@ -6,6 +6,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #endif
 
@@ -59,7 +60,7 @@ typedef struct bsdicrypt_tmp
   PERM_OP (l, r, tt,  4, 0x0f0f0f0f);  \
 }
 
-CONSTANT_AS u32a c_SPtrans[8][64] =
+CONSTANT_VK u32a c_SPtrans[8][64] =
 {
   {
     0x00820200, 0x00020000, 0x80800000, 0x80820200,
@@ -207,7 +208,7 @@ CONSTANT_AS u32a c_SPtrans[8][64] =
   },
 };
 
-CONSTANT_AS u32a c_skb[8][64] =
+CONSTANT_VK u32a c_skb[8][64] =
 {
   {
     0x00000000, 0x00000010, 0x20000000, 0x20000010,
@@ -510,8 +511,8 @@ KERNEL_FQ void m12400_init (KERN_ATTR_TMPS (bsdicrypt_tmp_t))
    * sbox
    */
 
-  LOCAL_AS u32 s_SPtrans[8][64];
-  LOCAL_AS u32 s_skb[8][64];
+  LOCAL_VK u32 s_SPtrans[8][64];
+  LOCAL_VK u32 s_skb[8][64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
@@ -534,7 +535,7 @@ KERNEL_FQ void m12400_init (KERN_ATTR_TMPS (bsdicrypt_tmp_t))
     s_skb[7][i] = c_skb[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -646,8 +647,8 @@ KERNEL_FQ void m12400_loop (KERN_ATTR_TMPS (bsdicrypt_tmp_t))
    * sbox
    */
 
-  LOCAL_AS u32 s_SPtrans[8][64];
-  LOCAL_AS u32 s_skb[8][64];
+  LOCAL_VK u32 s_SPtrans[8][64];
+  LOCAL_VK u32 s_skb[8][64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
@@ -670,7 +671,7 @@ KERNEL_FQ void m12400_loop (KERN_ATTR_TMPS (bsdicrypt_tmp_t))
     s_skb[7][i] = c_skb[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m12500-pure.cl b/OpenCL/m12500-pure.cl
index 94d525cf8..dbcd97121 100644
--- a/OpenCL/m12500-pure.cl
+++ b/OpenCL/m12500-pure.cl
@@ -6,6 +6,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_hash_sha1.cl"
 #include "inc_cipher_aes.cl"
@@ -291,17 +292,17 @@ KERNEL_FQ void m12500_comp (KERN_ATTR_TMPS_ESALT (rar3_tmp_t, pbkdf2_sha1_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -318,7 +319,7 @@ KERNEL_FQ void m12500_comp (KERN_ATTR_TMPS_ESALT (rar3_tmp_t, pbkdf2_sha1_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m12600_a0-optimized.cl b/OpenCL/m12600_a0-optimized.cl
index 6106316ec..afc21b471 100644
--- a/OpenCL/m12600_a0-optimized.cl
+++ b/OpenCL/m12600_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
@@ -17,15 +18,15 @@
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_upper8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_upper8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_upper8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_upper8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_upper8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m12600_m04 (KERN_ATTR_RULES ())
@@ -42,7 +43,7 @@ KERNEL_FQ void m12600_m04 (KERN_ATTR_RULES ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -53,7 +54,7 @@ KERNEL_FQ void m12600_m04 (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'A' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -382,7 +383,7 @@ KERNEL_FQ void m12600_s04 (KERN_ATTR_RULES ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -393,7 +394,7 @@ KERNEL_FQ void m12600_s04 (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'A' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m12600_a0-pure.cl b/OpenCL/m12600_a0-pure.cl
index d2f8fb7b3..7222b668b 100644
--- a/OpenCL/m12600_a0-pure.cl
+++ b/OpenCL/m12600_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
@@ -17,15 +18,15 @@
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m12600_mxx (KERN_ATTR_RULES ())
@@ -42,7 +43,7 @@ KERNEL_FQ void m12600_mxx (KERN_ATTR_RULES ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -53,7 +54,7 @@ KERNEL_FQ void m12600_mxx (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'A' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -178,7 +179,7 @@ KERNEL_FQ void m12600_sxx (KERN_ATTR_RULES ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -189,7 +190,7 @@ KERNEL_FQ void m12600_sxx (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'A' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m12600_a1-optimized.cl b/OpenCL/m12600_a1-optimized.cl
index 728bc49a5..fe78a19cc 100644
--- a/OpenCL/m12600_a1-optimized.cl
+++ b/OpenCL/m12600_a1-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
@@ -15,15 +16,15 @@
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_upper8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_upper8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_upper8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_upper8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_upper8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m12600_m04 (KERN_ATTR_BASIC ())
@@ -40,7 +41,7 @@ KERNEL_FQ void m12600_m04 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -51,7 +52,7 @@ KERNEL_FQ void m12600_m04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'A' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -438,7 +439,7 @@ KERNEL_FQ void m12600_s04 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -449,7 +450,7 @@ KERNEL_FQ void m12600_s04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'A' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m12600_a1-pure.cl b/OpenCL/m12600_a1-pure.cl
index 6518aaca0..c861a06df 100644
--- a/OpenCL/m12600_a1-pure.cl
+++ b/OpenCL/m12600_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_sha1.cl"
@@ -15,15 +16,15 @@
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m12600_mxx (KERN_ATTR_BASIC ())
@@ -40,7 +41,7 @@ KERNEL_FQ void m12600_mxx (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -51,7 +52,7 @@ KERNEL_FQ void m12600_mxx (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'A' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -174,7 +175,7 @@ KERNEL_FQ void m12600_sxx (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -185,7 +186,7 @@ KERNEL_FQ void m12600_sxx (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'A' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m12600_a3-optimized.cl b/OpenCL/m12600_a3-optimized.cl
index e42a0754b..88310a835 100644
--- a/OpenCL/m12600_a3-optimized.cl
+++ b/OpenCL/m12600_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
@@ -15,15 +16,15 @@
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_upper8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_upper8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_upper8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_upper8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_upper8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 DECLSPEC void m12600m (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 pw_len, KERN_ATTR_BASIC (), LOCAL_AS u32 *l_bin2asc)
@@ -637,7 +638,7 @@ KERNEL_FQ void m12600_m04 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -648,7 +649,7 @@ KERNEL_FQ void m12600_m04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'A' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -707,7 +708,7 @@ KERNEL_FQ void m12600_m08 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -718,7 +719,7 @@ KERNEL_FQ void m12600_m08 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'A' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -777,7 +778,7 @@ KERNEL_FQ void m12600_m16 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -788,7 +789,7 @@ KERNEL_FQ void m12600_m16 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'A' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -847,7 +848,7 @@ KERNEL_FQ void m12600_s04 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -858,7 +859,7 @@ KERNEL_FQ void m12600_s04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'A' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -917,7 +918,7 @@ KERNEL_FQ void m12600_s08 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -928,7 +929,7 @@ KERNEL_FQ void m12600_s08 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'A' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -987,7 +988,7 @@ KERNEL_FQ void m12600_s16 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -998,7 +999,7 @@ KERNEL_FQ void m12600_s16 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'A' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m12600_a3-pure.cl b/OpenCL/m12600_a3-pure.cl
index d0e80b99f..880a9adc4 100644
--- a/OpenCL/m12600_a3-pure.cl
+++ b/OpenCL/m12600_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
@@ -15,15 +16,15 @@
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m12600_mxx (KERN_ATTR_VECTOR ())
@@ -40,7 +41,7 @@ KERNEL_FQ void m12600_mxx (KERN_ATTR_VECTOR ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -51,7 +52,7 @@ KERNEL_FQ void m12600_mxx (KERN_ATTR_VECTOR ())
                  | ((i1 < 10) ? '0' + i1 : 'A' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -187,7 +188,7 @@ KERNEL_FQ void m12600_sxx (KERN_ATTR_VECTOR ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -198,7 +199,7 @@ KERNEL_FQ void m12600_sxx (KERN_ATTR_VECTOR ())
                  | ((i1 < 10) ? '0' + i1 : 'A' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m12700-pure.cl b/OpenCL/m12700-pure.cl
index 3d2638062..8fc00d476 100644
--- a/OpenCL/m12700-pure.cl
+++ b/OpenCL/m12700-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
@@ -249,17 +250,17 @@ KERNEL_FQ void m12700_comp (KERN_ATTR_TMPS (mywallet_tmp_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -276,7 +277,7 @@ KERNEL_FQ void m12700_comp (KERN_ATTR_TMPS (mywallet_tmp_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m12800-pure.cl b/OpenCL/m12800-pure.cl
index 0c32d564f..702932163 100644
--- a/OpenCL/m12800-pure.cl
+++ b/OpenCL/m12800-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md4.cl"
@@ -89,7 +90,7 @@ KERNEL_FQ void m12800_init (KERN_ATTR_TMPS_ESALT (pbkdf2_sha256_tmp_t, pbkdf2_sh
    * lookup ascii table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -100,7 +101,7 @@ KERNEL_FQ void m12800_init (KERN_ATTR_TMPS_ESALT (pbkdf2_sha256_tmp_t, pbkdf2_sh
                  | ((i1 < 10) ? '0' + i1 : 'A' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m12900-pure.cl b/OpenCL/m12900-pure.cl
index 095eaae52..b575ad561 100644
--- a/OpenCL/m12900-pure.cl
+++ b/OpenCL/m12900-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha256.cl"
diff --git a/OpenCL/m13000-pure.cl b/OpenCL/m13000-pure.cl
index a9b56c1c4..993fdba1d 100644
--- a/OpenCL/m13000-pure.cl
+++ b/OpenCL/m13000-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha256.cl"
diff --git a/OpenCL/m13100_a0-optimized.cl b/OpenCL/m13100_a0-optimized.cl
index a42ff481e..b9465f730 100644
--- a/OpenCL/m13100_a0-optimized.cl
+++ b/OpenCL/m13100_a0-optimized.cl
@@ -9,6 +9,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
@@ -609,7 +610,7 @@ KERNEL_FQ void m13100_m04 (KERN_ATTR_RULES_ESALT (krb5tgs_t))
    * shared
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   LOCAL_AS RC4_KEY *rc4_key = &rc4_keys[lid];
 
@@ -707,7 +708,7 @@ KERNEL_FQ void m13100_s04 (KERN_ATTR_RULES_ESALT (krb5tgs_t))
    * shared
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   LOCAL_AS RC4_KEY *rc4_key = &rc4_keys[lid];
 
diff --git a/OpenCL/m13100_a0-pure.cl b/OpenCL/m13100_a0-pure.cl
index 77e57e970..7a41226ef 100644
--- a/OpenCL/m13100_a0-pure.cl
+++ b/OpenCL/m13100_a0-pure.cl
@@ -9,6 +9,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
@@ -403,7 +404,7 @@ KERNEL_FQ void m13100_mxx (KERN_ATTR_RULES_ESALT (krb5tgs_t))
 
   COPY_PW (pws[gid]);
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   LOCAL_AS RC4_KEY *rc4_key = &rc4_keys[lid];
 
@@ -465,7 +466,7 @@ KERNEL_FQ void m13100_sxx (KERN_ATTR_RULES_ESALT (krb5tgs_t))
 
   COPY_PW (pws[gid]);
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   LOCAL_AS RC4_KEY *rc4_key = &rc4_keys[lid];
 
diff --git a/OpenCL/m13100_a1-optimized.cl b/OpenCL/m13100_a1-optimized.cl
index c153b1033..a548a7bf1 100644
--- a/OpenCL/m13100_a1-optimized.cl
+++ b/OpenCL/m13100_a1-optimized.cl
@@ -9,6 +9,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md4.cl"
@@ -606,7 +607,7 @@ KERNEL_FQ void m13100_m04 (KERN_ATTR_ESALT (krb5tgs_t))
    * shared
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   LOCAL_AS RC4_KEY *rc4_key = &rc4_keys[lid];
 
@@ -753,7 +754,7 @@ KERNEL_FQ void m13100_s04 (KERN_ATTR_ESALT (krb5tgs_t))
    * shared
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   LOCAL_AS RC4_KEY *rc4_key = &rc4_keys[lid];
 
diff --git a/OpenCL/m13100_a1-pure.cl b/OpenCL/m13100_a1-pure.cl
index c84fab54e..a8d0098b3 100644
--- a/OpenCL/m13100_a1-pure.cl
+++ b/OpenCL/m13100_a1-pure.cl
@@ -9,6 +9,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_hash_md4.cl"
 #include "inc_hash_md5.cl"
@@ -399,7 +400,7 @@ KERNEL_FQ void m13100_mxx (KERN_ATTR_ESALT (krb5tgs_t))
    * base
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   LOCAL_AS RC4_KEY *rc4_key = &rc4_keys[lid];
 
@@ -459,7 +460,7 @@ KERNEL_FQ void m13100_sxx (KERN_ATTR_ESALT (krb5tgs_t))
    * base
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   LOCAL_AS RC4_KEY *rc4_key = &rc4_keys[lid];
 
diff --git a/OpenCL/m13100_a3-optimized.cl b/OpenCL/m13100_a3-optimized.cl
index 81b94bf9b..6079988b0 100644
--- a/OpenCL/m13100_a3-optimized.cl
+++ b/OpenCL/m13100_a3-optimized.cl
@@ -9,6 +9,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md4.cl"
@@ -676,7 +677,7 @@ KERNEL_FQ void m13100_m04 (KERN_ATTR_ESALT (krb5tgs_t))
    * main
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   LOCAL_AS RC4_KEY *rc4_key = &rc4_keys[lid];
 
@@ -728,7 +729,7 @@ KERNEL_FQ void m13100_m08 (KERN_ATTR_ESALT (krb5tgs_t))
    * main
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   LOCAL_AS RC4_KEY *rc4_key = &rc4_keys[lid];
 
@@ -784,7 +785,7 @@ KERNEL_FQ void m13100_s04 (KERN_ATTR_ESALT (krb5tgs_t))
    * main
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   LOCAL_AS RC4_KEY *rc4_key = &rc4_keys[lid];
 
@@ -836,7 +837,7 @@ KERNEL_FQ void m13100_s08 (KERN_ATTR_ESALT (krb5tgs_t))
    * main
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   LOCAL_AS RC4_KEY *rc4_key = &rc4_keys[lid];
 
diff --git a/OpenCL/m13100_a3-pure.cl b/OpenCL/m13100_a3-pure.cl
index a6be7319a..25e60e0c5 100644
--- a/OpenCL/m13100_a3-pure.cl
+++ b/OpenCL/m13100_a3-pure.cl
@@ -9,6 +9,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_hash_md4.cl"
 #include "inc_hash_md5.cl"
@@ -408,7 +409,7 @@ KERNEL_FQ void m13100_mxx (KERN_ATTR_VECTOR_ESALT (krb5tgs_t))
     w[idx] = pws[gid].i[idx];
   }
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   LOCAL_AS RC4_KEY *rc4_key = &rc4_keys[lid];
 
@@ -481,7 +482,7 @@ KERNEL_FQ void m13100_sxx (KERN_ATTR_VECTOR_ESALT (krb5tgs_t))
     w[idx] = pws[gid].i[idx];
   }
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   LOCAL_AS RC4_KEY *rc4_key = &rc4_keys[lid];
 
diff --git a/OpenCL/m13200-pure.cl b/OpenCL/m13200-pure.cl
index 41f912b0b..68e7d6048 100644
--- a/OpenCL/m13200-pure.cl
+++ b/OpenCL/m13200-pure.cl
@@ -6,6 +6,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_hash_sha1.cl"
 #include "inc_cipher_aes.cl"
@@ -85,17 +86,17 @@ KERNEL_FQ void m13200_loop (KERN_ATTR_TMPS (axcrypt_tmp_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -112,7 +113,7 @@ KERNEL_FQ void m13200_loop (KERN_ATTR_TMPS (axcrypt_tmp_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m13300_a0-optimized.cl b/OpenCL/m13300_a0-optimized.cl
index 42cca5d27..8fb4398ad 100644
--- a/OpenCL/m13300_a0-optimized.cl
+++ b/OpenCL/m13300_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
diff --git a/OpenCL/m13300_a0-pure.cl b/OpenCL/m13300_a0-pure.cl
index 9a35265d6..7e91986fa 100644
--- a/OpenCL/m13300_a0-pure.cl
+++ b/OpenCL/m13300_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
diff --git a/OpenCL/m13300_a1-optimized.cl b/OpenCL/m13300_a1-optimized.cl
index 547119476..450a66cf2 100644
--- a/OpenCL/m13300_a1-optimized.cl
+++ b/OpenCL/m13300_a1-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m13300_a1-pure.cl b/OpenCL/m13300_a1-pure.cl
index a8e24f994..106027d74 100644
--- a/OpenCL/m13300_a1-pure.cl
+++ b/OpenCL/m13300_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m13300_a3-optimized.cl b/OpenCL/m13300_a3-optimized.cl
index 4e748dbcd..2b3f91812 100644
--- a/OpenCL/m13300_a3-optimized.cl
+++ b/OpenCL/m13300_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m13300_a3-pure.cl b/OpenCL/m13300_a3-pure.cl
index 4657b5eb0..924c4165a 100644
--- a/OpenCL/m13300_a3-pure.cl
+++ b/OpenCL/m13300_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m13400-pure.cl b/OpenCL/m13400-pure.cl
index d77ecb680..1ad66422f 100644
--- a/OpenCL/m13400-pure.cl
+++ b/OpenCL/m13400-pure.cl
@@ -6,6 +6,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_hash_sha256.cl"
 #include "inc_cipher_aes.cl"
@@ -175,11 +176,11 @@ KERNEL_FQ void m13400_loop (KERN_ATTR_TMPS_ESALT (keepass_tmp_t, keepass_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -190,7 +191,7 @@ KERNEL_FQ void m13400_loop (KERN_ATTR_TMPS_ESALT (keepass_tmp_t, keepass_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -263,17 +264,17 @@ KERNEL_FQ void m13400_comp (KERN_ATTR_TMPS_ESALT (keepass_tmp_t, keepass_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -290,7 +291,7 @@ KERNEL_FQ void m13400_comp (KERN_ATTR_TMPS_ESALT (keepass_tmp_t, keepass_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m13500_a0-optimized.cl b/OpenCL/m13500_a0-optimized.cl
index bd3268abf..11f1c1805 100644
--- a/OpenCL/m13500_a0-optimized.cl
+++ b/OpenCL/m13500_a0-optimized.cl
@@ -9,6 +9,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
diff --git a/OpenCL/m13500_a0-pure.cl b/OpenCL/m13500_a0-pure.cl
index 4c2d6e80c..b8102e111 100644
--- a/OpenCL/m13500_a0-pure.cl
+++ b/OpenCL/m13500_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
diff --git a/OpenCL/m13500_a1-optimized.cl b/OpenCL/m13500_a1-optimized.cl
index ddaa67441..1df3d887a 100644
--- a/OpenCL/m13500_a1-optimized.cl
+++ b/OpenCL/m13500_a1-optimized.cl
@@ -9,6 +9,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m13500_a1-pure.cl b/OpenCL/m13500_a1-pure.cl
index 0e9bc3475..8443f38ca 100644
--- a/OpenCL/m13500_a1-pure.cl
+++ b/OpenCL/m13500_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m13500_a3-optimized.cl b/OpenCL/m13500_a3-optimized.cl
index 7ab0f7f89..ce33398f5 100644
--- a/OpenCL/m13500_a3-optimized.cl
+++ b/OpenCL/m13500_a3-optimized.cl
@@ -9,6 +9,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m13500_a3-pure.cl b/OpenCL/m13500_a3-pure.cl
index 8c0cf3d5d..19865585b 100644
--- a/OpenCL/m13500_a3-pure.cl
+++ b/OpenCL/m13500_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m13600-pure.cl b/OpenCL/m13600-pure.cl
index 4846a3c27..944215e82 100644
--- a/OpenCL/m13600-pure.cl
+++ b/OpenCL/m13600-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m13711-pure.cl b/OpenCL/m13711-pure.cl
index d3f29166e..a74b318a6 100644
--- a/OpenCL/m13711-pure.cl
+++ b/OpenCL/m13711-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_ripemd160.cl"
@@ -76,11 +77,11 @@ DECLSPEC int check_header_0512 (GLOBAL_AS const vc_t *esalt_bufs, GLOBAL_AS u32
   key2[6] = key[14];
   key2[7] = key[15];
 
-  if (verify_header_aes        (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
   if (verify_header_serpent    (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2) == 1) return 0;
   if (verify_header_twofish    (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2) == 1) return 0;
   if (verify_header_camellia   (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2) == 1) return 0;
   if (verify_header_kuznyechik (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2) == 1) return 0;
+  if (verify_header_aes        (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
 
   return -1;
 }
@@ -133,14 +134,14 @@ KERNEL_FQ void m13711_init (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
 
   const int keyboard_layout_mapping_cnt = esalt_bufs[digests_offset].keyboard_layout_mapping_cnt;
 
-  LOCAL_AS keyboard_layout_mapping_t s_keyboard_layout_mapping_buf[256];
+  LOCAL_VK keyboard_layout_mapping_t s_keyboard_layout_mapping_buf[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
     s_keyboard_layout_mapping_buf[i] = esalt_bufs[digests_offset].keyboard_layout_mapping_buf[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -260,17 +261,17 @@ KERNEL_FQ void m13711_loop (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -287,7 +288,7 @@ KERNEL_FQ void m13711_loop (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -444,17 +445,17 @@ KERNEL_FQ void m13711_comp (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -471,7 +472,7 @@ KERNEL_FQ void m13711_comp (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m13712-pure.cl b/OpenCL/m13712-pure.cl
index 67b09a3e8..d754ae7df 100644
--- a/OpenCL/m13712-pure.cl
+++ b/OpenCL/m13712-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_ripemd160.cl"
@@ -76,11 +77,11 @@ DECLSPEC int check_header_0512 (GLOBAL_AS const vc_t *esalt_bufs, GLOBAL_AS u32
   key2[6] = key[14];
   key2[7] = key[15];
 
-  if (verify_header_aes        (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
   if (verify_header_serpent    (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2) == 1) return 0;
   if (verify_header_twofish    (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2) == 1) return 0;
   if (verify_header_camellia   (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2) == 1) return 0;
   if (verify_header_kuznyechik (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2) == 1) return 0;
+  if (verify_header_aes        (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
 
   return -1;
 }
@@ -125,13 +126,13 @@ DECLSPEC int check_header_1024 (GLOBAL_AS const vc_t *esalt_bufs, GLOBAL_AS u32
   key4[6] = key[30];
   key4[7] = key[31];
 
-  if (verify_header_aes_twofish         (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
   if (verify_header_serpent_aes         (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
   if (verify_header_twofish_serpent     (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4) == 1) return 0;
+  if (verify_header_aes_twofish         (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
   if (verify_header_camellia_kuznyechik (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4) == 1) return 0;
   if (verify_header_camellia_serpent    (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4) == 1) return 0;
-  if (verify_header_kuznyechik_aes      (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
   if (verify_header_kuznyechik_twofish  (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4) == 1) return 0;
+  if (verify_header_kuznyechik_aes      (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
 
   return -1;
 }
@@ -184,14 +185,14 @@ KERNEL_FQ void m13712_init (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
 
   const int keyboard_layout_mapping_cnt = esalt_bufs[digests_offset].keyboard_layout_mapping_cnt;
 
-  LOCAL_AS keyboard_layout_mapping_t s_keyboard_layout_mapping_buf[256];
+  LOCAL_VK keyboard_layout_mapping_t s_keyboard_layout_mapping_buf[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
     s_keyboard_layout_mapping_buf[i] = esalt_bufs[digests_offset].keyboard_layout_mapping_buf[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -311,17 +312,17 @@ KERNEL_FQ void m13712_loop (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -338,7 +339,7 @@ KERNEL_FQ void m13712_loop (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -496,17 +497,17 @@ KERNEL_FQ void m13712_comp (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -523,7 +524,7 @@ KERNEL_FQ void m13712_comp (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m13713-pure.cl b/OpenCL/m13713-pure.cl
index d6ea2a0e6..db6dc593f 100644
--- a/OpenCL/m13713-pure.cl
+++ b/OpenCL/m13713-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_ripemd160.cl"
@@ -76,11 +77,11 @@ DECLSPEC int check_header_0512 (GLOBAL_AS const vc_t *esalt_bufs, GLOBAL_AS u32
   key2[6] = key[14];
   key2[7] = key[15];
 
-  if (verify_header_aes        (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
   if (verify_header_serpent    (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2) == 1) return 0;
   if (verify_header_twofish    (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2) == 1) return 0;
   if (verify_header_camellia   (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2) == 1) return 0;
   if (verify_header_kuznyechik (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2) == 1) return 0;
+  if (verify_header_aes        (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
 
   return -1;
 }
@@ -125,13 +126,13 @@ DECLSPEC int check_header_1024 (GLOBAL_AS const vc_t *esalt_bufs, GLOBAL_AS u32
   key4[6] = key[30];
   key4[7] = key[31];
 
-  if (verify_header_aes_twofish         (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
   if (verify_header_serpent_aes         (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
   if (verify_header_twofish_serpent     (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4) == 1) return 0;
+  if (verify_header_aes_twofish         (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
   if (verify_header_camellia_kuznyechik (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4) == 1) return 0;
   if (verify_header_camellia_serpent    (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4) == 1) return 0;
-  if (verify_header_kuznyechik_aes      (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
   if (verify_header_kuznyechik_twofish  (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4) == 1) return 0;
+  if (verify_header_kuznyechik_aes      (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
 
   return -1;
 }
@@ -194,9 +195,9 @@ DECLSPEC int check_header_1536 (GLOBAL_AS const vc_t *esalt_bufs, GLOBAL_AS u32
   key6[6] = key[46];
   key6[7] = key[47];
 
-  if (verify_header_aes_twofish_serpent         (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, key5, key6, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
   if (verify_header_serpent_twofish_aes         (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, key5, key6, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
   if (verify_header_kuznyechik_serpent_camellia (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, key5, key6) == 1) return 0;
+  if (verify_header_aes_twofish_serpent         (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, key5, key6, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
 
   return -1;
 }
@@ -249,14 +250,14 @@ KERNEL_FQ void m13713_init (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
 
   const int keyboard_layout_mapping_cnt = esalt_bufs[digests_offset].keyboard_layout_mapping_cnt;
 
-  LOCAL_AS keyboard_layout_mapping_t s_keyboard_layout_mapping_buf[256];
+  LOCAL_VK keyboard_layout_mapping_t s_keyboard_layout_mapping_buf[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
     s_keyboard_layout_mapping_buf[i] = esalt_bufs[digests_offset].keyboard_layout_mapping_buf[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -376,17 +377,17 @@ KERNEL_FQ void m13713_loop (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -403,7 +404,7 @@ KERNEL_FQ void m13713_loop (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -562,17 +563,17 @@ KERNEL_FQ void m13713_comp (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -589,7 +590,7 @@ KERNEL_FQ void m13713_comp (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m13721-pure.cl b/OpenCL/m13721-pure.cl
index 55f3df252..3dbabb69d 100644
--- a/OpenCL/m13721-pure.cl
+++ b/OpenCL/m13721-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha512.cl"
@@ -76,11 +77,11 @@ DECLSPEC int check_header_0512 (GLOBAL_AS const vc_t *esalt_bufs, GLOBAL_AS u64
   key2[6] = hc_swap32_S (h32_from_64_S (key[7]));
   key2[7] = hc_swap32_S (l32_from_64_S (key[7]));
 
-  if (verify_header_aes        (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
   if (verify_header_serpent    (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2) == 1) return 0;
   if (verify_header_twofish    (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2) == 1) return 0;
   if (verify_header_camellia   (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2) == 1) return 0;
   if (verify_header_kuznyechik (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2) == 1) return 0;
+  if (verify_header_aes        (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
 
   return -1;
 }
@@ -155,14 +156,14 @@ KERNEL_FQ void m13721_init (KERN_ATTR_TMPS_ESALT (vc64_tmp_t, vc_t))
 
   const int keyboard_layout_mapping_cnt = esalt_bufs[digests_offset].keyboard_layout_mapping_cnt;
 
-  LOCAL_AS keyboard_layout_mapping_t s_keyboard_layout_mapping_buf[256];
+  LOCAL_VK keyboard_layout_mapping_t s_keyboard_layout_mapping_buf[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
     s_keyboard_layout_mapping_buf[i] = esalt_bufs[digests_offset].keyboard_layout_mapping_buf[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -363,17 +364,17 @@ KERNEL_FQ void m13721_loop (KERN_ATTR_TMPS_ESALT (vc64_tmp_t, vc_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -390,7 +391,7 @@ KERNEL_FQ void m13721_loop (KERN_ATTR_TMPS_ESALT (vc64_tmp_t, vc_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -591,17 +592,17 @@ KERNEL_FQ void m13721_comp (KERN_ATTR_TMPS_ESALT (vc64_tmp_t, vc_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -618,7 +619,7 @@ KERNEL_FQ void m13721_comp (KERN_ATTR_TMPS_ESALT (vc64_tmp_t, vc_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m13722-pure.cl b/OpenCL/m13722-pure.cl
index 2e944cb50..9b74f8d41 100644
--- a/OpenCL/m13722-pure.cl
+++ b/OpenCL/m13722-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha512.cl"
@@ -76,11 +77,11 @@ DECLSPEC int check_header_0512 (GLOBAL_AS const vc_t *esalt_bufs, GLOBAL_AS u64
   key2[6] = hc_swap32_S (h32_from_64_S (key[7]));
   key2[7] = hc_swap32_S (l32_from_64_S (key[7]));
 
-  if (verify_header_aes        (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
   if (verify_header_serpent    (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2) == 1) return 0;
   if (verify_header_twofish    (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2) == 1) return 0;
   if (verify_header_camellia   (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2) == 1) return 0;
   if (verify_header_kuznyechik (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2) == 1) return 0;
+  if (verify_header_aes        (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
 
   return -1;
 }
@@ -125,13 +126,13 @@ DECLSPEC int check_header_1024 (GLOBAL_AS const vc_t *esalt_bufs, GLOBAL_AS u64
   key4[6] = hc_swap32_S (h32_from_64_S (key[15]));
   key4[7] = hc_swap32_S (l32_from_64_S (key[15]));
 
-  if (verify_header_aes_twofish         (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
   if (verify_header_serpent_aes         (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
   if (verify_header_twofish_serpent     (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4) == 1) return 0;
+  if (verify_header_aes_twofish         (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
   if (verify_header_camellia_kuznyechik (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4) == 1) return 0;
   if (verify_header_camellia_serpent    (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4) == 1) return 0;
-  if (verify_header_kuznyechik_aes      (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
   if (verify_header_kuznyechik_twofish  (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4) == 1) return 0;
+  if (verify_header_kuznyechik_aes      (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
 
   return -1;
 }
@@ -206,14 +207,14 @@ KERNEL_FQ void m13722_init (KERN_ATTR_TMPS_ESALT (vc64_tmp_t, vc_t))
 
   const int keyboard_layout_mapping_cnt = esalt_bufs[digests_offset].keyboard_layout_mapping_cnt;
 
-  LOCAL_AS keyboard_layout_mapping_t s_keyboard_layout_mapping_buf[256];
+  LOCAL_VK keyboard_layout_mapping_t s_keyboard_layout_mapping_buf[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
     s_keyboard_layout_mapping_buf[i] = esalt_bufs[digests_offset].keyboard_layout_mapping_buf[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -414,17 +415,17 @@ KERNEL_FQ void m13722_loop (KERN_ATTR_TMPS_ESALT (vc64_tmp_t, vc_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -441,7 +442,7 @@ KERNEL_FQ void m13722_loop (KERN_ATTR_TMPS_ESALT (vc64_tmp_t, vc_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -643,17 +644,17 @@ KERNEL_FQ void m13722_comp (KERN_ATTR_TMPS_ESALT (vc64_tmp_t, vc_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -670,7 +671,7 @@ KERNEL_FQ void m13722_comp (KERN_ATTR_TMPS_ESALT (vc64_tmp_t, vc_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m13723-pure.cl b/OpenCL/m13723-pure.cl
index c1dd74f80..ec722c387 100644
--- a/OpenCL/m13723-pure.cl
+++ b/OpenCL/m13723-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha512.cl"
@@ -76,11 +77,11 @@ DECLSPEC int check_header_0512 (GLOBAL_AS const vc_t *esalt_bufs, GLOBAL_AS u64
   key2[6] = hc_swap32_S (h32_from_64_S (key[7]));
   key2[7] = hc_swap32_S (l32_from_64_S (key[7]));
 
-  if (verify_header_aes        (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
   if (verify_header_serpent    (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2) == 1) return 0;
   if (verify_header_twofish    (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2) == 1) return 0;
   if (verify_header_camellia   (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2) == 1) return 0;
   if (verify_header_kuznyechik (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2) == 1) return 0;
+  if (verify_header_aes        (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
 
   return -1;
 }
@@ -125,13 +126,13 @@ DECLSPEC int check_header_1024 (GLOBAL_AS const vc_t *esalt_bufs, GLOBAL_AS u64
   key4[6] = hc_swap32_S (h32_from_64_S (key[15]));
   key4[7] = hc_swap32_S (l32_from_64_S (key[15]));
 
-  if (verify_header_aes_twofish         (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
   if (verify_header_serpent_aes         (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
   if (verify_header_twofish_serpent     (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4) == 1) return 0;
+  if (verify_header_aes_twofish         (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
   if (verify_header_camellia_kuznyechik (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4) == 1) return 0;
   if (verify_header_camellia_serpent    (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4) == 1) return 0;
-  if (verify_header_kuznyechik_aes      (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
   if (verify_header_kuznyechik_twofish  (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4) == 1) return 0;
+  if (verify_header_kuznyechik_aes      (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
 
   return -1;
 }
@@ -194,9 +195,9 @@ DECLSPEC int check_header_1536 (GLOBAL_AS const vc_t *esalt_bufs, GLOBAL_AS u64
   key6[6] = hc_swap32_S (h32_from_64_S (key[23]));
   key6[7] = hc_swap32_S (l32_from_64_S (key[23]));
 
-  if (verify_header_aes_twofish_serpent         (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, key5, key6, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
   if (verify_header_serpent_twofish_aes         (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, key5, key6, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
   if (verify_header_kuznyechik_serpent_camellia (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, key5, key6) == 1) return 0;
+  if (verify_header_aes_twofish_serpent         (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, key5, key6, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
 
   return -1;
 }
@@ -271,14 +272,14 @@ KERNEL_FQ void m13723_init (KERN_ATTR_TMPS_ESALT (vc64_tmp_t, vc_t))
 
   const int keyboard_layout_mapping_cnt = esalt_bufs[digests_offset].keyboard_layout_mapping_cnt;
 
-  LOCAL_AS keyboard_layout_mapping_t s_keyboard_layout_mapping_buf[256];
+  LOCAL_VK keyboard_layout_mapping_t s_keyboard_layout_mapping_buf[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
     s_keyboard_layout_mapping_buf[i] = esalt_bufs[digests_offset].keyboard_layout_mapping_buf[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -479,17 +480,17 @@ KERNEL_FQ void m13723_loop (KERN_ATTR_TMPS_ESALT (vc64_tmp_t, vc_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -506,7 +507,7 @@ KERNEL_FQ void m13723_loop (KERN_ATTR_TMPS_ESALT (vc64_tmp_t, vc_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -709,17 +710,17 @@ KERNEL_FQ void m13723_comp (KERN_ATTR_TMPS_ESALT (vc64_tmp_t, vc_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -736,7 +737,7 @@ KERNEL_FQ void m13723_comp (KERN_ATTR_TMPS_ESALT (vc64_tmp_t, vc_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m13731-pure.cl b/OpenCL/m13731-pure.cl
index 24d9cd1de..b729f07a5 100644
--- a/OpenCL/m13731-pure.cl
+++ b/OpenCL/m13731-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_whirlpool.cl"
@@ -76,11 +77,11 @@ DECLSPEC int check_header_0512 (GLOBAL_AS const vc_t *esalt_bufs, GLOBAL_AS u32
   key2[6] = hc_swap32_S (key[14]);
   key2[7] = hc_swap32_S (key[15]);
 
-  if (verify_header_aes        (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
   if (verify_header_serpent    (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2) == 1) return 0;
   if (verify_header_twofish    (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2) == 1) return 0;
   if (verify_header_camellia   (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2) == 1) return 0;
   if (verify_header_kuznyechik (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2) == 1) return 0;
+  if (verify_header_aes        (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
 
   return -1;
 }
@@ -193,14 +194,14 @@ KERNEL_FQ void m13731_init (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
 
   const int keyboard_layout_mapping_cnt = esalt_bufs[digests_offset].keyboard_layout_mapping_cnt;
 
-  LOCAL_AS keyboard_layout_mapping_t s_keyboard_layout_mapping_buf[256];
+  LOCAL_VK keyboard_layout_mapping_t s_keyboard_layout_mapping_buf[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
     s_keyboard_layout_mapping_buf[i] = esalt_bufs[digests_offset].keyboard_layout_mapping_buf[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   /**
    * Whirlpool shared
@@ -208,8 +209,8 @@ KERNEL_FQ void m13731_init (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_Ch[8][256];
-  LOCAL_AS u32 s_Cl[8][256];
+  LOCAL_VK u32 s_Ch[8][256];
+  LOCAL_VK u32 s_Cl[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -232,7 +233,7 @@ KERNEL_FQ void m13731_init (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
     s_Cl[7][i] = Cl[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -420,17 +421,17 @@ KERNEL_FQ void m13731_loop (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -447,7 +448,7 @@ KERNEL_FQ void m13731_loop (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -471,8 +472,8 @@ KERNEL_FQ void m13731_loop (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_Ch[8][256];
-  LOCAL_AS u32 s_Cl[8][256];
+  LOCAL_VK u32 s_Ch[8][256];
+  LOCAL_VK u32 s_Cl[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -495,7 +496,7 @@ KERNEL_FQ void m13731_loop (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
     s_Cl[7][i] = Cl[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -731,17 +732,17 @@ KERNEL_FQ void m13731_comp (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -758,7 +759,7 @@ KERNEL_FQ void m13731_comp (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -782,8 +783,8 @@ KERNEL_FQ void m13731_comp (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_Ch[8][256];
-  LOCAL_AS u32 s_Cl[8][256];
+  LOCAL_VK u32 s_Ch[8][256];
+  LOCAL_VK u32 s_Cl[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -806,7 +807,7 @@ KERNEL_FQ void m13731_comp (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
     s_Cl[7][i] = Cl[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m13732-pure.cl b/OpenCL/m13732-pure.cl
index a5e7ee9b9..057efa707 100644
--- a/OpenCL/m13732-pure.cl
+++ b/OpenCL/m13732-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_whirlpool.cl"
@@ -76,11 +77,11 @@ DECLSPEC int check_header_0512 (GLOBAL_AS const vc_t *esalt_bufs, GLOBAL_AS u32
   key2[6] = hc_swap32_S (key[14]);
   key2[7] = hc_swap32_S (key[15]);
 
-  if (verify_header_aes        (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
   if (verify_header_serpent    (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2) == 1) return 0;
   if (verify_header_twofish    (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2) == 1) return 0;
   if (verify_header_camellia   (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2) == 1) return 0;
   if (verify_header_kuznyechik (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2) == 1) return 0;
+  if (verify_header_aes        (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
 
   return -1;
 }
@@ -125,13 +126,13 @@ DECLSPEC int check_header_1024 (GLOBAL_AS const vc_t *esalt_bufs, GLOBAL_AS u32
   key4[6] = hc_swap32_S (key[30]);
   key4[7] = hc_swap32_S (key[31]);
 
-  if (verify_header_aes_twofish         (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
   if (verify_header_serpent_aes         (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
   if (verify_header_twofish_serpent     (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4) == 1) return 0;
+  if (verify_header_aes_twofish         (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
   if (verify_header_camellia_kuznyechik (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4) == 1) return 0;
   if (verify_header_camellia_serpent    (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4) == 1) return 0;
-  if (verify_header_kuznyechik_aes      (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
   if (verify_header_kuznyechik_twofish  (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4) == 1) return 0;
+  if (verify_header_kuznyechik_aes      (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
 
   return -1;
 }
@@ -244,14 +245,14 @@ KERNEL_FQ void m13732_init (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
 
   const int keyboard_layout_mapping_cnt = esalt_bufs[digests_offset].keyboard_layout_mapping_cnt;
 
-  LOCAL_AS keyboard_layout_mapping_t s_keyboard_layout_mapping_buf[256];
+  LOCAL_VK keyboard_layout_mapping_t s_keyboard_layout_mapping_buf[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
     s_keyboard_layout_mapping_buf[i] = esalt_bufs[digests_offset].keyboard_layout_mapping_buf[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   /**
    * Whirlpool shared
@@ -259,8 +260,8 @@ KERNEL_FQ void m13732_init (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_Ch[8][256];
-  LOCAL_AS u32 s_Cl[8][256];
+  LOCAL_VK u32 s_Ch[8][256];
+  LOCAL_VK u32 s_Cl[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -283,7 +284,7 @@ KERNEL_FQ void m13732_init (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
     s_Cl[7][i] = Cl[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -471,17 +472,17 @@ KERNEL_FQ void m13732_loop (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -498,7 +499,7 @@ KERNEL_FQ void m13732_loop (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -522,8 +523,8 @@ KERNEL_FQ void m13732_loop (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_Ch[8][256];
-  LOCAL_AS u32 s_Cl[8][256];
+  LOCAL_VK u32 s_Ch[8][256];
+  LOCAL_VK u32 s_Cl[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -546,7 +547,7 @@ KERNEL_FQ void m13732_loop (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
     s_Cl[7][i] = Cl[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -783,17 +784,17 @@ KERNEL_FQ void m13732_comp (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -810,7 +811,7 @@ KERNEL_FQ void m13732_comp (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -834,8 +835,8 @@ KERNEL_FQ void m13732_comp (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_Ch[8][256];
-  LOCAL_AS u32 s_Cl[8][256];
+  LOCAL_VK u32 s_Ch[8][256];
+  LOCAL_VK u32 s_Cl[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -858,7 +859,7 @@ KERNEL_FQ void m13732_comp (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
     s_Cl[7][i] = Cl[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m13733-pure.cl b/OpenCL/m13733-pure.cl
index 38f8060e3..79c8767cd 100644
--- a/OpenCL/m13733-pure.cl
+++ b/OpenCL/m13733-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_whirlpool.cl"
@@ -76,11 +77,11 @@ DECLSPEC int check_header_0512 (GLOBAL_AS const vc_t *esalt_bufs, GLOBAL_AS u32
   key2[6] = hc_swap32_S (key[14]);
   key2[7] = hc_swap32_S (key[15]);
 
-  if (verify_header_aes        (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
   if (verify_header_serpent    (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2) == 1) return 0;
   if (verify_header_twofish    (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2) == 1) return 0;
   if (verify_header_camellia   (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2) == 1) return 0;
   if (verify_header_kuznyechik (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2) == 1) return 0;
+  if (verify_header_aes        (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
 
   return -1;
 }
@@ -125,13 +126,13 @@ DECLSPEC int check_header_1024 (GLOBAL_AS const vc_t *esalt_bufs, GLOBAL_AS u32
   key4[6] = hc_swap32_S (key[30]);
   key4[7] = hc_swap32_S (key[31]);
 
-  if (verify_header_aes_twofish         (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
   if (verify_header_serpent_aes         (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
   if (verify_header_twofish_serpent     (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4) == 1) return 0;
+  if (verify_header_aes_twofish         (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
   if (verify_header_camellia_kuznyechik (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4) == 1) return 0;
   if (verify_header_camellia_serpent    (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4) == 1) return 0;
-  if (verify_header_kuznyechik_aes      (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
   if (verify_header_kuznyechik_twofish  (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4) == 1) return 0;
+  if (verify_header_kuznyechik_aes      (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
 
   return -1;
 }
@@ -194,9 +195,9 @@ DECLSPEC int check_header_1536 (GLOBAL_AS const vc_t *esalt_bufs, GLOBAL_AS u32
   key6[6] = hc_swap32_S (key[46]);
   key6[7] = hc_swap32_S (key[47]);
 
-  if (verify_header_aes_twofish_serpent         (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, key5, key6, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
   if (verify_header_serpent_twofish_aes         (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, key5, key6, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
   if (verify_header_kuznyechik_serpent_camellia (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, key5, key6) == 1) return 0;
+  if (verify_header_aes_twofish_serpent         (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, key5, key6, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
 
   return -1;
 }
@@ -309,14 +310,14 @@ KERNEL_FQ void m13733_init (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
 
   const int keyboard_layout_mapping_cnt = esalt_bufs[digests_offset].keyboard_layout_mapping_cnt;
 
-  LOCAL_AS keyboard_layout_mapping_t s_keyboard_layout_mapping_buf[256];
+  LOCAL_VK keyboard_layout_mapping_t s_keyboard_layout_mapping_buf[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
     s_keyboard_layout_mapping_buf[i] = esalt_bufs[digests_offset].keyboard_layout_mapping_buf[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   /**
    * Whirlpool shared
@@ -324,8 +325,8 @@ KERNEL_FQ void m13733_init (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_Ch[8][256];
-  LOCAL_AS u32 s_Cl[8][256];
+  LOCAL_VK u32 s_Ch[8][256];
+  LOCAL_VK u32 s_Cl[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -348,7 +349,7 @@ KERNEL_FQ void m13733_init (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
     s_Cl[7][i] = Cl[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -536,17 +537,17 @@ KERNEL_FQ void m13733_loop (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -563,7 +564,7 @@ KERNEL_FQ void m13733_loop (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -587,8 +588,8 @@ KERNEL_FQ void m13733_loop (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_Ch[8][256];
-  LOCAL_AS u32 s_Cl[8][256];
+  LOCAL_VK u32 s_Ch[8][256];
+  LOCAL_VK u32 s_Cl[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -611,7 +612,7 @@ KERNEL_FQ void m13733_loop (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
     s_Cl[7][i] = Cl[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -849,17 +850,17 @@ KERNEL_FQ void m13733_comp (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -876,7 +877,7 @@ KERNEL_FQ void m13733_comp (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -900,8 +901,8 @@ KERNEL_FQ void m13733_comp (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_Ch[8][256];
-  LOCAL_AS u32 s_Cl[8][256];
+  LOCAL_VK u32 s_Ch[8][256];
+  LOCAL_VK u32 s_Cl[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -924,7 +925,7 @@ KERNEL_FQ void m13733_comp (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
     s_Cl[7][i] = Cl[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m13751-pure.cl b/OpenCL/m13751-pure.cl
index 8ff4e0717..405d5c277 100644
--- a/OpenCL/m13751-pure.cl
+++ b/OpenCL/m13751-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha256.cl"
@@ -76,11 +77,11 @@ DECLSPEC int check_header_0512 (GLOBAL_AS const vc_t *esalt_bufs, GLOBAL_AS u32
   key2[6] = hc_swap32_S (key[14]);
   key2[7] = hc_swap32_S (key[15]);
 
-  if (verify_header_aes        (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
   if (verify_header_serpent    (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2) == 1) return 0;
   if (verify_header_twofish    (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2) == 1) return 0;
   if (verify_header_camellia   (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2) == 1) return 0;
   if (verify_header_kuznyechik (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2) == 1) return 0;
+  if (verify_header_aes        (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
 
   return -1;
 }
@@ -139,14 +140,14 @@ KERNEL_FQ void m13751_init (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
 
   const int keyboard_layout_mapping_cnt = esalt_bufs[digests_offset].keyboard_layout_mapping_cnt;
 
-  LOCAL_AS keyboard_layout_mapping_t s_keyboard_layout_mapping_buf[256];
+  LOCAL_VK keyboard_layout_mapping_t s_keyboard_layout_mapping_buf[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
     s_keyboard_layout_mapping_buf[i] = esalt_bufs[digests_offset].keyboard_layout_mapping_buf[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -295,17 +296,17 @@ KERNEL_FQ void m13751_loop (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -322,7 +323,7 @@ KERNEL_FQ void m13751_loop (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -532,17 +533,17 @@ KERNEL_FQ void m13751_comp (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -559,7 +560,7 @@ KERNEL_FQ void m13751_comp (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m13752-pure.cl b/OpenCL/m13752-pure.cl
index 135ead320..fbc4c5574 100644
--- a/OpenCL/m13752-pure.cl
+++ b/OpenCL/m13752-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha256.cl"
@@ -76,11 +77,11 @@ DECLSPEC int check_header_0512 (GLOBAL_AS const vc_t *esalt_bufs, GLOBAL_AS u32
   key2[6] = hc_swap32_S (key[14]);
   key2[7] = hc_swap32_S (key[15]);
 
-  if (verify_header_aes        (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
   if (verify_header_serpent    (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2) == 1) return 0;
   if (verify_header_twofish    (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2) == 1) return 0;
   if (verify_header_camellia   (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2) == 1) return 0;
   if (verify_header_kuznyechik (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2) == 1) return 0;
+  if (verify_header_aes        (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
 
   return -1;
 }
@@ -125,13 +126,13 @@ DECLSPEC int check_header_1024 (GLOBAL_AS const vc_t *esalt_bufs, GLOBAL_AS u32
   key4[6] = hc_swap32_S (key[30]);
   key4[7] = hc_swap32_S (key[31]);
 
-  if (verify_header_aes_twofish         (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
   if (verify_header_serpent_aes         (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
   if (verify_header_twofish_serpent     (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4) == 1) return 0;
+  if (verify_header_aes_twofish         (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
   if (verify_header_camellia_kuznyechik (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4) == 1) return 0;
   if (verify_header_camellia_serpent    (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4) == 1) return 0;
-  if (verify_header_kuznyechik_aes      (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
   if (verify_header_kuznyechik_twofish  (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4) == 1) return 0;
+  if (verify_header_kuznyechik_aes      (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
 
   return -1;
 }
@@ -190,14 +191,14 @@ KERNEL_FQ void m13752_init (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
 
   const int keyboard_layout_mapping_cnt = esalt_bufs[digests_offset].keyboard_layout_mapping_cnt;
 
-  LOCAL_AS keyboard_layout_mapping_t s_keyboard_layout_mapping_buf[256];
+  LOCAL_VK keyboard_layout_mapping_t s_keyboard_layout_mapping_buf[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
     s_keyboard_layout_mapping_buf[i] = esalt_bufs[digests_offset].keyboard_layout_mapping_buf[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -346,17 +347,17 @@ KERNEL_FQ void m13752_loop (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -373,7 +374,7 @@ KERNEL_FQ void m13752_loop (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -555,17 +556,17 @@ KERNEL_FQ void m13752_comp (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -582,7 +583,7 @@ KERNEL_FQ void m13752_comp (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m13753-pure.cl b/OpenCL/m13753-pure.cl
index 28c56ee1b..9e7fd752d 100644
--- a/OpenCL/m13753-pure.cl
+++ b/OpenCL/m13753-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha256.cl"
@@ -76,11 +77,11 @@ DECLSPEC int check_header_0512 (GLOBAL_AS const vc_t *esalt_bufs, GLOBAL_AS u32
   key2[6] = hc_swap32_S (key[14]);
   key2[7] = hc_swap32_S (key[15]);
 
-  if (verify_header_aes        (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
   if (verify_header_serpent    (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2) == 1) return 0;
   if (verify_header_twofish    (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2) == 1) return 0;
   if (verify_header_camellia   (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2) == 1) return 0;
   if (verify_header_kuznyechik (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2) == 1) return 0;
+  if (verify_header_aes        (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
 
   return -1;
 }
@@ -125,13 +126,13 @@ DECLSPEC int check_header_1024 (GLOBAL_AS const vc_t *esalt_bufs, GLOBAL_AS u32
   key4[6] = hc_swap32_S (key[30]);
   key4[7] = hc_swap32_S (key[31]);
 
-  if (verify_header_aes_twofish         (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
   if (verify_header_serpent_aes         (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
   if (verify_header_twofish_serpent     (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4) == 1) return 0;
+  if (verify_header_aes_twofish         (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
   if (verify_header_camellia_kuznyechik (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4) == 1) return 0;
   if (verify_header_camellia_serpent    (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4) == 1) return 0;
-  if (verify_header_kuznyechik_aes      (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
   if (verify_header_kuznyechik_twofish  (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4) == 1) return 0;
+  if (verify_header_kuznyechik_aes      (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
 
   return -1;
 }
@@ -194,9 +195,9 @@ DECLSPEC int check_header_1536 (GLOBAL_AS const vc_t *esalt_bufs, GLOBAL_AS u32
   key6[6] = hc_swap32_S (key[46]);
   key6[7] = hc_swap32_S (key[47]);
 
-  if (verify_header_aes_twofish_serpent         (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, key5, key6, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
   if (verify_header_serpent_twofish_aes         (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, key5, key6, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
   if (verify_header_kuznyechik_serpent_camellia (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, key5, key6) == 1) return 0;
+  if (verify_header_aes_twofish_serpent         (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, key5, key6, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
 
   return -1;
 }
@@ -255,14 +256,14 @@ KERNEL_FQ void m13753_init (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
 
   const int keyboard_layout_mapping_cnt = esalt_bufs[digests_offset].keyboard_layout_mapping_cnt;
 
-  LOCAL_AS keyboard_layout_mapping_t s_keyboard_layout_mapping_buf[256];
+  LOCAL_VK keyboard_layout_mapping_t s_keyboard_layout_mapping_buf[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
     s_keyboard_layout_mapping_buf[i] = esalt_bufs[digests_offset].keyboard_layout_mapping_buf[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -411,17 +412,17 @@ KERNEL_FQ void m13753_loop (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -438,7 +439,7 @@ KERNEL_FQ void m13753_loop (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -621,17 +622,17 @@ KERNEL_FQ void m13753_comp (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -648,7 +649,7 @@ KERNEL_FQ void m13753_comp (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m13771-pure.cl b/OpenCL/m13771-pure.cl
index b3b76052c..821ab8141 100644
--- a/OpenCL/m13771-pure.cl
+++ b/OpenCL/m13771-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_streebog512.cl"
@@ -79,11 +80,11 @@ DECLSPEC int check_header_0512 (GLOBAL_AS const vc_t *esalt_bufs, GLOBAL_AS u64
   key2[6] = hc_swap32_S (h32_from_64_S (key[0]));
   key2[7] = hc_swap32_S (l32_from_64_S (key[0]));
 
-  if (verify_header_aes        (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
   if (verify_header_serpent    (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2) == 1) return 0;
   if (verify_header_twofish    (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2) == 1) return 0;
   if (verify_header_camellia   (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2) == 1) return 0;
   if (verify_header_kuznyechik (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2) == 1) return 0;
+  if (verify_header_aes        (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
 
   return -1;
 }
@@ -181,18 +182,18 @@ KERNEL_FQ void m13771_init (KERN_ATTR_TMPS_ESALT (vc64_sbog_tmp_t, vc_t))
 
   const int keyboard_layout_mapping_cnt = esalt_bufs[digests_offset].keyboard_layout_mapping_cnt;
 
-  LOCAL_AS keyboard_layout_mapping_t s_keyboard_layout_mapping_buf[256];
+  LOCAL_VK keyboard_layout_mapping_t s_keyboard_layout_mapping_buf[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
     s_keyboard_layout_mapping_buf[i] = esalt_bufs[digests_offset].keyboard_layout_mapping_buf[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u64a s_sbob_sl64[8][256];
+  LOCAL_VK u64a s_sbob_sl64[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -206,7 +207,7 @@ KERNEL_FQ void m13771_init (KERN_ATTR_TMPS_ESALT (vc64_sbog_tmp_t, vc_t))
     s_sbob_sl64[7][i] = sbob512_sl64[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -379,17 +380,17 @@ KERNEL_FQ void m13771_loop (KERN_ATTR_TMPS_ESALT (vc64_sbog_tmp_t, vc_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -406,7 +407,7 @@ KERNEL_FQ void m13771_loop (KERN_ATTR_TMPS_ESALT (vc64_sbog_tmp_t, vc_t))
     s_te4[i] = te4[i];
   }
 
-  LOCAL_AS u64a s_sbob_sl64[8][256];
+  LOCAL_VK u64a s_sbob_sl64[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -420,7 +421,7 @@ KERNEL_FQ void m13771_loop (KERN_ATTR_TMPS_ESALT (vc64_sbog_tmp_t, vc_t))
     s_sbob_sl64[7][i] = sbob512_sl64[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -624,17 +625,17 @@ KERNEL_FQ void m13771_comp (KERN_ATTR_TMPS_ESALT (vc64_sbog_tmp_t, vc_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -651,7 +652,7 @@ KERNEL_FQ void m13771_comp (KERN_ATTR_TMPS_ESALT (vc64_sbog_tmp_t, vc_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m13772-pure.cl b/OpenCL/m13772-pure.cl
index eb7864d99..7e8ef5b83 100644
--- a/OpenCL/m13772-pure.cl
+++ b/OpenCL/m13772-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_streebog512.cl"
@@ -79,11 +80,11 @@ DECLSPEC int check_header_0512 (GLOBAL_AS const vc_t *esalt_bufs, GLOBAL_AS u64
   key2[6] = hc_swap32_S (h32_from_64_S (key[0]));
   key2[7] = hc_swap32_S (l32_from_64_S (key[0]));
 
-  if (verify_header_aes        (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
   if (verify_header_serpent    (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2) == 1) return 0;
   if (verify_header_twofish    (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2) == 1) return 0;
   if (verify_header_camellia   (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2) == 1) return 0;
   if (verify_header_kuznyechik (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2) == 1) return 0;
+  if (verify_header_aes        (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
 
   return -1;
 }
@@ -128,13 +129,13 @@ DECLSPEC int check_header_1024 (GLOBAL_AS const vc_t *esalt_bufs, GLOBAL_AS u64
   key4[6] = hc_swap32_S (h32_from_64_S (key[ 8]));
   key4[7] = hc_swap32_S (l32_from_64_S (key[ 8]));
 
-  if (verify_header_aes_twofish         (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
   if (verify_header_serpent_aes         (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
   if (verify_header_twofish_serpent     (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4) == 1) return 0;
+  if (verify_header_aes_twofish         (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
   if (verify_header_camellia_kuznyechik (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4) == 1) return 0;
   if (verify_header_camellia_serpent    (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4) == 1) return 0;
-  if (verify_header_kuznyechik_aes      (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
   if (verify_header_kuznyechik_twofish  (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4) == 1) return 0;
+  if (verify_header_kuznyechik_aes      (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
 
   return -1;
 }
@@ -232,18 +233,18 @@ KERNEL_FQ void m13772_init (KERN_ATTR_TMPS_ESALT (vc64_sbog_tmp_t, vc_t))
 
   const int keyboard_layout_mapping_cnt = esalt_bufs[digests_offset].keyboard_layout_mapping_cnt;
 
-  LOCAL_AS keyboard_layout_mapping_t s_keyboard_layout_mapping_buf[256];
+  LOCAL_VK keyboard_layout_mapping_t s_keyboard_layout_mapping_buf[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
     s_keyboard_layout_mapping_buf[i] = esalt_bufs[digests_offset].keyboard_layout_mapping_buf[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u64a s_sbob_sl64[8][256];
+  LOCAL_VK u64a s_sbob_sl64[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -257,7 +258,7 @@ KERNEL_FQ void m13772_init (KERN_ATTR_TMPS_ESALT (vc64_sbog_tmp_t, vc_t))
     s_sbob_sl64[7][i] = sbob512_sl64[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -430,17 +431,17 @@ KERNEL_FQ void m13772_loop (KERN_ATTR_TMPS_ESALT (vc64_sbog_tmp_t, vc_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -457,7 +458,7 @@ KERNEL_FQ void m13772_loop (KERN_ATTR_TMPS_ESALT (vc64_sbog_tmp_t, vc_t))
     s_te4[i] = te4[i];
   }
 
-  LOCAL_AS u64a s_sbob_sl64[8][256];
+  LOCAL_VK u64a s_sbob_sl64[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -471,7 +472,7 @@ KERNEL_FQ void m13772_loop (KERN_ATTR_TMPS_ESALT (vc64_sbog_tmp_t, vc_t))
     s_sbob_sl64[7][i] = sbob512_sl64[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -676,17 +677,17 @@ KERNEL_FQ void m13772_comp (KERN_ATTR_TMPS_ESALT (vc64_sbog_tmp_t, vc_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -703,7 +704,7 @@ KERNEL_FQ void m13772_comp (KERN_ATTR_TMPS_ESALT (vc64_sbog_tmp_t, vc_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m13773-pure.cl b/OpenCL/m13773-pure.cl
index eea140305..1fad87e9e 100644
--- a/OpenCL/m13773-pure.cl
+++ b/OpenCL/m13773-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_streebog512.cl"
@@ -79,11 +80,11 @@ DECLSPEC int check_header_0512 (GLOBAL_AS const vc_t *esalt_bufs, GLOBAL_AS u64
   key2[6] = hc_swap32_S (h32_from_64_S (key[0]));
   key2[7] = hc_swap32_S (l32_from_64_S (key[0]));
 
-  if (verify_header_aes        (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
   if (verify_header_serpent    (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2) == 1) return 0;
   if (verify_header_twofish    (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2) == 1) return 0;
   if (verify_header_camellia   (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2) == 1) return 0;
   if (verify_header_kuznyechik (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2) == 1) return 0;
+  if (verify_header_aes        (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
 
   return -1;
 }
@@ -128,13 +129,13 @@ DECLSPEC int check_header_1024 (GLOBAL_AS const vc_t *esalt_bufs, GLOBAL_AS u64
   key4[6] = hc_swap32_S (h32_from_64_S (key[ 8]));
   key4[7] = hc_swap32_S (l32_from_64_S (key[ 8]));
 
-  if (verify_header_aes_twofish         (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
   if (verify_header_serpent_aes         (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
   if (verify_header_twofish_serpent     (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4) == 1) return 0;
+  if (verify_header_aes_twofish         (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
   if (verify_header_camellia_kuznyechik (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4) == 1) return 0;
   if (verify_header_camellia_serpent    (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4) == 1) return 0;
-  if (verify_header_kuznyechik_aes      (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
   if (verify_header_kuznyechik_twofish  (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4) == 1) return 0;
+  if (verify_header_kuznyechik_aes      (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
 
   return -1;
 }
@@ -197,9 +198,9 @@ DECLSPEC int check_header_1536 (GLOBAL_AS const vc_t *esalt_bufs, GLOBAL_AS u64
   key6[6] = hc_swap32_S (h32_from_64_S (key[16]));
   key6[7] = hc_swap32_S (l32_from_64_S (key[16]));
 
-  if (verify_header_aes_twofish_serpent         (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, key5, key6, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
   if (verify_header_serpent_twofish_aes         (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, key5, key6, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
   if (verify_header_kuznyechik_serpent_camellia (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, key5, key6) == 1) return 0;
+  if (verify_header_aes_twofish_serpent         (esalt_bufs[0].data_buf, esalt_bufs[0].signature, key1, key2, key3, key4, key5, key6, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1) return 0;
 
   return -1;
 }
@@ -297,18 +298,18 @@ KERNEL_FQ void m13773_init (KERN_ATTR_TMPS_ESALT (vc64_sbog_tmp_t, vc_t))
 
   const int keyboard_layout_mapping_cnt = esalt_bufs[digests_offset].keyboard_layout_mapping_cnt;
 
-  LOCAL_AS keyboard_layout_mapping_t s_keyboard_layout_mapping_buf[256];
+  LOCAL_VK keyboard_layout_mapping_t s_keyboard_layout_mapping_buf[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
     s_keyboard_layout_mapping_buf[i] = esalt_bufs[digests_offset].keyboard_layout_mapping_buf[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u64a s_sbob_sl64[8][256];
+  LOCAL_VK u64a s_sbob_sl64[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -322,7 +323,7 @@ KERNEL_FQ void m13773_init (KERN_ATTR_TMPS_ESALT (vc64_sbog_tmp_t, vc_t))
     s_sbob_sl64[7][i] = sbob512_sl64[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -495,17 +496,17 @@ KERNEL_FQ void m13773_loop (KERN_ATTR_TMPS_ESALT (vc64_sbog_tmp_t, vc_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -522,7 +523,7 @@ KERNEL_FQ void m13773_loop (KERN_ATTR_TMPS_ESALT (vc64_sbog_tmp_t, vc_t))
     s_te4[i] = te4[i];
   }
 
-  LOCAL_AS u64a s_sbob_sl64[8][256];
+  LOCAL_VK u64a s_sbob_sl64[8][256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -536,7 +537,7 @@ KERNEL_FQ void m13773_loop (KERN_ATTR_TMPS_ESALT (vc64_sbog_tmp_t, vc_t))
     s_sbob_sl64[7][i] = sbob512_sl64[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -742,17 +743,17 @@ KERNEL_FQ void m13773_comp (KERN_ATTR_TMPS_ESALT (vc64_sbog_tmp_t, vc_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -769,7 +770,7 @@ KERNEL_FQ void m13773_comp (KERN_ATTR_TMPS_ESALT (vc64_sbog_tmp_t, vc_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m13800_a0-optimized.cl b/OpenCL/m13800_a0-optimized.cl
index c12745182..0a4f61175 100644
--- a/OpenCL/m13800_a0-optimized.cl
+++ b/OpenCL/m13800_a0-optimized.cl
@@ -9,6 +9,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
@@ -434,14 +435,14 @@ KERNEL_FQ void m13800_m04 (KERN_ATTR_RULES_ESALT (win8phone_t))
    * shared
    */
 
-  LOCAL_AS u32 s_esalt[32];
+  LOCAL_VK u32 s_esalt[32];
 
   for (u32 i = lid; i < 32; i += lsz)
   {
     s_esalt[i] = esalt_bufs[digests_offset].salt_buf[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -630,14 +631,14 @@ KERNEL_FQ void m13800_s04 (KERN_ATTR_RULES_ESALT (win8phone_t))
    * shared
    */
 
-  LOCAL_AS u32 s_esalt[32];
+  LOCAL_VK u32 s_esalt[32];
 
   for (u32 i = lid; i < 32; i += lsz)
   {
     s_esalt[i] = esalt_bufs[digests_offset].salt_buf[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m13800_a0-pure.cl b/OpenCL/m13800_a0-pure.cl
index 3546c04ef..1a4061dd3 100644
--- a/OpenCL/m13800_a0-pure.cl
+++ b/OpenCL/m13800_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
diff --git a/OpenCL/m13800_a1-optimized.cl b/OpenCL/m13800_a1-optimized.cl
index 3a432d316..8073a941e 100644
--- a/OpenCL/m13800_a1-optimized.cl
+++ b/OpenCL/m13800_a1-optimized.cl
@@ -9,6 +9,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha256.cl"
@@ -432,14 +433,14 @@ KERNEL_FQ void m13800_m04 (KERN_ATTR_ESALT (win8phone_t))
    * shared
    */
 
-  LOCAL_AS u32 s_esalt[32];
+  LOCAL_VK u32 s_esalt[32];
 
   for (u32 i = lid; i < 32; i += lsz)
   {
     s_esalt[i] = esalt_bufs[digests_offset].salt_buf[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -684,14 +685,14 @@ KERNEL_FQ void m13800_s04 (KERN_ATTR_ESALT (win8phone_t))
    * shared
    */
 
-  LOCAL_AS u32 s_esalt[32];
+  LOCAL_VK u32 s_esalt[32];
 
   for (u32 i = lid; i < 32; i += lsz)
   {
     s_esalt[i] = esalt_bufs[digests_offset].salt_buf[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m13800_a1-pure.cl b/OpenCL/m13800_a1-pure.cl
index 7e042d817..4bcc63dcc 100644
--- a/OpenCL/m13800_a1-pure.cl
+++ b/OpenCL/m13800_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_sha256.cl"
diff --git a/OpenCL/m13800_a3-optimized.cl b/OpenCL/m13800_a3-optimized.cl
index 46e422df4..692a64e89 100644
--- a/OpenCL/m13800_a3-optimized.cl
+++ b/OpenCL/m13800_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha256.cl"
@@ -746,14 +747,14 @@ KERNEL_FQ void m13800_m04 (KERN_ATTR_VECTOR_ESALT (win8phone_t))
    * shared
    */
 
-  LOCAL_AS u32 s_esalt[32];
+  LOCAL_VK u32 s_esalt[32];
 
   for (u32 i = lid; i < 32; i += lsz)
   {
     s_esalt[i] = esalt_bufs[digests_offset].salt_buf[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -803,14 +804,14 @@ KERNEL_FQ void m13800_m08 (KERN_ATTR_VECTOR_ESALT (win8phone_t))
    * shared
    */
 
-  LOCAL_AS u32 s_esalt[32];
+  LOCAL_VK u32 s_esalt[32];
 
   for (u32 i = lid; i < 32; i += lsz)
   {
     s_esalt[i] = esalt_bufs[digests_offset].salt_buf[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -860,14 +861,14 @@ KERNEL_FQ void m13800_m16 (KERN_ATTR_VECTOR_ESALT (win8phone_t))
    * shared
    */
 
-  LOCAL_AS u32 s_esalt[32];
+  LOCAL_VK u32 s_esalt[32];
 
   for (u32 i = lid; i < 32; i += lsz)
   {
     s_esalt[i] = esalt_bufs[digests_offset].salt_buf[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -917,14 +918,14 @@ KERNEL_FQ void m13800_s04 (KERN_ATTR_VECTOR_ESALT (win8phone_t))
    * shared
    */
 
-  LOCAL_AS u32 s_esalt[32];
+  LOCAL_VK u32 s_esalt[32];
 
   for (u32 i = lid; i < 32; i += lsz)
   {
     s_esalt[i] = esalt_bufs[digests_offset].salt_buf[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -974,14 +975,14 @@ KERNEL_FQ void m13800_s08 (KERN_ATTR_VECTOR_ESALT (win8phone_t))
    * shared
    */
 
-  LOCAL_AS u32 s_esalt[32];
+  LOCAL_VK u32 s_esalt[32];
 
   for (u32 i = lid; i < 32; i += lsz)
   {
     s_esalt[i] = esalt_bufs[digests_offset].salt_buf[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -1031,14 +1032,14 @@ KERNEL_FQ void m13800_s16 (KERN_ATTR_VECTOR_ESALT (win8phone_t))
    * shared
    */
 
-  LOCAL_AS u32 s_esalt[32];
+  LOCAL_VK u32 s_esalt[32];
 
   for (u32 i = lid; i < 32; i += lsz)
   {
     s_esalt[i] = esalt_bufs[digests_offset].salt_buf[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m13800_a3-pure.cl b/OpenCL/m13800_a3-pure.cl
index cd2877406..87ac2a02a 100644
--- a/OpenCL/m13800_a3-pure.cl
+++ b/OpenCL/m13800_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha256.cl"
diff --git a/OpenCL/m13900_a0-optimized.cl b/OpenCL/m13900_a0-optimized.cl
index 4dce764dc..53e265ae8 100644
--- a/OpenCL/m13900_a0-optimized.cl
+++ b/OpenCL/m13900_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
@@ -16,15 +17,15 @@
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m13900_m04 (KERN_ATTR_RULES ())
@@ -41,7 +42,7 @@ KERNEL_FQ void m13900_m04 (KERN_ATTR_RULES ())
    * shared
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -52,7 +53,7 @@ KERNEL_FQ void m13900_m04 (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -268,7 +269,7 @@ KERNEL_FQ void m13900_s04 (KERN_ATTR_RULES ())
    * shared
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -279,7 +280,7 @@ KERNEL_FQ void m13900_s04 (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m13900_a0-pure.cl b/OpenCL/m13900_a0-pure.cl
index 81ac9fe94..ecc8d6db3 100644
--- a/OpenCL/m13900_a0-pure.cl
+++ b/OpenCL/m13900_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
@@ -16,15 +17,15 @@
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m13900_mxx (KERN_ATTR_RULES ())
@@ -41,7 +42,7 @@ KERNEL_FQ void m13900_mxx (KERN_ATTR_RULES ())
    * shared
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -52,7 +53,7 @@ KERNEL_FQ void m13900_mxx (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -192,7 +193,7 @@ KERNEL_FQ void m13900_sxx (KERN_ATTR_RULES ())
    * shared
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -203,7 +204,7 @@ KERNEL_FQ void m13900_sxx (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m13900_a1-optimized.cl b/OpenCL/m13900_a1-optimized.cl
index 3290633d1..514c95913 100644
--- a/OpenCL/m13900_a1-optimized.cl
+++ b/OpenCL/m13900_a1-optimized.cl
@@ -8,21 +8,22 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m13900_m04 (KERN_ATTR_BASIC ())
@@ -39,7 +40,7 @@ KERNEL_FQ void m13900_m04 (KERN_ATTR_BASIC ())
    * shared
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -50,7 +51,7 @@ KERNEL_FQ void m13900_m04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -324,7 +325,7 @@ KERNEL_FQ void m13900_s04 (KERN_ATTR_BASIC ())
    * shared
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -335,7 +336,7 @@ KERNEL_FQ void m13900_s04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m13900_a1-pure.cl b/OpenCL/m13900_a1-pure.cl
index 92d5eb003..7d34187ea 100644
--- a/OpenCL/m13900_a1-pure.cl
+++ b/OpenCL/m13900_a1-pure.cl
@@ -8,21 +8,22 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_sha1.cl"
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m13900_mxx (KERN_ATTR_BASIC ())
@@ -39,7 +40,7 @@ KERNEL_FQ void m13900_mxx (KERN_ATTR_BASIC ())
    * shared
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -50,7 +51,7 @@ KERNEL_FQ void m13900_mxx (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -188,7 +189,7 @@ KERNEL_FQ void m13900_sxx (KERN_ATTR_BASIC ())
    * shared
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -199,7 +200,7 @@ KERNEL_FQ void m13900_sxx (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m13900_a3-optimized.cl b/OpenCL/m13900_a3-optimized.cl
index 10b4b93f7..21cf1e92a 100644
--- a/OpenCL/m13900_a3-optimized.cl
+++ b/OpenCL/m13900_a3-optimized.cl
@@ -8,21 +8,22 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 DECLSPEC void m13900m (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 pw_len, KERN_ATTR_BASIC (), LOCAL_AS u32 *l_bin2asc)
@@ -428,7 +429,7 @@ KERNEL_FQ void m13900_m04 (KERN_ATTR_BASIC ())
    * shared
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -439,7 +440,7 @@ KERNEL_FQ void m13900_m04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -498,7 +499,7 @@ KERNEL_FQ void m13900_m08 (KERN_ATTR_BASIC ())
    * shared
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -509,7 +510,7 @@ KERNEL_FQ void m13900_m08 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -568,7 +569,7 @@ KERNEL_FQ void m13900_m16 (KERN_ATTR_BASIC ())
    * shared
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -579,7 +580,7 @@ KERNEL_FQ void m13900_m16 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -638,7 +639,7 @@ KERNEL_FQ void m13900_s04 (KERN_ATTR_BASIC ())
    * shared
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -649,7 +650,7 @@ KERNEL_FQ void m13900_s04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -708,7 +709,7 @@ KERNEL_FQ void m13900_s08 (KERN_ATTR_BASIC ())
    * shared
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -719,7 +720,7 @@ KERNEL_FQ void m13900_s08 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -778,7 +779,7 @@ KERNEL_FQ void m13900_s16 (KERN_ATTR_BASIC ())
    * shared
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -789,7 +790,7 @@ KERNEL_FQ void m13900_s16 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m13900_a3-pure.cl b/OpenCL/m13900_a3-pure.cl
index b0e85397d..a0f8a5eb1 100644
--- a/OpenCL/m13900_a3-pure.cl
+++ b/OpenCL/m13900_a3-pure.cl
@@ -8,21 +8,22 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m13900_mxx (KERN_ATTR_VECTOR ())
@@ -39,7 +40,7 @@ KERNEL_FQ void m13900_mxx (KERN_ATTR_VECTOR ())
    * shared
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -50,7 +51,7 @@ KERNEL_FQ void m13900_mxx (KERN_ATTR_VECTOR ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -205,7 +206,7 @@ KERNEL_FQ void m13900_sxx (KERN_ATTR_VECTOR ())
    * shared
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -216,7 +217,7 @@ KERNEL_FQ void m13900_sxx (KERN_ATTR_VECTOR ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m14000_a0-pure.cl b/OpenCL/m14000_a0-pure.cl
index 2be784b5a..b6fa198cc 100644
--- a/OpenCL/m14000_a0-pure.cl
+++ b/OpenCL/m14000_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
@@ -52,7 +53,7 @@
   PERM_OP (l, r, tt,  4, 0x0f0f0f0f);  \
 }
 
-CONSTANT_AS u32a c_SPtrans[8][64] =
+CONSTANT_VK u32a c_SPtrans[8][64] =
 {
   {
     /* nibble 0 */
@@ -208,7 +209,7 @@ CONSTANT_AS u32a c_SPtrans[8][64] =
   },
 };
 
-CONSTANT_AS u32a c_skb[8][64] =
+CONSTANT_VK u32a c_skb[8][64] =
 {
   {
     0x00000000, 0x00000010, 0x20000000, 0x20000010,
@@ -359,25 +360,25 @@ CONSTANT_AS u32a c_skb[8][64] =
 #if   VECT_SIZE == 1
 #define BOX(i,n,S) (S)[(n)][(i)]
 #elif VECT_SIZE == 2
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1])
 #elif VECT_SIZE == 4
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3])
 #elif VECT_SIZE == 8
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7])
 #elif VECT_SIZE == 16
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf])
 #endif
 
 #if   VECT_SIZE == 1
 #define BOX1(i,S) (S)[(i)]
 #elif VECT_SIZE == 2
-#define BOX1(i,S) (u32x) ((S)[(i).s0], (S)[(i).s1])
+#define BOX1(i,S) make_u32x ((S)[(i).s0], (S)[(i).s1])
 #elif VECT_SIZE == 4
-#define BOX1(i,S) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3])
+#define BOX1(i,S) make_u32x ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3])
 #elif VECT_SIZE == 8
-#define BOX1(i,S) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7])
+#define BOX1(i,S) make_u32x ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7])
 #elif VECT_SIZE == 16
-#define BOX1(i,S) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7], (S)[(i).s8], (S)[(i).s9], (S)[(i).sa], (S)[(i).sb], (S)[(i).sc], (S)[(i).sd], (S)[(i).se], (S)[(i).sf])
+#define BOX1(i,S) make_u32x ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7], (S)[(i).s8], (S)[(i).s9], (S)[(i).sa], (S)[(i).sb], (S)[(i).sc], (S)[(i).sd], (S)[(i).se], (S)[(i).sf])
 #endif
 
 DECLSPEC void _des_crypt_encrypt (u32 *iv, u32 *data, u32 *Kc, u32 *Kd, LOCAL_AS u32 (*s_SPtrans)[64])
@@ -510,8 +511,8 @@ KERNEL_FQ void m14000_mxx (KERN_ATTR_RULES ())
    * shared
    */
 
-  LOCAL_AS u32 s_SPtrans[8][64];
-  LOCAL_AS u32 s_skb[8][64];
+  LOCAL_VK u32 s_SPtrans[8][64];
+  LOCAL_VK u32 s_skb[8][64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
@@ -534,7 +535,7 @@ KERNEL_FQ void m14000_mxx (KERN_ATTR_RULES ())
     s_skb[7][i] = c_skb[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -615,8 +616,8 @@ KERNEL_FQ void m14000_sxx (KERN_ATTR_RULES ())
    * shared
    */
 
-  LOCAL_AS u32 s_SPtrans[8][64];
-  LOCAL_AS u32 s_skb[8][64];
+  LOCAL_VK u32 s_SPtrans[8][64];
+  LOCAL_VK u32 s_skb[8][64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
@@ -639,7 +640,7 @@ KERNEL_FQ void m14000_sxx (KERN_ATTR_RULES ())
     s_skb[7][i] = c_skb[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m14000_a1-pure.cl b/OpenCL/m14000_a1-pure.cl
index 38367ff6a..4845c7f47 100644
--- a/OpenCL/m14000_a1-pure.cl
+++ b/OpenCL/m14000_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #endif
@@ -50,7 +51,7 @@
   PERM_OP (l, r, tt,  4, 0x0f0f0f0f);  \
 }
 
-CONSTANT_AS u32a c_SPtrans[8][64] =
+CONSTANT_VK u32a c_SPtrans[8][64] =
 {
   {
     0x02080800, 0x00080000, 0x02000002, 0x02080802,
@@ -198,7 +199,7 @@ CONSTANT_AS u32a c_SPtrans[8][64] =
   }
 };
 
-CONSTANT_AS u32a c_skb[8][64] =
+CONSTANT_VK u32a c_skb[8][64] =
 {
   {
     0x00000000, 0x00000010, 0x20000000, 0x20000010,
@@ -349,25 +350,25 @@ CONSTANT_AS u32a c_skb[8][64] =
 #if   VECT_SIZE == 1
 #define BOX(i,n,S) (S)[(n)][(i)]
 #elif VECT_SIZE == 2
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1])
 #elif VECT_SIZE == 4
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3])
 #elif VECT_SIZE == 8
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7])
 #elif VECT_SIZE == 16
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf])
 #endif
 
 #if   VECT_SIZE == 1
 #define BOX1(i,S) (S)[(i)]
 #elif VECT_SIZE == 2
-#define BOX1(i,S) (u32x) ((S)[(i).s0], (S)[(i).s1])
+#define BOX1(i,S) make_u32x ((S)[(i).s0], (S)[(i).s1])
 #elif VECT_SIZE == 4
-#define BOX1(i,S) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3])
+#define BOX1(i,S) make_u32x ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3])
 #elif VECT_SIZE == 8
-#define BOX1(i,S) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7])
+#define BOX1(i,S) make_u32x ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7])
 #elif VECT_SIZE == 16
-#define BOX1(i,S) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7], (S)[(i).s8], (S)[(i).s9], (S)[(i).sa], (S)[(i).sb], (S)[(i).sc], (S)[(i).sd], (S)[(i).se], (S)[(i).sf])
+#define BOX1(i,S) make_u32x ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7], (S)[(i).s8], (S)[(i).s9], (S)[(i).sa], (S)[(i).sb], (S)[(i).sc], (S)[(i).sd], (S)[(i).se], (S)[(i).sf])
 #endif
 
 DECLSPEC void _des_crypt_encrypt (u32 *iv, u32 *data, u32 *Kc, u32 *Kd, LOCAL_AS u32 (*s_SPtrans)[64])
@@ -500,8 +501,8 @@ KERNEL_FQ void m14000_mxx (KERN_ATTR_BASIC ())
    * shared
    */
 
-  LOCAL_AS u32 s_SPtrans[8][64];
-  LOCAL_AS u32 s_skb[8][64];
+  LOCAL_VK u32 s_SPtrans[8][64];
+  LOCAL_VK u32 s_skb[8][64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
@@ -524,7 +525,7 @@ KERNEL_FQ void m14000_mxx (KERN_ATTR_BASIC ())
     s_skb[7][i] = c_skb[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -648,8 +649,8 @@ KERNEL_FQ void m14000_sxx (KERN_ATTR_BASIC ())
    * shared
    */
 
-  LOCAL_AS u32 s_SPtrans[8][64];
-  LOCAL_AS u32 s_skb[8][64];
+  LOCAL_VK u32 s_SPtrans[8][64];
+  LOCAL_VK u32 s_skb[8][64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
@@ -672,7 +673,7 @@ KERNEL_FQ void m14000_sxx (KERN_ATTR_BASIC ())
     s_skb[7][i] = c_skb[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m14000_a3-pure.cl b/OpenCL/m14000_a3-pure.cl
index 1e9b51d26..7d1b33e8a 100644
--- a/OpenCL/m14000_a3-pure.cl
+++ b/OpenCL/m14000_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #endif
 
@@ -1730,7 +1731,7 @@ DECLSPEC void transpose32c (u32 *data)
 // transpose bitslice mod : attention race conditions, need different buffers for *in and *out
 //
 
-KERNEL_FQ void m14000_tm (GLOBAL_AS u32 *mod, GLOBAL_AS bs_word_t *words_buf_r)
+KERNEL_FQ void m14000_tm (GLOBAL_AS u32 *mod, GLOBAL_AS bs_word_t *words_buf_b)
 {
   const u64 gid = get_global_id (0);
 
@@ -1744,13 +1745,13 @@ KERNEL_FQ void m14000_tm (GLOBAL_AS u32 *mod, GLOBAL_AS bs_word_t *words_buf_r)
   #endif
   for (int i = 0, j = 0; i < 32; i += 8, j += 7)
   {
-    atomic_or (&words_buf_r[block].b[j + 0], (((w0 >> (i + 7)) & 1) << slice));
-    atomic_or (&words_buf_r[block].b[j + 1], (((w0 >> (i + 6)) & 1) << slice));
-    atomic_or (&words_buf_r[block].b[j + 2], (((w0 >> (i + 5)) & 1) << slice));
-    atomic_or (&words_buf_r[block].b[j + 3], (((w0 >> (i + 4)) & 1) << slice));
-    atomic_or (&words_buf_r[block].b[j + 4], (((w0 >> (i + 3)) & 1) << slice));
-    atomic_or (&words_buf_r[block].b[j + 5], (((w0 >> (i + 2)) & 1) << slice));
-    atomic_or (&words_buf_r[block].b[j + 6], (((w0 >> (i + 1)) & 1) << slice));
+    atomic_or (&words_buf_b[block].b[j + 0], (((w0 >> (i + 7)) & 1) << slice));
+    atomic_or (&words_buf_b[block].b[j + 1], (((w0 >> (i + 6)) & 1) << slice));
+    atomic_or (&words_buf_b[block].b[j + 2], (((w0 >> (i + 5)) & 1) << slice));
+    atomic_or (&words_buf_b[block].b[j + 3], (((w0 >> (i + 4)) & 1) << slice));
+    atomic_or (&words_buf_b[block].b[j + 4], (((w0 >> (i + 3)) & 1) << slice));
+    atomic_or (&words_buf_b[block].b[j + 5], (((w0 >> (i + 2)) & 1) << slice));
+    atomic_or (&words_buf_b[block].b[j + 6], (((w0 >> (i + 1)) & 1) << slice));
   }
 }
 
@@ -2042,7 +2043,11 @@ KERNEL_FQ void m14000_mxx (KERN_ATTR_BITSLICE ())
    * inner loop
    */
 
+  #ifdef IS_CUDA
+  const u32 pc_pos = (blockIdx.y * blockDim.y) + threadIdx.y;
+  #else
   const u32 pc_pos = get_global_id (1);
+  #endif
 
   const u32 il_pos = pc_pos * 32;
 
@@ -2075,34 +2080,34 @@ KERNEL_FQ void m14000_mxx (KERN_ATTR_BITSLICE ())
   u32 k26 = K26;
   u32 k27 = K27;
 
-  k00 |= words_buf_r[pc_pos].b[ 0];
-  k01 |= words_buf_r[pc_pos].b[ 1];
-  k02 |= words_buf_r[pc_pos].b[ 2];
-  k03 |= words_buf_r[pc_pos].b[ 3];
-  k04 |= words_buf_r[pc_pos].b[ 4];
-  k05 |= words_buf_r[pc_pos].b[ 5];
-  k06 |= words_buf_r[pc_pos].b[ 6];
-  k07 |= words_buf_r[pc_pos].b[ 7];
-  k08 |= words_buf_r[pc_pos].b[ 8];
-  k09 |= words_buf_r[pc_pos].b[ 9];
-  k10 |= words_buf_r[pc_pos].b[10];
-  k11 |= words_buf_r[pc_pos].b[11];
-  k12 |= words_buf_r[pc_pos].b[12];
-  k13 |= words_buf_r[pc_pos].b[13];
-  k14 |= words_buf_r[pc_pos].b[14];
-  k15 |= words_buf_r[pc_pos].b[15];
-  k16 |= words_buf_r[pc_pos].b[16];
-  k17 |= words_buf_r[pc_pos].b[17];
-  k18 |= words_buf_r[pc_pos].b[18];
-  k19 |= words_buf_r[pc_pos].b[19];
-  k20 |= words_buf_r[pc_pos].b[20];
-  k21 |= words_buf_r[pc_pos].b[21];
-  k22 |= words_buf_r[pc_pos].b[22];
-  k23 |= words_buf_r[pc_pos].b[23];
-  k24 |= words_buf_r[pc_pos].b[24];
-  k25 |= words_buf_r[pc_pos].b[25];
-  k26 |= words_buf_r[pc_pos].b[26];
-  k27 |= words_buf_r[pc_pos].b[27];
+  k00 |= words_buf_s[pc_pos].b[ 0];
+  k01 |= words_buf_s[pc_pos].b[ 1];
+  k02 |= words_buf_s[pc_pos].b[ 2];
+  k03 |= words_buf_s[pc_pos].b[ 3];
+  k04 |= words_buf_s[pc_pos].b[ 4];
+  k05 |= words_buf_s[pc_pos].b[ 5];
+  k06 |= words_buf_s[pc_pos].b[ 6];
+  k07 |= words_buf_s[pc_pos].b[ 7];
+  k08 |= words_buf_s[pc_pos].b[ 8];
+  k09 |= words_buf_s[pc_pos].b[ 9];
+  k10 |= words_buf_s[pc_pos].b[10];
+  k11 |= words_buf_s[pc_pos].b[11];
+  k12 |= words_buf_s[pc_pos].b[12];
+  k13 |= words_buf_s[pc_pos].b[13];
+  k14 |= words_buf_s[pc_pos].b[14];
+  k15 |= words_buf_s[pc_pos].b[15];
+  k16 |= words_buf_s[pc_pos].b[16];
+  k17 |= words_buf_s[pc_pos].b[17];
+  k18 |= words_buf_s[pc_pos].b[18];
+  k19 |= words_buf_s[pc_pos].b[19];
+  k20 |= words_buf_s[pc_pos].b[20];
+  k21 |= words_buf_s[pc_pos].b[21];
+  k22 |= words_buf_s[pc_pos].b[22];
+  k23 |= words_buf_s[pc_pos].b[23];
+  k24 |= words_buf_s[pc_pos].b[24];
+  k25 |= words_buf_s[pc_pos].b[25];
+  k26 |= words_buf_s[pc_pos].b[26];
+  k27 |= words_buf_s[pc_pos].b[27];
 
   DES
   (
@@ -2554,7 +2559,11 @@ KERNEL_FQ void m14000_sxx (KERN_ATTR_BITSLICE ())
    * inner loop
    */
 
+  #ifdef IS_CUDA
+  const u32 pc_pos = (blockIdx.y * blockDim.y) + threadIdx.y;
+  #else
   const u32 pc_pos = get_global_id (1);
+  #endif
 
   const u32 il_pos = pc_pos * 32;
 
@@ -2587,34 +2596,34 @@ KERNEL_FQ void m14000_sxx (KERN_ATTR_BITSLICE ())
   u32 k26 = K26;
   u32 k27 = K27;
 
-  k00 |= words_buf_r[pc_pos].b[ 0];
-  k01 |= words_buf_r[pc_pos].b[ 1];
-  k02 |= words_buf_r[pc_pos].b[ 2];
-  k03 |= words_buf_r[pc_pos].b[ 3];
-  k04 |= words_buf_r[pc_pos].b[ 4];
-  k05 |= words_buf_r[pc_pos].b[ 5];
-  k06 |= words_buf_r[pc_pos].b[ 6];
-  k07 |= words_buf_r[pc_pos].b[ 7];
-  k08 |= words_buf_r[pc_pos].b[ 8];
-  k09 |= words_buf_r[pc_pos].b[ 9];
-  k10 |= words_buf_r[pc_pos].b[10];
-  k11 |= words_buf_r[pc_pos].b[11];
-  k12 |= words_buf_r[pc_pos].b[12];
-  k13 |= words_buf_r[pc_pos].b[13];
-  k14 |= words_buf_r[pc_pos].b[14];
-  k15 |= words_buf_r[pc_pos].b[15];
-  k16 |= words_buf_r[pc_pos].b[16];
-  k17 |= words_buf_r[pc_pos].b[17];
-  k18 |= words_buf_r[pc_pos].b[18];
-  k19 |= words_buf_r[pc_pos].b[19];
-  k20 |= words_buf_r[pc_pos].b[20];
-  k21 |= words_buf_r[pc_pos].b[21];
-  k22 |= words_buf_r[pc_pos].b[22];
-  k23 |= words_buf_r[pc_pos].b[23];
-  k24 |= words_buf_r[pc_pos].b[24];
-  k25 |= words_buf_r[pc_pos].b[25];
-  k26 |= words_buf_r[pc_pos].b[26];
-  k27 |= words_buf_r[pc_pos].b[27];
+  k00 |= words_buf_s[pc_pos].b[ 0];
+  k01 |= words_buf_s[pc_pos].b[ 1];
+  k02 |= words_buf_s[pc_pos].b[ 2];
+  k03 |= words_buf_s[pc_pos].b[ 3];
+  k04 |= words_buf_s[pc_pos].b[ 4];
+  k05 |= words_buf_s[pc_pos].b[ 5];
+  k06 |= words_buf_s[pc_pos].b[ 6];
+  k07 |= words_buf_s[pc_pos].b[ 7];
+  k08 |= words_buf_s[pc_pos].b[ 8];
+  k09 |= words_buf_s[pc_pos].b[ 9];
+  k10 |= words_buf_s[pc_pos].b[10];
+  k11 |= words_buf_s[pc_pos].b[11];
+  k12 |= words_buf_s[pc_pos].b[12];
+  k13 |= words_buf_s[pc_pos].b[13];
+  k14 |= words_buf_s[pc_pos].b[14];
+  k15 |= words_buf_s[pc_pos].b[15];
+  k16 |= words_buf_s[pc_pos].b[16];
+  k17 |= words_buf_s[pc_pos].b[17];
+  k18 |= words_buf_s[pc_pos].b[18];
+  k19 |= words_buf_s[pc_pos].b[19];
+  k20 |= words_buf_s[pc_pos].b[20];
+  k21 |= words_buf_s[pc_pos].b[21];
+  k22 |= words_buf_s[pc_pos].b[22];
+  k23 |= words_buf_s[pc_pos].b[23];
+  k24 |= words_buf_s[pc_pos].b[24];
+  k25 |= words_buf_s[pc_pos].b[25];
+  k26 |= words_buf_s[pc_pos].b[26];
+  k27 |= words_buf_s[pc_pos].b[27];
 
   DES
   (
diff --git a/OpenCL/m14100_a0-pure.cl b/OpenCL/m14100_a0-pure.cl
index 99ffdb703..780b7fae2 100644
--- a/OpenCL/m14100_a0-pure.cl
+++ b/OpenCL/m14100_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
@@ -52,7 +53,7 @@
   PERM_OP (l, r, tt,  4, 0x0f0f0f0f);  \
 }
 
-CONSTANT_AS u32a c_SPtrans[8][64] =
+CONSTANT_VK u32a c_SPtrans[8][64] =
 {
   {
     /* nibble 0 */
@@ -208,7 +209,7 @@ CONSTANT_AS u32a c_SPtrans[8][64] =
   },
 };
 
-CONSTANT_AS u32a c_skb[8][64] =
+CONSTANT_VK u32a c_skb[8][64] =
 {
   {
     0x00000000, 0x00000010, 0x20000000, 0x20000010,
@@ -359,25 +360,25 @@ CONSTANT_AS u32a c_skb[8][64] =
 #if   VECT_SIZE == 1
 #define BOX(i,n,S) (S)[(n)][(i)]
 #elif VECT_SIZE == 2
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1])
 #elif VECT_SIZE == 4
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3])
 #elif VECT_SIZE == 8
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7])
 #elif VECT_SIZE == 16
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf])
 #endif
 
 #if   VECT_SIZE == 1
 #define BOX1(i,S) (S)[(i)]
 #elif VECT_SIZE == 2
-#define BOX1(i,S) (u32x) ((S)[(i).s0], (S)[(i).s1])
+#define BOX1(i,S) make_u32x ((S)[(i).s0], (S)[(i).s1])
 #elif VECT_SIZE == 4
-#define BOX1(i,S) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3])
+#define BOX1(i,S) make_u32x ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3])
 #elif VECT_SIZE == 8
-#define BOX1(i,S) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7])
+#define BOX1(i,S) make_u32x ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7])
 #elif VECT_SIZE == 16
-#define BOX1(i,S) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7], (S)[(i).s8], (S)[(i).s9], (S)[(i).sa], (S)[(i).sb], (S)[(i).sc], (S)[(i).sd], (S)[(i).se], (S)[(i).sf])
+#define BOX1(i,S) make_u32x ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7], (S)[(i).s8], (S)[(i).s9], (S)[(i).sa], (S)[(i).sb], (S)[(i).sc], (S)[(i).sd], (S)[(i).se], (S)[(i).sf])
 #endif
 
 DECLSPEC void _des_crypt_encrypt (u32 *iv, u32 *data, u32 *Kc, u32 *Kd, LOCAL_AS u32 (*s_SPtrans)[64])
@@ -554,8 +555,8 @@ KERNEL_FQ void m14100_mxx (KERN_ATTR_RULES ())
    * shared
    */
 
-  LOCAL_AS u32 s_SPtrans[8][64];
-  LOCAL_AS u32 s_skb[8][64];
+  LOCAL_VK u32 s_SPtrans[8][64];
+  LOCAL_VK u32 s_skb[8][64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
@@ -578,7 +579,7 @@ KERNEL_FQ void m14100_mxx (KERN_ATTR_RULES ())
     s_skb[7][i] = c_skb[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -689,8 +690,8 @@ KERNEL_FQ void m14100_sxx (KERN_ATTR_RULES ())
    * shared
    */
 
-  LOCAL_AS u32 s_SPtrans[8][64];
-  LOCAL_AS u32 s_skb[8][64];
+  LOCAL_VK u32 s_SPtrans[8][64];
+  LOCAL_VK u32 s_skb[8][64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
@@ -713,7 +714,7 @@ KERNEL_FQ void m14100_sxx (KERN_ATTR_RULES ())
     s_skb[7][i] = c_skb[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m14100_a1-pure.cl b/OpenCL/m14100_a1-pure.cl
index aa861b19f..923e3bf61 100644
--- a/OpenCL/m14100_a1-pure.cl
+++ b/OpenCL/m14100_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #endif
@@ -50,7 +51,7 @@
   PERM_OP (l, r, tt,  4, 0x0f0f0f0f);  \
 }
 
-CONSTANT_AS u32a c_SPtrans[8][64] =
+CONSTANT_VK u32a c_SPtrans[8][64] =
 {
   {
     0x02080800, 0x00080000, 0x02000002, 0x02080802,
@@ -198,7 +199,7 @@ CONSTANT_AS u32a c_SPtrans[8][64] =
   }
 };
 
-CONSTANT_AS u32a c_skb[8][64] =
+CONSTANT_VK u32a c_skb[8][64] =
 {
   {
     0x00000000, 0x00000010, 0x20000000, 0x20000010,
@@ -349,25 +350,25 @@ CONSTANT_AS u32a c_skb[8][64] =
 #if   VECT_SIZE == 1
 #define BOX(i,n,S) (S)[(n)][(i)]
 #elif VECT_SIZE == 2
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1])
 #elif VECT_SIZE == 4
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3])
 #elif VECT_SIZE == 8
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7])
 #elif VECT_SIZE == 16
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf])
 #endif
 
 #if   VECT_SIZE == 1
 #define BOX1(i,S) (S)[(i)]
 #elif VECT_SIZE == 2
-#define BOX1(i,S) (u32x) ((S)[(i).s0], (S)[(i).s1])
+#define BOX1(i,S) make_u32x ((S)[(i).s0], (S)[(i).s1])
 #elif VECT_SIZE == 4
-#define BOX1(i,S) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3])
+#define BOX1(i,S) make_u32x ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3])
 #elif VECT_SIZE == 8
-#define BOX1(i,S) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7])
+#define BOX1(i,S) make_u32x ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7])
 #elif VECT_SIZE == 16
-#define BOX1(i,S) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7], (S)[(i).s8], (S)[(i).s9], (S)[(i).sa], (S)[(i).sb], (S)[(i).sc], (S)[(i).sd], (S)[(i).se], (S)[(i).sf])
+#define BOX1(i,S) make_u32x ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7], (S)[(i).s8], (S)[(i).s9], (S)[(i).sa], (S)[(i).sb], (S)[(i).sc], (S)[(i).sd], (S)[(i).se], (S)[(i).sf])
 #endif
 
 DECLSPEC void _des_crypt_encrypt (u32 *iv, u32 *data, u32 *Kc, u32 *Kd, LOCAL_AS u32 (*s_SPtrans)[64])
@@ -544,8 +545,8 @@ KERNEL_FQ void m14100_mxx (KERN_ATTR_BASIC ())
    * shared
    */
 
-  LOCAL_AS u32 s_SPtrans[8][64];
-  LOCAL_AS u32 s_skb[8][64];
+  LOCAL_VK u32 s_SPtrans[8][64];
+  LOCAL_VK u32 s_skb[8][64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
@@ -568,7 +569,7 @@ KERNEL_FQ void m14100_mxx (KERN_ATTR_BASIC ())
     s_skb[7][i] = c_skb[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -727,8 +728,8 @@ KERNEL_FQ void m14100_sxx (KERN_ATTR_BASIC ())
    * shared
    */
 
-  LOCAL_AS u32 s_SPtrans[8][64];
-  LOCAL_AS u32 s_skb[8][64];
+  LOCAL_VK u32 s_SPtrans[8][64];
+  LOCAL_VK u32 s_skb[8][64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
@@ -751,7 +752,7 @@ KERNEL_FQ void m14100_sxx (KERN_ATTR_BASIC ())
     s_skb[7][i] = c_skb[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m14100_a3-pure.cl b/OpenCL/m14100_a3-pure.cl
index 1943413a8..4b46cf5e8 100644
--- a/OpenCL/m14100_a3-pure.cl
+++ b/OpenCL/m14100_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #endif
@@ -50,7 +51,7 @@
   PERM_OP (l, r, tt,  4, 0x0f0f0f0f);  \
 }
 
-CONSTANT_AS u32a c_SPtrans[8][64] =
+CONSTANT_VK u32a c_SPtrans[8][64] =
 {
   {
     0x02080800, 0x00080000, 0x02000002, 0x02080802,
@@ -198,7 +199,7 @@ CONSTANT_AS u32a c_SPtrans[8][64] =
   }
 };
 
-CONSTANT_AS u32a c_skb[8][64] =
+CONSTANT_VK u32a c_skb[8][64] =
 {
   {
     0x00000000, 0x00000010, 0x20000000, 0x20000010,
@@ -349,25 +350,25 @@ CONSTANT_AS u32a c_skb[8][64] =
 #if   VECT_SIZE == 1
 #define BOX(i,n,S) (S)[(n)][(i)]
 #elif VECT_SIZE == 2
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1])
 #elif VECT_SIZE == 4
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3])
 #elif VECT_SIZE == 8
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7])
 #elif VECT_SIZE == 16
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf])
 #endif
 
 #if   VECT_SIZE == 1
 #define BOX1(i,S) (S)[(i)]
 #elif VECT_SIZE == 2
-#define BOX1(i,S) (u32x) ((S)[(i).s0], (S)[(i).s1])
+#define BOX1(i,S) make_u32x ((S)[(i).s0], (S)[(i).s1])
 #elif VECT_SIZE == 4
-#define BOX1(i,S) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3])
+#define BOX1(i,S) make_u32x ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3])
 #elif VECT_SIZE == 8
-#define BOX1(i,S) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7])
+#define BOX1(i,S) make_u32x ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7])
 #elif VECT_SIZE == 16
-#define BOX1(i,S) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7], (S)[(i).s8], (S)[(i).s9], (S)[(i).sa], (S)[(i).sb], (S)[(i).sc], (S)[(i).sd], (S)[(i).se], (S)[(i).sf])
+#define BOX1(i,S) make_u32x ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7], (S)[(i).s8], (S)[(i).s9], (S)[(i).sa], (S)[(i).sb], (S)[(i).sc], (S)[(i).sd], (S)[(i).se], (S)[(i).sf])
 #endif
 
 DECLSPEC void _des_crypt_encrypt (u32x *iv, u32x *data, u32x *Kc, u32x *Kd, LOCAL_AS u32 (*s_SPtrans)[64])
@@ -726,8 +727,8 @@ KERNEL_FQ void m14100_mxx (KERN_ATTR_BASIC ())
    * shared
    */
 
-  LOCAL_AS u32 s_SPtrans[8][64];
-  LOCAL_AS u32 s_skb[8][64];
+  LOCAL_VK u32 s_SPtrans[8][64];
+  LOCAL_VK u32 s_skb[8][64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
@@ -750,7 +751,7 @@ KERNEL_FQ void m14100_mxx (KERN_ATTR_BASIC ())
     s_skb[7][i] = c_skb[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -800,8 +801,8 @@ KERNEL_FQ void m14100_sxx (KERN_ATTR_BASIC ())
    * shared
    */
 
-  LOCAL_AS u32 s_SPtrans[8][64];
-  LOCAL_AS u32 s_skb[8][64];
+  LOCAL_VK u32 s_SPtrans[8][64];
+  LOCAL_VK u32 s_skb[8][64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
@@ -824,7 +825,7 @@ KERNEL_FQ void m14100_sxx (KERN_ATTR_BASIC ())
     s_skb[7][i] = c_skb[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m14400_a0-optimized.cl b/OpenCL/m14400_a0-optimized.cl
index f6934b1d0..3f976a056 100644
--- a/OpenCL/m14400_a0-optimized.cl
+++ b/OpenCL/m14400_a0-optimized.cl
@@ -6,6 +6,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
@@ -14,15 +15,15 @@
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 DECLSPEC void append_4 (const u32 offset, u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 src_r0)
@@ -131,7 +132,7 @@ KERNEL_FQ void m14400_m04 (KERN_ATTR_RULES ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -142,7 +143,7 @@ KERNEL_FQ void m14400_m04 (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -402,7 +403,7 @@ KERNEL_FQ void m14400_s04 (KERN_ATTR_RULES ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -413,7 +414,7 @@ KERNEL_FQ void m14400_s04 (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m14400_a0-pure.cl b/OpenCL/m14400_a0-pure.cl
index 9271e5ae0..f57dbe814 100644
--- a/OpenCL/m14400_a0-pure.cl
+++ b/OpenCL/m14400_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
@@ -16,15 +17,15 @@
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m14400_mxx (KERN_ATTR_RULES ())
@@ -41,7 +42,7 @@ KERNEL_FQ void m14400_mxx (KERN_ATTR_RULES ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -52,7 +53,7 @@ KERNEL_FQ void m14400_mxx (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -288,7 +289,7 @@ KERNEL_FQ void m14400_sxx (KERN_ATTR_RULES ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -299,7 +300,7 @@ KERNEL_FQ void m14400_sxx (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m14400_a1-optimized.cl b/OpenCL/m14400_a1-optimized.cl
index a9259b61c..3bbabbe4b 100644
--- a/OpenCL/m14400_a1-optimized.cl
+++ b/OpenCL/m14400_a1-optimized.cl
@@ -8,21 +8,22 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 DECLSPEC void append_4 (const u32 offset, u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 src_r0)
@@ -131,7 +132,7 @@ KERNEL_FQ void m14400_m04 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -142,7 +143,7 @@ KERNEL_FQ void m14400_m04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -466,7 +467,7 @@ KERNEL_FQ void m14400_s04 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -477,7 +478,7 @@ KERNEL_FQ void m14400_s04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m14400_a1-pure.cl b/OpenCL/m14400_a1-pure.cl
index 88bf46bad..77d5c7e8a 100644
--- a/OpenCL/m14400_a1-pure.cl
+++ b/OpenCL/m14400_a1-pure.cl
@@ -8,21 +8,22 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_sha1.cl"
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m14400_mxx (KERN_ATTR_BASIC ())
@@ -39,7 +40,7 @@ KERNEL_FQ void m14400_mxx (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -50,7 +51,7 @@ KERNEL_FQ void m14400_mxx (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -284,7 +285,7 @@ KERNEL_FQ void m14400_sxx (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -295,7 +296,7 @@ KERNEL_FQ void m14400_sxx (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m14400_a3-optimized.cl b/OpenCL/m14400_a3-optimized.cl
index 07938f501..e30be7856 100644
--- a/OpenCL/m14400_a3-optimized.cl
+++ b/OpenCL/m14400_a3-optimized.cl
@@ -8,21 +8,22 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 DECLSPEC void append_4 (const u32 offset, u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 src_r0)
@@ -641,7 +642,7 @@ KERNEL_FQ void m14400_m04 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -652,7 +653,7 @@ KERNEL_FQ void m14400_m04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -711,7 +712,7 @@ KERNEL_FQ void m14400_m08 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -722,7 +723,7 @@ KERNEL_FQ void m14400_m08 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -781,7 +782,7 @@ KERNEL_FQ void m14400_m16 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -792,7 +793,7 @@ KERNEL_FQ void m14400_m16 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -851,7 +852,7 @@ KERNEL_FQ void m14400_s04 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -862,7 +863,7 @@ KERNEL_FQ void m14400_s04 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -921,7 +922,7 @@ KERNEL_FQ void m14400_s08 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -932,7 +933,7 @@ KERNEL_FQ void m14400_s08 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -991,7 +992,7 @@ KERNEL_FQ void m14400_s16 (KERN_ATTR_BASIC ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -1002,7 +1003,7 @@ KERNEL_FQ void m14400_s16 (KERN_ATTR_BASIC ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m14400_a3-pure.cl b/OpenCL/m14400_a3-pure.cl
index 7976d7bc5..324cf988e 100644
--- a/OpenCL/m14400_a3-pure.cl
+++ b/OpenCL/m14400_a3-pure.cl
@@ -8,21 +8,22 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m14400_mxx (KERN_ATTR_VECTOR ())
@@ -39,7 +40,7 @@ KERNEL_FQ void m14400_mxx (KERN_ATTR_VECTOR ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -50,7 +51,7 @@ KERNEL_FQ void m14400_mxx (KERN_ATTR_VECTOR ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -309,7 +310,7 @@ KERNEL_FQ void m14400_sxx (KERN_ATTR_VECTOR ())
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -320,7 +321,7 @@ KERNEL_FQ void m14400_sxx (KERN_ATTR_VECTOR ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m14611-pure.cl b/OpenCL/m14611-pure.cl
index 3acddaf75..6869390c2 100644
--- a/OpenCL/m14611-pure.cl
+++ b/OpenCL/m14611-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
@@ -303,17 +304,17 @@ KERNEL_FQ void m14611_comp (KERN_ATTR_TMPS_ESALT (luks_tmp_t, luks_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -330,7 +331,7 @@ KERNEL_FQ void m14611_comp (KERN_ATTR_TMPS_ESALT (luks_tmp_t, luks_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m14612-pure.cl b/OpenCL/m14612-pure.cl
index b131dfa01..1ba6880c9 100644
--- a/OpenCL/m14612-pure.cl
+++ b/OpenCL/m14612-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m14613-pure.cl b/OpenCL/m14613-pure.cl
index f3e07ee90..de1ee77b3 100644
--- a/OpenCL/m14613-pure.cl
+++ b/OpenCL/m14613-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m14621-pure.cl b/OpenCL/m14621-pure.cl
index 058c6e8a2..69d0f8582 100644
--- a/OpenCL/m14621-pure.cl
+++ b/OpenCL/m14621-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
@@ -342,17 +343,17 @@ KERNEL_FQ void m14621_comp (KERN_ATTR_TMPS_ESALT (luks_tmp_t, luks_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -369,7 +370,7 @@ KERNEL_FQ void m14621_comp (KERN_ATTR_TMPS_ESALT (luks_tmp_t, luks_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m14622-pure.cl b/OpenCL/m14622-pure.cl
index 0d136b0c8..565060adc 100644
--- a/OpenCL/m14622-pure.cl
+++ b/OpenCL/m14622-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m14623-pure.cl b/OpenCL/m14623-pure.cl
index da5a03de2..81066c19b 100644
--- a/OpenCL/m14623-pure.cl
+++ b/OpenCL/m14623-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m14631-pure.cl b/OpenCL/m14631-pure.cl
index 99da8e3fb..d85572cb6 100644
--- a/OpenCL/m14631-pure.cl
+++ b/OpenCL/m14631-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
@@ -398,17 +399,17 @@ KERNEL_FQ void m14631_comp (KERN_ATTR_TMPS_ESALT (luks_tmp_t, luks_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -425,7 +426,7 @@ KERNEL_FQ void m14631_comp (KERN_ATTR_TMPS_ESALT (luks_tmp_t, luks_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m14632-pure.cl b/OpenCL/m14632-pure.cl
index 7e036f33c..0e097ba13 100644
--- a/OpenCL/m14632-pure.cl
+++ b/OpenCL/m14632-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m14633-pure.cl b/OpenCL/m14633-pure.cl
index 976be1605..c019f23f0 100644
--- a/OpenCL/m14633-pure.cl
+++ b/OpenCL/m14633-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m14641-pure.cl b/OpenCL/m14641-pure.cl
index 4007b8a10..19e9829c0 100644
--- a/OpenCL/m14641-pure.cl
+++ b/OpenCL/m14641-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
@@ -303,17 +304,17 @@ KERNEL_FQ void m14641_comp (KERN_ATTR_TMPS_ESALT (luks_tmp_t, luks_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -330,7 +331,7 @@ KERNEL_FQ void m14641_comp (KERN_ATTR_TMPS_ESALT (luks_tmp_t, luks_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m14642-pure.cl b/OpenCL/m14642-pure.cl
index ca8459c58..b5fd5441a 100644
--- a/OpenCL/m14642-pure.cl
+++ b/OpenCL/m14642-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m14643-pure.cl b/OpenCL/m14643-pure.cl
index 4fbb4df32..34ab7d4d1 100644
--- a/OpenCL/m14643-pure.cl
+++ b/OpenCL/m14643-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m14700-pure.cl b/OpenCL/m14700-pure.cl
index fb3673d67..fab345353 100644
--- a/OpenCL/m14700-pure.cl
+++ b/OpenCL/m14700-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
@@ -239,17 +240,17 @@ KERNEL_FQ void m14700_comp (KERN_ATTR_TMPS_ESALT (pbkdf2_sha1_tmp_t, itunes_back
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -266,7 +267,7 @@ KERNEL_FQ void m14700_comp (KERN_ATTR_TMPS_ESALT (pbkdf2_sha1_tmp_t, itunes_back
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m14800-pure.cl b/OpenCL/m14800-pure.cl
index 578fe9716..93495bfbb 100644
--- a/OpenCL/m14800-pure.cl
+++ b/OpenCL/m14800-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
@@ -503,17 +504,17 @@ KERNEL_FQ void m14800_comp (KERN_ATTR_TMPS_ESALT (pbkdf2_sha256_tmp_t, itunes_ba
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -530,7 +531,7 @@ KERNEL_FQ void m14800_comp (KERN_ATTR_TMPS_ESALT (pbkdf2_sha256_tmp_t, itunes_ba
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m14900_a0-optimized.cl b/OpenCL/m14900_a0-optimized.cl
index 36ca69d9c..f2d6f4c98 100644
--- a/OpenCL/m14900_a0-optimized.cl
+++ b/OpenCL/m14900_a0-optimized.cl
@@ -9,13 +9,14 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
 #include "inc_simd.cl"
 #endif
 
-CONSTANT_AS u8a c_ftable[256] =
+CONSTANT_VK u8a c_ftable[256] =
 {
   0xa3, 0xd7, 0x09, 0x83, 0xf8, 0x48, 0xf6, 0xf4,
   0xb3, 0x21, 0x15, 0x78, 0x99, 0xb1, 0xaf, 0xf9,
@@ -118,14 +119,14 @@ KERNEL_FQ void m14900_m04 (KERN_ATTR_RULES ())
    * s_ftable
    */
 
-  LOCAL_AS u8 s_ftable[256];
+  LOCAL_VK u8 s_ftable[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
     s_ftable[i] = c_ftable[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -209,14 +210,14 @@ KERNEL_FQ void m14900_s04 (KERN_ATTR_RULES ())
    * s_ftable
    */
 
-  LOCAL_AS u8 s_ftable[256];
+  LOCAL_VK u8 s_ftable[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
     s_ftable[i] = c_ftable[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m14900_a1-optimized.cl b/OpenCL/m14900_a1-optimized.cl
index 42c7dd6c9..14722335c 100644
--- a/OpenCL/m14900_a1-optimized.cl
+++ b/OpenCL/m14900_a1-optimized.cl
@@ -9,11 +9,12 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #endif
 
-CONSTANT_AS u8a c_ftable[256] =
+CONSTANT_VK u8a c_ftable[256] =
 {
   0xa3, 0xd7, 0x09, 0x83, 0xf8, 0x48, 0xf6, 0xf4,
   0xb3, 0x21, 0x15, 0x78, 0x99, 0xb1, 0xaf, 0xf9,
@@ -116,14 +117,14 @@ KERNEL_FQ void m14900_m04 (KERN_ATTR_BASIC ())
    * s_ftable
    */
 
-  LOCAL_AS u8 s_ftable[256];
+  LOCAL_VK u8 s_ftable[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
     s_ftable[i] = c_ftable[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -271,14 +272,14 @@ KERNEL_FQ void m14900_s04 (KERN_ATTR_BASIC ())
    * s_ftable
    */
 
-  LOCAL_AS u8 s_ftable[256];
+  LOCAL_VK u8 s_ftable[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
     s_ftable[i] = c_ftable[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m14900_a3-optimized.cl b/OpenCL/m14900_a3-optimized.cl
index b8a7ce4b1..bd89b8874 100644
--- a/OpenCL/m14900_a3-optimized.cl
+++ b/OpenCL/m14900_a3-optimized.cl
@@ -9,11 +9,12 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #endif
 
-CONSTANT_AS u8a c_ftable[256] =
+CONSTANT_VK u8a c_ftable[256] =
 {
   0xa3, 0xd7, 0x09, 0x83, 0xf8, 0x48, 0xf6, 0xf4,
   0xb3, 0x21, 0x15, 0x78, 0x99, 0xb1, 0xaf, 0xf9,
@@ -224,14 +225,14 @@ KERNEL_FQ void m14900_m04 (KERN_ATTR_BASIC ())
    * sbox, kbox
    */
 
-  LOCAL_AS u8 s_ftable[256];
+  LOCAL_VK u8 s_ftable[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
     s_ftable[i] = c_ftable[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -298,14 +299,14 @@ KERNEL_FQ void m14900_s04 (KERN_ATTR_BASIC ())
    * sbox, kbox
    */
 
-  LOCAL_AS u8 s_ftable[256];
+  LOCAL_VK u8 s_ftable[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
     s_ftable[i] = c_ftable[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m15000_a0-optimized.cl b/OpenCL/m15000_a0-optimized.cl
index 428997e98..a4fe67e48 100644
--- a/OpenCL/m15000_a0-optimized.cl
+++ b/OpenCL/m15000_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
diff --git a/OpenCL/m15000_a0-pure.cl b/OpenCL/m15000_a0-pure.cl
index eacbad4d3..9591d5555 100644
--- a/OpenCL/m15000_a0-pure.cl
+++ b/OpenCL/m15000_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
diff --git a/OpenCL/m15000_a1-optimized.cl b/OpenCL/m15000_a1-optimized.cl
index 7257d2c19..e410b3102 100644
--- a/OpenCL/m15000_a1-optimized.cl
+++ b/OpenCL/m15000_a1-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha512.cl"
diff --git a/OpenCL/m15000_a1-pure.cl b/OpenCL/m15000_a1-pure.cl
index 54a3c6d98..6ec8d1576 100644
--- a/OpenCL/m15000_a1-pure.cl
+++ b/OpenCL/m15000_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_sha512.cl"
diff --git a/OpenCL/m15000_a3-optimized.cl b/OpenCL/m15000_a3-optimized.cl
index 0021f624d..5ff6d7bfb 100644
--- a/OpenCL/m15000_a3-optimized.cl
+++ b/OpenCL/m15000_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha512.cl"
diff --git a/OpenCL/m15000_a3-pure.cl b/OpenCL/m15000_a3-pure.cl
index a15aa67d6..9e652f284 100644
--- a/OpenCL/m15000_a3-pure.cl
+++ b/OpenCL/m15000_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha512.cl"
diff --git a/OpenCL/m15100-pure.cl b/OpenCL/m15100-pure.cl
index 18756cfa4..7b09ec0be 100644
--- a/OpenCL/m15100-pure.cl
+++ b/OpenCL/m15100-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m15300-pure.cl b/OpenCL/m15300-pure.cl
index b1c8fb0b4..9a2b01cf3 100644
--- a/OpenCL/m15300-pure.cl
+++ b/OpenCL/m15300-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md4.cl"
@@ -383,8 +384,8 @@ KERNEL_FQ void m15300_comp (KERN_ATTR_TMPS_ESALT (dpapimk_tmp_v1_t, dpapimk_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_SPtrans[8][64];
-  LOCAL_AS u32 s_skb[8][64];
+  LOCAL_VK u32 s_SPtrans[8][64];
+  LOCAL_VK u32 s_skb[8][64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
@@ -407,7 +408,7 @@ KERNEL_FQ void m15300_comp (KERN_ATTR_TMPS_ESALT (dpapimk_tmp_v1_t, dpapimk_t))
     s_skb[7][i] = c_skb[7][i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m15400_a0-optimized.cl b/OpenCL/m15400_a0-optimized.cl
index a24157254..294e086e0 100644
--- a/OpenCL/m15400_a0-optimized.cl
+++ b/OpenCL/m15400_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
diff --git a/OpenCL/m15400_a1-optimized.cl b/OpenCL/m15400_a1-optimized.cl
index 7508ce935..94de59707 100644
--- a/OpenCL/m15400_a1-optimized.cl
+++ b/OpenCL/m15400_a1-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #endif
diff --git a/OpenCL/m15400_a3-optimized.cl b/OpenCL/m15400_a3-optimized.cl
index ae57ac965..d2a6d962d 100644
--- a/OpenCL/m15400_a3-optimized.cl
+++ b/OpenCL/m15400_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #endif
diff --git a/OpenCL/m15500_a0-optimized.cl b/OpenCL/m15500_a0-optimized.cl
index 6644adc51..bc5ae795c 100644
--- a/OpenCL/m15500_a0-optimized.cl
+++ b/OpenCL/m15500_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
diff --git a/OpenCL/m15500_a0-pure.cl b/OpenCL/m15500_a0-pure.cl
index eb39484fe..7286c3b64 100644
--- a/OpenCL/m15500_a0-pure.cl
+++ b/OpenCL/m15500_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
diff --git a/OpenCL/m15500_a1-optimized.cl b/OpenCL/m15500_a1-optimized.cl
index 116bf8485..e64b33a60 100644
--- a/OpenCL/m15500_a1-optimized.cl
+++ b/OpenCL/m15500_a1-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m15500_a1-pure.cl b/OpenCL/m15500_a1-pure.cl
index f6555c5e0..e93c0838e 100644
--- a/OpenCL/m15500_a1-pure.cl
+++ b/OpenCL/m15500_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m15500_a3-optimized.cl b/OpenCL/m15500_a3-optimized.cl
index 62c352eea..ad20fcf90 100644
--- a/OpenCL/m15500_a3-optimized.cl
+++ b/OpenCL/m15500_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m15500_a3-pure.cl b/OpenCL/m15500_a3-pure.cl
index 7c35074be..1cecef4b8 100644
--- a/OpenCL/m15500_a3-pure.cl
+++ b/OpenCL/m15500_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m15600-pure.cl b/OpenCL/m15600-pure.cl
index 950840f57..e6bad07b5 100644
--- a/OpenCL/m15600-pure.cl
+++ b/OpenCL/m15600-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha256.cl"
@@ -33,7 +34,7 @@ typedef struct ethereum_pbkdf2
 #define COMPARE_S "inc_comp_single.cl"
 #define COMPARE_M "inc_comp_multi.cl"
 
-CONSTANT_AS u64a keccakf_rndc[24] =
+CONSTANT_VK u64a keccakf_rndc[24] =
 {
   0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
   0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
diff --git a/OpenCL/m15700-pure.cl b/OpenCL/m15700-pure.cl
index ac3e849a9..2bd7b709c 100644
--- a/OpenCL/m15700-pure.cl
+++ b/OpenCL/m15700-pure.cl
@@ -6,6 +6,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_hash_sha256.cl"
 #endif
@@ -23,6 +24,23 @@ typedef struct
 
 } scrypt_tmp_t;
 
+#ifdef IS_CUDA
+
+inline __device__ uint4 operator &  (const uint4  a, const u32   b) { return make_uint4 ((a.x &  b  ), (a.y &  b  ), (a.z &  b  ), (a.w &  b  ));  }
+inline __device__ uint4 operator << (const uint4  a, const u32   b) { return make_uint4 ((a.x << b  ), (a.y << b  ), (a.z << b  ), (a.w << b  ));  }
+inline __device__ uint4 operator >> (const uint4  a, const u32   b) { return make_uint4 ((a.x >> b  ), (a.y >> b  ), (a.z >> b  ), (a.w >> b  ));  }
+inline __device__ uint4 operator +  (const uint4  a, const uint4 b) { return make_uint4 ((a.x +  b.x), (a.y +  b.y), (a.z +  b.z), (a.w +  b.w));  }
+inline __device__ uint4 operator ^  (const uint4  a, const uint4 b) { return make_uint4 ((a.x ^  b.x), (a.y ^  b.y), (a.z ^  b.z), (a.w ^  b.w));  }
+inline __device__ uint4 operator |  (const uint4  a, const uint4 b) { return make_uint4 ((a.x |  b.x), (a.y |  b.y), (a.z |  b.z), (a.w |  b.w));  }
+inline __device__ uint4 operator ^= (      uint4 &a, const uint4 b) {                     a.x ^= b.x;   a.y ^= b.y;   a.z ^= b.z;   a.w ^= b.w;    }
+
+inline __device__ uint4 rotate (const uint4 a, const int n)
+{
+  return ((a >> n) | ((a >> (32 - n))));
+}
+
+#endif
+
 typedef struct ethereum_scrypt
 {
   u32 salt_buf[16];
@@ -46,26 +64,50 @@ DECLSPEC uint4 hc_swap32_4 (uint4 v)
 
 #define ADD_ROTATE_XOR(r,i1,i2,s) (r) ^= rotate ((i1) + (i2), (s));
 
-#define SALSA20_2R()                \
-{                                   \
-  ADD_ROTATE_XOR (X1, X0, X3,  7);  \
-  ADD_ROTATE_XOR (X2, X1, X0,  9);  \
-  ADD_ROTATE_XOR (X3, X2, X1, 13);  \
-  ADD_ROTATE_XOR (X0, X3, X2, 18);  \
-                                    \
-  X1 = X1.s3012;                    \
-  X2 = X2.s2301;                    \
-  X3 = X3.s1230;                    \
-                                    \
-  ADD_ROTATE_XOR (X3, X0, X1,  7);  \
-  ADD_ROTATE_XOR (X2, X3, X0,  9);  \
-  ADD_ROTATE_XOR (X1, X2, X3, 13);  \
-  ADD_ROTATE_XOR (X0, X1, X2, 18);  \
-                                    \
-  X1 = X1.s1230;                    \
-  X2 = X2.s2301;                    \
-  X3 = X3.s3012;                    \
+#ifdef IS_CUDA
+
+#define SALSA20_2R()                        \
+{                                           \
+  ADD_ROTATE_XOR (X1, X0, X3,  7);          \
+  ADD_ROTATE_XOR (X2, X1, X0,  9);          \
+  ADD_ROTATE_XOR (X3, X2, X1, 13);          \
+  ADD_ROTATE_XOR (X0, X3, X2, 18);          \
+                                            \
+  X1 = make_uint4 (X1.w, X1.x, X1.y, X1.z); \
+  X2 = make_uint4 (X2.z, X2.w, X2.x, X2.y); \
+  X3 = make_uint4 (X3.y, X3.z, X3.w, X3.x); \
+                                            \
+  ADD_ROTATE_XOR (X3, X0, X1,  7);          \
+  ADD_ROTATE_XOR (X2, X3, X0,  9);          \
+  ADD_ROTATE_XOR (X1, X2, X3, 13);          \
+  ADD_ROTATE_XOR (X0, X1, X2, 18);          \
+                                            \
+  X1 = make_uint4 (X1.y, X1.z, X1.w, X1.x); \
+  X2 = make_uint4 (X2.z, X2.w, X2.x, X2.y); \
+  X3 = make_uint4 (X3.w, X3.x, X3.y, X3.z); \
 }
+#else
+#define SALSA20_2R()                        \
+{                                           \
+  ADD_ROTATE_XOR (X1, X0, X3,  7);          \
+  ADD_ROTATE_XOR (X2, X1, X0,  9);          \
+  ADD_ROTATE_XOR (X3, X2, X1, 13);          \
+  ADD_ROTATE_XOR (X0, X3, X2, 18);          \
+                                            \
+  X1 = X1.s3012;                            \
+  X2 = X2.s2301;                            \
+  X3 = X3.s1230;                            \
+                                            \
+  ADD_ROTATE_XOR (X3, X0, X1,  7);          \
+  ADD_ROTATE_XOR (X2, X3, X0,  9);          \
+  ADD_ROTATE_XOR (X1, X2, X3, 13);          \
+  ADD_ROTATE_XOR (X0, X1, X2, 18);          \
+                                            \
+  X1 = X1.s1230;                            \
+  X2 = X2.s2301;                            \
+  X3 = X3.s3012;                            \
+}
+#endif
 
 #define SALSA20_8_XOR() \
 {                       \
@@ -170,10 +212,17 @@ DECLSPEC void scrypt_smix (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS ui
   #endif
   for (u32 i = 0; i < STATE_CNT4; i += 4)
   {
+    #ifdef IS_CUDA
+    T[0] = make_uint4 (X[i + 0].x, X[i + 1].y, X[i + 2].z, X[i + 3].w);
+    T[1] = make_uint4 (X[i + 1].x, X[i + 2].y, X[i + 3].z, X[i + 0].w);
+    T[2] = make_uint4 (X[i + 2].x, X[i + 3].y, X[i + 0].z, X[i + 1].w);
+    T[3] = make_uint4 (X[i + 3].x, X[i + 0].y, X[i + 1].z, X[i + 2].w);
+    #else
     T[0] = (uint4) (X[i + 0].x, X[i + 1].y, X[i + 2].z, X[i + 3].w);
     T[1] = (uint4) (X[i + 1].x, X[i + 2].y, X[i + 3].z, X[i + 0].w);
     T[2] = (uint4) (X[i + 2].x, X[i + 3].y, X[i + 0].z, X[i + 1].w);
     T[3] = (uint4) (X[i + 3].x, X[i + 0].y, X[i + 1].z, X[i + 2].w);
+    #endif
 
     X[i + 0] = T[0];
     X[i + 1] = T[1];
@@ -210,10 +259,17 @@ DECLSPEC void scrypt_smix (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS ui
   #endif
   for (u32 i = 0; i < STATE_CNT4; i += 4)
   {
+    #ifdef IS_CUDA
+    T[0] = make_uint4 (X[i + 0].x, X[i + 3].y, X[i + 2].z, X[i + 1].w);
+    T[1] = make_uint4 (X[i + 1].x, X[i + 0].y, X[i + 3].z, X[i + 2].w);
+    T[2] = make_uint4 (X[i + 2].x, X[i + 1].y, X[i + 0].z, X[i + 3].w);
+    T[3] = make_uint4 (X[i + 3].x, X[i + 2].y, X[i + 1].z, X[i + 0].w);
+    #else
     T[0] = (uint4) (X[i + 0].x, X[i + 3].y, X[i + 2].z, X[i + 1].w);
     T[1] = (uint4) (X[i + 1].x, X[i + 0].y, X[i + 3].z, X[i + 2].w);
     T[2] = (uint4) (X[i + 2].x, X[i + 1].y, X[i + 0].z, X[i + 3].w);
     T[3] = (uint4) (X[i + 3].x, X[i + 2].y, X[i + 1].z, X[i + 0].w);
+    #endif
 
     X[i + 0] = T[0];
     X[i + 1] = T[1];
@@ -260,7 +316,7 @@ DECLSPEC void scrypt_smix (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS ui
   st[4 + s] ^= ~bc0 & bc1;      \
 }
 
-CONSTANT_AS u64a keccakf_rndc[24] =
+CONSTANT_VK u64a keccakf_rndc[24] =
 {
   0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
   0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
@@ -410,8 +466,13 @@ KERNEL_FQ void m15700_init (KERN_ATTR_TMPS_ESALT (scrypt_tmp_t, ethereum_scrypt_
     digest[6] = sha256_hmac_ctx2.opad.h[6];
     digest[7] = sha256_hmac_ctx2.opad.h[7];
 
+    #ifdef IS_CUDA
+    const uint4 tmp0 = make_uint4 (digest[0], digest[1], digest[2], digest[3]);
+    const uint4 tmp1 = make_uint4 (digest[4], digest[5], digest[6], digest[7]);
+    #else
     const uint4 tmp0 = (uint4) (digest[0], digest[1], digest[2], digest[3]);
     const uint4 tmp1 = (uint4) (digest[4], digest[5], digest[6], digest[7]);
+    #endif
 
     tmps[gid].P[k + 0] = tmp0;
     tmps[gid].P[k + 1] = tmp1;
@@ -424,10 +485,10 @@ KERNEL_FQ void m15700_loop (KERN_ATTR_TMPS_ESALT (scrypt_tmp_t, ethereum_scrypt_
 
   if (gid >= gid_max) return;
 
-  GLOBAL_AS uint4 *d_scrypt0_buf = d_extra0_buf;
-  GLOBAL_AS uint4 *d_scrypt1_buf = d_extra1_buf;
-  GLOBAL_AS uint4 *d_scrypt2_buf = d_extra2_buf;
-  GLOBAL_AS uint4 *d_scrypt3_buf = d_extra3_buf;
+  GLOBAL_AS uint4 *d_scrypt0_buf = (GLOBAL_AS uint4 *) d_extra0_buf;
+  GLOBAL_AS uint4 *d_scrypt1_buf = (GLOBAL_AS uint4 *) d_extra1_buf;
+  GLOBAL_AS uint4 *d_scrypt2_buf = (GLOBAL_AS uint4 *) d_extra2_buf;
+  GLOBAL_AS uint4 *d_scrypt3_buf = (GLOBAL_AS uint4 *) d_extra3_buf;
 
   uint4 X[STATE_CNT4];
   uint4 T[STATE_CNT4];
@@ -486,31 +547,31 @@ KERNEL_FQ void m15700_comp (KERN_ATTR_TMPS_ESALT (scrypt_tmp_t, ethereum_scrypt_
 
     tmp = tmps[gid].P[l + 0];
 
-    w0[0] = tmp.s0;
-    w0[1] = tmp.s1;
-    w0[2] = tmp.s2;
-    w0[3] = tmp.s3;
+    w0[0] = tmp.x;
+    w0[1] = tmp.y;
+    w0[2] = tmp.z;
+    w0[3] = tmp.w;
 
     tmp = tmps[gid].P[l + 1];
 
-    w1[0] = tmp.s0;
-    w1[1] = tmp.s1;
-    w1[2] = tmp.s2;
-    w1[3] = tmp.s3;
+    w1[0] = tmp.x;
+    w1[1] = tmp.y;
+    w1[2] = tmp.z;
+    w1[3] = tmp.w;
 
     tmp = tmps[gid].P[l + 2];
 
-    w2[0] = tmp.s0;
-    w2[1] = tmp.s1;
-    w2[2] = tmp.s2;
-    w2[3] = tmp.s3;
+    w2[0] = tmp.x;
+    w2[1] = tmp.y;
+    w2[2] = tmp.z;
+    w2[3] = tmp.w;
 
     tmp = tmps[gid].P[l + 3];
 
-    w3[0] = tmp.s0;
-    w3[1] = tmp.s1;
-    w3[2] = tmp.s2;
-    w3[3] = tmp.s3;
+    w3[0] = tmp.x;
+    w3[1] = tmp.y;
+    w3[2] = tmp.z;
+    w3[3] = tmp.w;
 
     sha256_hmac_update_64 (&ctx, w0, w1, w2, w3, 64);
   }
diff --git a/OpenCL/m15900-pure.cl b/OpenCL/m15900-pure.cl
index dbb59c80a..2c2dd5adb 100644
--- a/OpenCL/m15900-pure.cl
+++ b/OpenCL/m15900-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md4.cl"
@@ -512,17 +513,17 @@ KERNEL_FQ void m15900_comp (KERN_ATTR_TMPS_ESALT (dpapimk_tmp_v2_t, dpapimk_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -539,7 +540,7 @@ KERNEL_FQ void m15900_comp (KERN_ATTR_TMPS_ESALT (dpapimk_tmp_v2_t, dpapimk_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m16000_a0-pure.cl b/OpenCL/m16000_a0-pure.cl
index ffbc3a6e2..7f55f09b6 100644
--- a/OpenCL/m16000_a0-pure.cl
+++ b/OpenCL/m16000_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
@@ -34,7 +35,7 @@
   a  = a ^ tt;              \
 }
 
-CONSTANT_AS u32a c_SPtrans[8][64] =
+CONSTANT_VK u32a c_SPtrans[8][64] =
 {
   {
     0x00820200, 0x00020000, 0x80800000, 0x80820200,
@@ -182,7 +183,7 @@ CONSTANT_AS u32a c_SPtrans[8][64] =
   },
 };
 
-CONSTANT_AS u32a c_skb[8][64] =
+CONSTANT_VK u32a c_skb[8][64] =
 {
   {
     0x00000000, 0x00000010, 0x20000000, 0x20000010,
@@ -330,7 +331,7 @@ CONSTANT_AS u32a c_skb[8][64] =
   },
 };
 
-CONSTANT_AS u32a c_tripcode_salt[128] =
+CONSTANT_VK u32a c_tripcode_salt[128] =
 {
   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
@@ -345,13 +346,13 @@ CONSTANT_AS u32a c_tripcode_salt[128] =
 #if   VECT_SIZE == 1
 #define BOX(i,n,S) (S)[(n)][(i)]
 #elif VECT_SIZE == 2
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1])
 #elif VECT_SIZE == 4
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3])
 #elif VECT_SIZE == 8
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7])
 #elif VECT_SIZE == 16
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf])
 #endif
 
 DECLSPEC void _des_crypt_keysetup (u32 c, u32x d, u32x *Kc, u32x *Kd, LOCAL_AS u32 (*s_skb)[64])
@@ -507,8 +508,8 @@ KERNEL_FQ void m16000_mxx (KERN_ATTR_RULES ())
    * sbox, kbox
    */
 
-  LOCAL_AS u32 s_SPtrans[8][64];
-  LOCAL_AS u32 s_skb[8][64];
+  LOCAL_VK u32 s_SPtrans[8][64];
+  LOCAL_VK u32 s_skb[8][64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
@@ -531,14 +532,14 @@ KERNEL_FQ void m16000_mxx (KERN_ATTR_RULES ())
     s_skb[7][i] = c_skb[7][i];
   }
 
-  LOCAL_AS u32 s_tripcode_salt[128];
+  LOCAL_VK u32 s_tripcode_salt[128];
 
   for (u32 i = lid; i < 128; i += lsz)
   {
     s_tripcode_salt[i] = c_tripcode_salt[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -600,8 +601,8 @@ KERNEL_FQ void m16000_sxx (KERN_ATTR_RULES ())
    * sbox, kbox
    */
 
-  LOCAL_AS u32 s_SPtrans[8][64];
-  LOCAL_AS u32 s_skb[8][64];
+  LOCAL_VK u32 s_SPtrans[8][64];
+  LOCAL_VK u32 s_skb[8][64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
@@ -624,14 +625,14 @@ KERNEL_FQ void m16000_sxx (KERN_ATTR_RULES ())
     s_skb[7][i] = c_skb[7][i];
   }
 
-  LOCAL_AS u32 s_tripcode_salt[128];
+  LOCAL_VK u32 s_tripcode_salt[128];
 
   for (u32 i = lid; i < 128; i += lsz)
   {
     s_tripcode_salt[i] = c_tripcode_salt[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m16000_a1-pure.cl b/OpenCL/m16000_a1-pure.cl
index c5af56fce..8c9900a0a 100644
--- a/OpenCL/m16000_a1-pure.cl
+++ b/OpenCL/m16000_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #endif
@@ -32,7 +33,7 @@
   a  = a ^ tt;              \
 }
 
-CONSTANT_AS u32a c_SPtrans[8][64] =
+CONSTANT_VK u32a c_SPtrans[8][64] =
 {
   {
     0x00820200, 0x00020000, 0x80800000, 0x80820200,
@@ -180,7 +181,7 @@ CONSTANT_AS u32a c_SPtrans[8][64] =
   },
 };
 
-CONSTANT_AS u32a c_skb[8][64] =
+CONSTANT_VK u32a c_skb[8][64] =
 {
   {
     0x00000000, 0x00000010, 0x20000000, 0x20000010,
@@ -328,7 +329,7 @@ CONSTANT_AS u32a c_skb[8][64] =
   },
 };
 
-CONSTANT_AS u32a c_tripcode_salt[128] =
+CONSTANT_VK u32a c_tripcode_salt[128] =
 {
   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
@@ -343,13 +344,13 @@ CONSTANT_AS u32a c_tripcode_salt[128] =
 #if   VECT_SIZE == 1
 #define BOX(i,n,S) (S)[(n)][(i)]
 #elif VECT_SIZE == 2
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1])
 #elif VECT_SIZE == 4
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3])
 #elif VECT_SIZE == 8
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7])
 #elif VECT_SIZE == 16
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf])
 #endif
 
 DECLSPEC void _des_crypt_keysetup (u32 c, u32x d, u32x *Kc, u32x *Kd, LOCAL_AS u32 (*s_skb)[64])
@@ -505,8 +506,8 @@ KERNEL_FQ void m16000_mxx (KERN_ATTR_BASIC ())
    * sbox, kbox
    */
 
-  LOCAL_AS u32 s_SPtrans[8][64];
-  LOCAL_AS u32 s_skb[8][64];
+  LOCAL_VK u32 s_SPtrans[8][64];
+  LOCAL_VK u32 s_skb[8][64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
@@ -529,14 +530,14 @@ KERNEL_FQ void m16000_mxx (KERN_ATTR_BASIC ())
     s_skb[7][i] = c_skb[7][i];
   }
 
-  LOCAL_AS u32 s_tripcode_salt[128];
+  LOCAL_VK u32 s_tripcode_salt[128];
 
   for (u32 i = lid; i < 128; i += lsz)
   {
     s_tripcode_salt[i] = c_tripcode_salt[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -677,8 +678,8 @@ KERNEL_FQ void m16000_sxx (KERN_ATTR_BASIC ())
    * sbox, kbox
    */
 
-  LOCAL_AS u32 s_SPtrans[8][64];
-  LOCAL_AS u32 s_skb[8][64];
+  LOCAL_VK u32 s_SPtrans[8][64];
+  LOCAL_VK u32 s_skb[8][64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
@@ -701,14 +702,14 @@ KERNEL_FQ void m16000_sxx (KERN_ATTR_BASIC ())
     s_skb[7][i] = c_skb[7][i];
   }
 
-  LOCAL_AS u32 s_tripcode_salt[128];
+  LOCAL_VK u32 s_tripcode_salt[128];
 
   for (u32 i = lid; i < 128; i += lsz)
   {
     s_tripcode_salt[i] = c_tripcode_salt[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m16000_a3-pure.cl b/OpenCL/m16000_a3-pure.cl
index c7cb2311f..11d655e98 100644
--- a/OpenCL/m16000_a3-pure.cl
+++ b/OpenCL/m16000_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #endif
@@ -32,7 +33,7 @@
   a  = a ^ tt;              \
 }
 
-CONSTANT_AS u32a c_SPtrans[8][64] =
+CONSTANT_VK u32a c_SPtrans[8][64] =
 {
   {
     0x00820200, 0x00020000, 0x80800000, 0x80820200,
@@ -180,7 +181,7 @@ CONSTANT_AS u32a c_SPtrans[8][64] =
   },
 };
 
-CONSTANT_AS u32a c_skb[8][64] =
+CONSTANT_VK u32a c_skb[8][64] =
 {
   {
     0x00000000, 0x00000010, 0x20000000, 0x20000010,
@@ -328,7 +329,7 @@ CONSTANT_AS u32a c_skb[8][64] =
   },
 };
 
-CONSTANT_AS u32a c_tripcode_salt[128] =
+CONSTANT_VK u32a c_tripcode_salt[128] =
 {
   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
@@ -343,13 +344,13 @@ CONSTANT_AS u32a c_tripcode_salt[128] =
 #if   VECT_SIZE == 1
 #define BOX(i,n,S) (S)[(n)][(i)]
 #elif VECT_SIZE == 2
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1])
 #elif VECT_SIZE == 4
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3])
 #elif VECT_SIZE == 8
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7])
 #elif VECT_SIZE == 16
-#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf])
+#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf])
 #endif
 
 DECLSPEC void _des_crypt_keysetup (u32 c, u32x d, u32x *Kc, u32x *Kd, LOCAL_AS u32 (*s_skb)[64])
@@ -505,8 +506,8 @@ KERNEL_FQ void m16000_mxx (KERN_ATTR_VECTOR ())
    * sbox, kbox
    */
 
-  LOCAL_AS u32 s_SPtrans[8][64];
-  LOCAL_AS u32 s_skb[8][64];
+  LOCAL_VK u32 s_SPtrans[8][64];
+  LOCAL_VK u32 s_skb[8][64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
@@ -529,14 +530,14 @@ KERNEL_FQ void m16000_mxx (KERN_ATTR_VECTOR ())
     s_skb[7][i] = c_skb[7][i];
   }
 
-  LOCAL_AS u32 s_tripcode_salt[128];
+  LOCAL_VK u32 s_tripcode_salt[128];
 
   for (u32 i = lid; i < 128; i += lsz)
   {
     s_tripcode_salt[i] = c_tripcode_salt[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -626,8 +627,8 @@ KERNEL_FQ void m16000_sxx (KERN_ATTR_VECTOR ())
    * sbox, kbox
    */
 
-  LOCAL_AS u32 s_SPtrans[8][64];
-  LOCAL_AS u32 s_skb[8][64];
+  LOCAL_VK u32 s_SPtrans[8][64];
+  LOCAL_VK u32 s_skb[8][64];
 
   for (u32 i = lid; i < 64; i += lsz)
   {
@@ -650,14 +651,14 @@ KERNEL_FQ void m16000_sxx (KERN_ATTR_VECTOR ())
     s_skb[7][i] = c_skb[7][i];
   }
 
-  LOCAL_AS u32 s_tripcode_salt[128];
+  LOCAL_VK u32 s_tripcode_salt[128];
 
   for (u32 i = lid; i < 128; i += lsz)
   {
     s_tripcode_salt[i] = c_tripcode_salt[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m16100_a0-optimized.cl b/OpenCL/m16100_a0-optimized.cl
index 45b050973..538b15ef3 100644
--- a/OpenCL/m16100_a0-optimized.cl
+++ b/OpenCL/m16100_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
diff --git a/OpenCL/m16100_a0-pure.cl b/OpenCL/m16100_a0-pure.cl
index f6aa82060..4ec287d3b 100644
--- a/OpenCL/m16100_a0-pure.cl
+++ b/OpenCL/m16100_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
diff --git a/OpenCL/m16100_a1-optimized.cl b/OpenCL/m16100_a1-optimized.cl
index 2c3cece99..3790ea971 100644
--- a/OpenCL/m16100_a1-optimized.cl
+++ b/OpenCL/m16100_a1-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
diff --git a/OpenCL/m16100_a1-pure.cl b/OpenCL/m16100_a1-pure.cl
index 04f5d8a57..5225ba685 100644
--- a/OpenCL/m16100_a1-pure.cl
+++ b/OpenCL/m16100_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_hash_md5.cl"
 #endif
diff --git a/OpenCL/m16100_a3-optimized.cl b/OpenCL/m16100_a3-optimized.cl
index ba96def6b..38aa22be4 100644
--- a/OpenCL/m16100_a3-optimized.cl
+++ b/OpenCL/m16100_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
diff --git a/OpenCL/m16100_a3-pure.cl b/OpenCL/m16100_a3-pure.cl
index 3dfb0cc98..daa0f1de3 100644
--- a/OpenCL/m16100_a3-pure.cl
+++ b/OpenCL/m16100_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_hash_md5.cl"
 #endif
diff --git a/OpenCL/m16200-pure.cl b/OpenCL/m16200-pure.cl
index ee1906bee..5f48e8378 100644
--- a/OpenCL/m16200-pure.cl
+++ b/OpenCL/m16200-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha256.cl"
@@ -280,17 +281,17 @@ KERNEL_FQ void m16200_comp (KERN_ATTR_TMPS_ESALT (apple_secure_notes_tmp_t, appl
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -307,7 +308,7 @@ KERNEL_FQ void m16200_comp (KERN_ATTR_TMPS_ESALT (apple_secure_notes_tmp_t, appl
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m16300-pure.cl b/OpenCL/m16300-pure.cl
index 908adb69f..beeba498d 100644
--- a/OpenCL/m16300-pure.cl
+++ b/OpenCL/m16300-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha256.cl"
@@ -35,7 +36,7 @@ typedef struct pbkdf2_sha256_tmp
 
 } pbkdf2_sha256_tmp_t;
 
-CONSTANT_AS u64a keccakf_rndc[24] =
+CONSTANT_VK u64a keccakf_rndc[24] =
 {
   0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
   0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
@@ -414,17 +415,17 @@ KERNEL_FQ void m16300_comp (KERN_ATTR_TMPS_ESALT (pbkdf2_sha256_tmp_t, ethereum_
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -441,7 +442,7 @@ KERNEL_FQ void m16300_comp (KERN_ATTR_TMPS_ESALT (pbkdf2_sha256_tmp_t, ethereum_
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m16400_a0-optimized.cl b/OpenCL/m16400_a0-optimized.cl
index 520d2b861..049b33b37 100644
--- a/OpenCL/m16400_a0-optimized.cl
+++ b/OpenCL/m16400_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
diff --git a/OpenCL/m16400_a0-pure.cl b/OpenCL/m16400_a0-pure.cl
index 87ad14899..7857c8e1e 100644
--- a/OpenCL/m16400_a0-pure.cl
+++ b/OpenCL/m16400_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
diff --git a/OpenCL/m16400_a1-optimized.cl b/OpenCL/m16400_a1-optimized.cl
index 674a6016e..6082d7bc0 100644
--- a/OpenCL/m16400_a1-optimized.cl
+++ b/OpenCL/m16400_a1-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_simd.cl"
diff --git a/OpenCL/m16400_a1-pure.cl b/OpenCL/m16400_a1-pure.cl
index 6392f42d1..4f3dcd36c 100644
--- a/OpenCL/m16400_a1-pure.cl
+++ b/OpenCL/m16400_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_md5.cl"
diff --git a/OpenCL/m16400_a3-optimized.cl b/OpenCL/m16400_a3-optimized.cl
index fd6967267..fa0e00c8b 100644
--- a/OpenCL/m16400_a3-optimized.cl
+++ b/OpenCL/m16400_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
diff --git a/OpenCL/m16400_a3-pure.cl b/OpenCL/m16400_a3-pure.cl
index 9532b58f4..6d3fef2fa 100644
--- a/OpenCL/m16400_a3-pure.cl
+++ b/OpenCL/m16400_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
diff --git a/OpenCL/m16511_a0-pure.cl b/OpenCL/m16511_a0-pure.cl
index cb742d0f6..bb97112a7 100644
--- a/OpenCL/m16511_a0-pure.cl
+++ b/OpenCL/m16511_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
diff --git a/OpenCL/m16511_a1-pure.cl b/OpenCL/m16511_a1-pure.cl
index 9d566b93c..f875823f0 100644
--- a/OpenCL/m16511_a1-pure.cl
+++ b/OpenCL/m16511_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_sha256.cl"
diff --git a/OpenCL/m16511_a3-pure.cl b/OpenCL/m16511_a3-pure.cl
index 08841d710..b86939fea 100644
--- a/OpenCL/m16511_a3-pure.cl
+++ b/OpenCL/m16511_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha256.cl"
diff --git a/OpenCL/m16512_a0-pure.cl b/OpenCL/m16512_a0-pure.cl
index 121d7c1bf..e92122ec4 100644
--- a/OpenCL/m16512_a0-pure.cl
+++ b/OpenCL/m16512_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
diff --git a/OpenCL/m16512_a1-pure.cl b/OpenCL/m16512_a1-pure.cl
index 14e88469b..4ef07bb67 100644
--- a/OpenCL/m16512_a1-pure.cl
+++ b/OpenCL/m16512_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_sha384.cl"
diff --git a/OpenCL/m16512_a3-pure.cl b/OpenCL/m16512_a3-pure.cl
index 8b1ff6baf..bae71e6c4 100644
--- a/OpenCL/m16512_a3-pure.cl
+++ b/OpenCL/m16512_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha384.cl"
diff --git a/OpenCL/m16513_a0-pure.cl b/OpenCL/m16513_a0-pure.cl
index 4940297bc..fa8c6cb47 100644
--- a/OpenCL/m16513_a0-pure.cl
+++ b/OpenCL/m16513_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
diff --git a/OpenCL/m16513_a1-pure.cl b/OpenCL/m16513_a1-pure.cl
index c064f5d45..131931b44 100644
--- a/OpenCL/m16513_a1-pure.cl
+++ b/OpenCL/m16513_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_sha512.cl"
diff --git a/OpenCL/m16513_a3-pure.cl b/OpenCL/m16513_a3-pure.cl
index 476dd6a3d..1f7e3150d 100644
--- a/OpenCL/m16513_a3-pure.cl
+++ b/OpenCL/m16513_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha512.cl"
diff --git a/OpenCL/m16600_a0-optimized.cl b/OpenCL/m16600_a0-optimized.cl
index c8751feb5..61cd6a5ae 100644
--- a/OpenCL/m16600_a0-optimized.cl
+++ b/OpenCL/m16600_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
@@ -40,17 +41,17 @@ KERNEL_FQ void m16600_m04 (KERN_ATTR_RULES_ESALT (electrum_wallet_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -67,7 +68,7 @@ KERNEL_FQ void m16600_m04 (KERN_ATTR_RULES_ESALT (electrum_wallet_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -425,17 +426,17 @@ KERNEL_FQ void m16600_s04 (KERN_ATTR_RULES_ESALT (electrum_wallet_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -452,7 +453,7 @@ KERNEL_FQ void m16600_s04 (KERN_ATTR_RULES_ESALT (electrum_wallet_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m16600_a0-pure.cl b/OpenCL/m16600_a0-pure.cl
index 358543031..3e062770f 100644
--- a/OpenCL/m16600_a0-pure.cl
+++ b/OpenCL/m16600_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
@@ -40,17 +41,17 @@ KERNEL_FQ void m16600_mxx (KERN_ATTR_RULES_ESALT (electrum_wallet_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -67,7 +68,7 @@ KERNEL_FQ void m16600_mxx (KERN_ATTR_RULES_ESALT (electrum_wallet_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -231,17 +232,17 @@ KERNEL_FQ void m16600_sxx (KERN_ATTR_RULES_ESALT (electrum_wallet_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -258,7 +259,7 @@ KERNEL_FQ void m16600_sxx (KERN_ATTR_RULES_ESALT (electrum_wallet_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m16600_a1-optimized.cl b/OpenCL/m16600_a1-optimized.cl
index c9c8212b6..f67fc3205 100644
--- a/OpenCL/m16600_a1-optimized.cl
+++ b/OpenCL/m16600_a1-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha256.cl"
@@ -38,17 +39,17 @@ KERNEL_FQ void m16600_m04 (KERN_ATTR_ESALT (electrum_wallet_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -65,7 +66,7 @@ KERNEL_FQ void m16600_m04 (KERN_ATTR_ESALT (electrum_wallet_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -481,17 +482,17 @@ KERNEL_FQ void m16600_s04 (KERN_ATTR_ESALT (electrum_wallet_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -508,7 +509,7 @@ KERNEL_FQ void m16600_s04 (KERN_ATTR_ESALT (electrum_wallet_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m16600_a1-pure.cl b/OpenCL/m16600_a1-pure.cl
index a0be77baa..85b62b520 100644
--- a/OpenCL/m16600_a1-pure.cl
+++ b/OpenCL/m16600_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_sha256.cl"
@@ -38,17 +39,17 @@ KERNEL_FQ void m16600_mxx (KERN_ATTR_ESALT (electrum_wallet_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -65,7 +66,7 @@ KERNEL_FQ void m16600_mxx (KERN_ATTR_ESALT (electrum_wallet_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -227,17 +228,17 @@ KERNEL_FQ void m16600_sxx (KERN_ATTR_ESALT (electrum_wallet_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -254,7 +255,7 @@ KERNEL_FQ void m16600_sxx (KERN_ATTR_ESALT (electrum_wallet_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m16600_a3-optimized.cl b/OpenCL/m16600_a3-optimized.cl
index 04a1a7138..edc13bfd6 100644
--- a/OpenCL/m16600_a3-optimized.cl
+++ b/OpenCL/m16600_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha256.cl"
@@ -335,17 +336,17 @@ KERNEL_FQ void m16600_m04 (KERN_ATTR_ESALT (electrum_wallet_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -362,7 +363,7 @@ KERNEL_FQ void m16600_m04 (KERN_ATTR_ESALT (electrum_wallet_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -430,17 +431,17 @@ KERNEL_FQ void m16600_m08 (KERN_ATTR_ESALT (electrum_wallet_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -457,7 +458,7 @@ KERNEL_FQ void m16600_m08 (KERN_ATTR_ESALT (electrum_wallet_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -525,17 +526,17 @@ KERNEL_FQ void m16600_m16 (KERN_ATTR_ESALT (electrum_wallet_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -552,7 +553,7 @@ KERNEL_FQ void m16600_m16 (KERN_ATTR_ESALT (electrum_wallet_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -620,17 +621,17 @@ KERNEL_FQ void m16600_s04 (KERN_ATTR_ESALT (electrum_wallet_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -647,7 +648,7 @@ KERNEL_FQ void m16600_s04 (KERN_ATTR_ESALT (electrum_wallet_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -715,17 +716,17 @@ KERNEL_FQ void m16600_s08 (KERN_ATTR_ESALT (electrum_wallet_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -742,7 +743,7 @@ KERNEL_FQ void m16600_s08 (KERN_ATTR_ESALT (electrum_wallet_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -810,17 +811,17 @@ KERNEL_FQ void m16600_s16 (KERN_ATTR_ESALT (electrum_wallet_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -837,7 +838,7 @@ KERNEL_FQ void m16600_s16 (KERN_ATTR_ESALT (electrum_wallet_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m16600_a3-pure.cl b/OpenCL/m16600_a3-pure.cl
index 7a68e1ff2..8c46f555e 100644
--- a/OpenCL/m16600_a3-pure.cl
+++ b/OpenCL/m16600_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha256.cl"
@@ -38,17 +39,17 @@ KERNEL_FQ void m16600_mxx (KERN_ATTR_VECTOR_ESALT (electrum_wallet_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -65,7 +66,7 @@ KERNEL_FQ void m16600_mxx (KERN_ATTR_VECTOR_ESALT (electrum_wallet_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -240,17 +241,17 @@ KERNEL_FQ void m16600_sxx (KERN_ATTR_VECTOR_ESALT (electrum_wallet_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -267,7 +268,7 @@ KERNEL_FQ void m16600_sxx (KERN_ATTR_VECTOR_ESALT (electrum_wallet_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m16800-pure.cl b/OpenCL/m16800-pure.cl
index f7e3e1f9c..d8c3a26c9 100644
--- a/OpenCL/m16800-pure.cl
+++ b/OpenCL/m16800-pure.cl
@@ -8,12 +8,14 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
 #else
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.h"
 #include "inc_common.h"
 #include "inc_simd.h"
 #include "inc_hash_sha1.h"
diff --git a/OpenCL/m16801-pure.cl b/OpenCL/m16801-pure.cl
index f040be21c..0e93e669c 100644
--- a/OpenCL/m16801-pure.cl
+++ b/OpenCL/m16801-pure.cl
@@ -8,12 +8,14 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
 #else
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.h"
 #include "inc_common.h"
 #include "inc_simd.h"
 #include "inc_hash_sha1.h"
diff --git a/OpenCL/m16900-pure.cl b/OpenCL/m16900-pure.cl
index 649a16b2e..f9f2357b9 100644
--- a/OpenCL/m16900-pure.cl
+++ b/OpenCL/m16900-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha256.cl"
diff --git a/OpenCL/m17300_a0-optimized.cl b/OpenCL/m17300_a0-optimized.cl
index 30076e82a..a1c7bd237 100644
--- a/OpenCL/m17300_a0-optimized.cl
+++ b/OpenCL/m17300_a0-optimized.cl
@@ -8,13 +8,14 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
 #include "inc_simd.cl"
 #endif
 
-CONSTANT_AS u64a keccakf_rndc[24] =
+CONSTANT_VK u64a keccakf_rndc[24] =
 {
   0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
   0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
diff --git a/OpenCL/m17300_a1-optimized.cl b/OpenCL/m17300_a1-optimized.cl
index 3e0e5970d..cc3c5ac5b 100644
--- a/OpenCL/m17300_a1-optimized.cl
+++ b/OpenCL/m17300_a1-optimized.cl
@@ -8,11 +8,12 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #endif
 
-CONSTANT_AS u64a keccakf_rndc[24] =
+CONSTANT_VK u64a keccakf_rndc[24] =
 {
   0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
   0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
diff --git a/OpenCL/m17300_a3-optimized.cl b/OpenCL/m17300_a3-optimized.cl
index 2948331db..cf78c55e6 100644
--- a/OpenCL/m17300_a3-optimized.cl
+++ b/OpenCL/m17300_a3-optimized.cl
@@ -8,11 +8,12 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #endif
 
-CONSTANT_AS u64a keccakf_rndc[24] =
+CONSTANT_VK u64a keccakf_rndc[24] =
 {
   0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
   0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
diff --git a/OpenCL/m17400_a0-optimized.cl b/OpenCL/m17400_a0-optimized.cl
index be8eb241e..8dbefbd72 100644
--- a/OpenCL/m17400_a0-optimized.cl
+++ b/OpenCL/m17400_a0-optimized.cl
@@ -8,13 +8,14 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
 #include "inc_simd.cl"
 #endif
 
-CONSTANT_AS u64a keccakf_rndc[24] =
+CONSTANT_VK u64a keccakf_rndc[24] =
 {
   0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
   0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
diff --git a/OpenCL/m17400_a1-optimized.cl b/OpenCL/m17400_a1-optimized.cl
index 6b3c9c14b..96c99c25b 100644
--- a/OpenCL/m17400_a1-optimized.cl
+++ b/OpenCL/m17400_a1-optimized.cl
@@ -8,11 +8,12 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #endif
 
-CONSTANT_AS u64a keccakf_rndc[24] =
+CONSTANT_VK u64a keccakf_rndc[24] =
 {
   0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
   0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
diff --git a/OpenCL/m17400_a3-optimized.cl b/OpenCL/m17400_a3-optimized.cl
index 19051f0a2..1ce91475c 100644
--- a/OpenCL/m17400_a3-optimized.cl
+++ b/OpenCL/m17400_a3-optimized.cl
@@ -8,11 +8,12 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #endif
 
-CONSTANT_AS u64a keccakf_rndc[24] =
+CONSTANT_VK u64a keccakf_rndc[24] =
 {
   0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
   0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
diff --git a/OpenCL/m17500_a0-optimized.cl b/OpenCL/m17500_a0-optimized.cl
index 60f807b90..10a029a81 100644
--- a/OpenCL/m17500_a0-optimized.cl
+++ b/OpenCL/m17500_a0-optimized.cl
@@ -8,13 +8,14 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
 #include "inc_simd.cl"
 #endif
 
-CONSTANT_AS u64a keccakf_rndc[24] =
+CONSTANT_VK u64a keccakf_rndc[24] =
 {
   0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
   0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
diff --git a/OpenCL/m17500_a1-optimized.cl b/OpenCL/m17500_a1-optimized.cl
index a402b7afa..3a62ba394 100644
--- a/OpenCL/m17500_a1-optimized.cl
+++ b/OpenCL/m17500_a1-optimized.cl
@@ -8,11 +8,12 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #endif
 
-CONSTANT_AS u64a keccakf_rndc[24] =
+CONSTANT_VK u64a keccakf_rndc[24] =
 {
   0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
   0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
diff --git a/OpenCL/m17500_a3-optimized.cl b/OpenCL/m17500_a3-optimized.cl
index fe6863dbd..91705b5c4 100644
--- a/OpenCL/m17500_a3-optimized.cl
+++ b/OpenCL/m17500_a3-optimized.cl
@@ -8,11 +8,12 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #endif
 
-CONSTANT_AS u64a keccakf_rndc[24] =
+CONSTANT_VK u64a keccakf_rndc[24] =
 {
   0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
   0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
diff --git a/OpenCL/m17600_a0-optimized.cl b/OpenCL/m17600_a0-optimized.cl
index 97394ad0b..0b32f69b5 100644
--- a/OpenCL/m17600_a0-optimized.cl
+++ b/OpenCL/m17600_a0-optimized.cl
@@ -8,13 +8,14 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
 #include "inc_simd.cl"
 #endif
 
-CONSTANT_AS u64a keccakf_rndc[24] =
+CONSTANT_VK u64a keccakf_rndc[24] =
 {
   0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
   0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
diff --git a/OpenCL/m17600_a1-optimized.cl b/OpenCL/m17600_a1-optimized.cl
index dac85837d..5d5117736 100644
--- a/OpenCL/m17600_a1-optimized.cl
+++ b/OpenCL/m17600_a1-optimized.cl
@@ -8,11 +8,12 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #endif
 
-CONSTANT_AS u64a keccakf_rndc[24] =
+CONSTANT_VK u64a keccakf_rndc[24] =
 {
   0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
   0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
diff --git a/OpenCL/m17600_a3-optimized.cl b/OpenCL/m17600_a3-optimized.cl
index 37fef9a7b..7dde3b61a 100644
--- a/OpenCL/m17600_a3-optimized.cl
+++ b/OpenCL/m17600_a3-optimized.cl
@@ -8,11 +8,12 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #endif
 
-CONSTANT_AS u64a keccakf_rndc[24] =
+CONSTANT_VK u64a keccakf_rndc[24] =
 {
   0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
   0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
diff --git a/OpenCL/m17700_a0-optimized.cl b/OpenCL/m17700_a0-optimized.cl
index cdb6d1ede..00355e29a 100644
--- a/OpenCL/m17700_a0-optimized.cl
+++ b/OpenCL/m17700_a0-optimized.cl
@@ -8,13 +8,14 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
 #include "inc_simd.cl"
 #endif
 
-CONSTANT_AS u64a keccakf_rndc[24] =
+CONSTANT_VK u64a keccakf_rndc[24] =
 {
   0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
   0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
diff --git a/OpenCL/m17700_a1-optimized.cl b/OpenCL/m17700_a1-optimized.cl
index d3b6d6a14..88527e9b6 100644
--- a/OpenCL/m17700_a1-optimized.cl
+++ b/OpenCL/m17700_a1-optimized.cl
@@ -8,11 +8,12 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #endif
 
-CONSTANT_AS u64a keccakf_rndc[24] =
+CONSTANT_VK u64a keccakf_rndc[24] =
 {
   0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
   0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
diff --git a/OpenCL/m17700_a3-optimized.cl b/OpenCL/m17700_a3-optimized.cl
index 7cdd1b22f..70a3f42d0 100644
--- a/OpenCL/m17700_a3-optimized.cl
+++ b/OpenCL/m17700_a3-optimized.cl
@@ -8,11 +8,12 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #endif
 
-CONSTANT_AS u64a keccakf_rndc[24] =
+CONSTANT_VK u64a keccakf_rndc[24] =
 {
   0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
   0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
diff --git a/OpenCL/m17800_a0-optimized.cl b/OpenCL/m17800_a0-optimized.cl
index b16c36c38..193f9ce6e 100644
--- a/OpenCL/m17800_a0-optimized.cl
+++ b/OpenCL/m17800_a0-optimized.cl
@@ -8,13 +8,14 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
 #include "inc_simd.cl"
 #endif
 
-CONSTANT_AS u64a keccakf_rndc[24] =
+CONSTANT_VK u64a keccakf_rndc[24] =
 {
   0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
   0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
diff --git a/OpenCL/m17800_a1-optimized.cl b/OpenCL/m17800_a1-optimized.cl
index 8e7054235..cc1e2ca52 100644
--- a/OpenCL/m17800_a1-optimized.cl
+++ b/OpenCL/m17800_a1-optimized.cl
@@ -8,11 +8,12 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #endif
 
-CONSTANT_AS u64a keccakf_rndc[24] =
+CONSTANT_VK u64a keccakf_rndc[24] =
 {
   0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
   0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
diff --git a/OpenCL/m17800_a3-optimized.cl b/OpenCL/m17800_a3-optimized.cl
index f62ab2a0e..3de33b43a 100644
--- a/OpenCL/m17800_a3-optimized.cl
+++ b/OpenCL/m17800_a3-optimized.cl
@@ -8,11 +8,12 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #endif
 
-CONSTANT_AS u64a keccakf_rndc[24] =
+CONSTANT_VK u64a keccakf_rndc[24] =
 {
   0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
   0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
diff --git a/OpenCL/m17900_a0-optimized.cl b/OpenCL/m17900_a0-optimized.cl
index b28cce05b..1e9e68e9d 100644
--- a/OpenCL/m17900_a0-optimized.cl
+++ b/OpenCL/m17900_a0-optimized.cl
@@ -8,13 +8,14 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
 #include "inc_simd.cl"
 #endif
 
-CONSTANT_AS u64a keccakf_rndc[24] =
+CONSTANT_VK u64a keccakf_rndc[24] =
 {
   0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
   0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
diff --git a/OpenCL/m17900_a1-optimized.cl b/OpenCL/m17900_a1-optimized.cl
index a182385f4..e26e19987 100644
--- a/OpenCL/m17900_a1-optimized.cl
+++ b/OpenCL/m17900_a1-optimized.cl
@@ -8,11 +8,12 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #endif
 
-CONSTANT_AS u64a keccakf_rndc[24] =
+CONSTANT_VK u64a keccakf_rndc[24] =
 {
   0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
   0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
diff --git a/OpenCL/m17900_a3-optimized.cl b/OpenCL/m17900_a3-optimized.cl
index afeb00627..db78824ad 100644
--- a/OpenCL/m17900_a3-optimized.cl
+++ b/OpenCL/m17900_a3-optimized.cl
@@ -8,11 +8,12 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #endif
 
-CONSTANT_AS u64a keccakf_rndc[24] =
+CONSTANT_VK u64a keccakf_rndc[24] =
 {
   0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
   0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
diff --git a/OpenCL/m18000_a0-optimized.cl b/OpenCL/m18000_a0-optimized.cl
index 3be89d66d..6a2f4eeb4 100644
--- a/OpenCL/m18000_a0-optimized.cl
+++ b/OpenCL/m18000_a0-optimized.cl
@@ -8,13 +8,14 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
 #include "inc_simd.cl"
 #endif
 
-CONSTANT_AS u64a keccakf_rndc[24] =
+CONSTANT_VK u64a keccakf_rndc[24] =
 {
   0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
   0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
diff --git a/OpenCL/m18000_a1-optimized.cl b/OpenCL/m18000_a1-optimized.cl
index 30164893d..36caf6052 100644
--- a/OpenCL/m18000_a1-optimized.cl
+++ b/OpenCL/m18000_a1-optimized.cl
@@ -8,11 +8,12 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #endif
 
-CONSTANT_AS u64a keccakf_rndc[24] =
+CONSTANT_VK u64a keccakf_rndc[24] =
 {
   0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
   0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
diff --git a/OpenCL/m18000_a3-optimized.cl b/OpenCL/m18000_a3-optimized.cl
index b6f7a33f8..2dee6304a 100644
--- a/OpenCL/m18000_a3-optimized.cl
+++ b/OpenCL/m18000_a3-optimized.cl
@@ -8,11 +8,12 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #endif
 
-CONSTANT_AS u64a keccakf_rndc[24] =
+CONSTANT_VK u64a keccakf_rndc[24] =
 {
   0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
   0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
diff --git a/OpenCL/m18100_a0-pure.cl b/OpenCL/m18100_a0-pure.cl
index d22b3bade..d923a95e8 100644
--- a/OpenCL/m18100_a0-pure.cl
+++ b/OpenCL/m18100_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
diff --git a/OpenCL/m18100_a1-pure.cl b/OpenCL/m18100_a1-pure.cl
index 223cbece0..2b170bc4a 100644
--- a/OpenCL/m18100_a1-pure.cl
+++ b/OpenCL/m18100_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m18100_a3-pure.cl b/OpenCL/m18100_a3-pure.cl
index be4805773..ffd2525fe 100644
--- a/OpenCL/m18100_a3-pure.cl
+++ b/OpenCL/m18100_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m18200_a0-optimized.cl b/OpenCL/m18200_a0-optimized.cl
index a520da74d..3791d2058 100644
--- a/OpenCL/m18200_a0-optimized.cl
+++ b/OpenCL/m18200_a0-optimized.cl
@@ -9,6 +9,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
@@ -607,7 +608,7 @@ KERNEL_FQ void m18200_m04 (KERN_ATTR_RULES_ESALT (krb5asrep_t))
    * shared
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   LOCAL_AS RC4_KEY *rc4_key = &rc4_keys[lid];
 
@@ -705,7 +706,7 @@ KERNEL_FQ void m18200_s04 (KERN_ATTR_RULES_ESALT (krb5asrep_t))
    * shared
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   LOCAL_AS RC4_KEY *rc4_key = &rc4_keys[lid];
 
diff --git a/OpenCL/m18200_a0-pure.cl b/OpenCL/m18200_a0-pure.cl
index 497ac3a3c..ba15f1908 100644
--- a/OpenCL/m18200_a0-pure.cl
+++ b/OpenCL/m18200_a0-pure.cl
@@ -9,6 +9,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
@@ -401,7 +402,7 @@ KERNEL_FQ void m18200_mxx (KERN_ATTR_RULES_ESALT (krb5asrep_t))
 
   COPY_PW (pws[gid]);
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   LOCAL_AS RC4_KEY *rc4_key = &rc4_keys[lid];
 
@@ -463,7 +464,7 @@ KERNEL_FQ void m18200_sxx (KERN_ATTR_RULES_ESALT (krb5asrep_t))
 
   COPY_PW (pws[gid]);
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   LOCAL_AS RC4_KEY *rc4_key = &rc4_keys[lid];
 
diff --git a/OpenCL/m18200_a1-optimized.cl b/OpenCL/m18200_a1-optimized.cl
index 0d9efe343..e66367c08 100644
--- a/OpenCL/m18200_a1-optimized.cl
+++ b/OpenCL/m18200_a1-optimized.cl
@@ -9,6 +9,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md4.cl"
@@ -604,7 +605,7 @@ KERNEL_FQ void m18200_m04 (KERN_ATTR_ESALT (krb5asrep_t))
    * shared
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   LOCAL_AS RC4_KEY *rc4_key = &rc4_keys[lid];
 
@@ -751,7 +752,7 @@ KERNEL_FQ void m18200_s04 (KERN_ATTR_ESALT (krb5asrep_t))
    * shared
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   LOCAL_AS RC4_KEY *rc4_key = &rc4_keys[lid];
 
diff --git a/OpenCL/m18200_a1-pure.cl b/OpenCL/m18200_a1-pure.cl
index 519c8498d..3817e1e8a 100644
--- a/OpenCL/m18200_a1-pure.cl
+++ b/OpenCL/m18200_a1-pure.cl
@@ -9,6 +9,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_hash_md4.cl"
 #include "inc_hash_md5.cl"
@@ -397,7 +398,7 @@ KERNEL_FQ void m18200_mxx (KERN_ATTR_ESALT (krb5asrep_t))
    * base
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   LOCAL_AS RC4_KEY *rc4_key = &rc4_keys[lid];
 
@@ -457,7 +458,7 @@ KERNEL_FQ void m18200_sxx (KERN_ATTR_ESALT (krb5asrep_t))
    * base
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   LOCAL_AS RC4_KEY *rc4_key = &rc4_keys[lid];
 
diff --git a/OpenCL/m18200_a3-optimized.cl b/OpenCL/m18200_a3-optimized.cl
index 404c56521..dce8dacf6 100644
--- a/OpenCL/m18200_a3-optimized.cl
+++ b/OpenCL/m18200_a3-optimized.cl
@@ -9,6 +9,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md4.cl"
@@ -674,7 +675,7 @@ KERNEL_FQ void m18200_m04 (KERN_ATTR_ESALT (krb5asrep_t))
    * main
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   LOCAL_AS RC4_KEY *rc4_key = &rc4_keys[lid];
 
@@ -726,7 +727,7 @@ KERNEL_FQ void m18200_m08 (KERN_ATTR_ESALT (krb5asrep_t))
    * main
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   LOCAL_AS RC4_KEY *rc4_key = &rc4_keys[lid];
 
@@ -782,7 +783,7 @@ KERNEL_FQ void m18200_s04 (KERN_ATTR_ESALT (krb5asrep_t))
    * main
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   LOCAL_AS RC4_KEY *rc4_key = &rc4_keys[lid];
 
@@ -834,7 +835,7 @@ KERNEL_FQ void m18200_s08 (KERN_ATTR_ESALT (krb5asrep_t))
    * main
    */
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   LOCAL_AS RC4_KEY *rc4_key = &rc4_keys[lid];
 
diff --git a/OpenCL/m18200_a3-pure.cl b/OpenCL/m18200_a3-pure.cl
index 30c298cfd..015cd4f36 100644
--- a/OpenCL/m18200_a3-pure.cl
+++ b/OpenCL/m18200_a3-pure.cl
@@ -9,6 +9,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_hash_md4.cl"
 #include "inc_hash_md5.cl"
@@ -406,7 +407,7 @@ KERNEL_FQ void m18200_mxx (KERN_ATTR_VECTOR_ESALT (krb5asrep_t))
     w[idx] = pws[gid].i[idx];
   }
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   LOCAL_AS RC4_KEY *rc4_key = &rc4_keys[lid];
 
@@ -479,7 +480,7 @@ KERNEL_FQ void m18200_sxx (KERN_ATTR_VECTOR_ESALT (krb5asrep_t))
     w[idx] = pws[gid].i[idx];
   }
 
-  LOCAL_AS RC4_KEY rc4_keys[64];
+  LOCAL_VK RC4_KEY rc4_keys[64];
 
   LOCAL_AS RC4_KEY *rc4_key = &rc4_keys[lid];
 
diff --git a/OpenCL/m18300-pure.cl b/OpenCL/m18300-pure.cl
index 6ae8fba68..46e52a1aa 100644
--- a/OpenCL/m18300-pure.cl
+++ b/OpenCL/m18300-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha256.cl"
@@ -280,17 +281,17 @@ KERNEL_FQ void m18300_comp (KERN_ATTR_TMPS_ESALT (apple_secure_notes_tmp_t, appl
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -307,7 +308,7 @@ KERNEL_FQ void m18300_comp (KERN_ATTR_TMPS_ESALT (apple_secure_notes_tmp_t, appl
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m18400-pure.cl b/OpenCL/m18400-pure.cl
index f7abaea8d..b6645e8a9 100644
--- a/OpenCL/m18400-pure.cl
+++ b/OpenCL/m18400-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
@@ -276,17 +277,17 @@ KERNEL_FQ void m18400_comp (KERN_ATTR_TMPS_ESALT (odf12_tmp_t, odf12_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -303,7 +304,7 @@ KERNEL_FQ void m18400_comp (KERN_ATTR_TMPS_ESALT (odf12_tmp_t, odf12_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m18500_a0-pure.cl b/OpenCL/m18500_a0-pure.cl
index b4cf2faf6..857d380ee 100644
--- a/OpenCL/m18500_a0-pure.cl
+++ b/OpenCL/m18500_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
@@ -17,15 +18,15 @@
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m18500_mxx (KERN_ATTR_RULES ())
@@ -38,7 +39,7 @@ KERNEL_FQ void m18500_mxx (KERN_ATTR_RULES ())
   const u64 gid = get_global_id (0);
   const u64 lsz = get_local_size (0);
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -49,7 +50,7 @@ KERNEL_FQ void m18500_mxx (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -154,7 +155,7 @@ KERNEL_FQ void m18500_sxx (KERN_ATTR_RULES ())
   const u64 gid = get_global_id (0);
   const u64 lsz = get_local_size (0);
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -165,7 +166,7 @@ KERNEL_FQ void m18500_sxx (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m18500_a1-pure.cl b/OpenCL/m18500_a1-pure.cl
index cc48a7134..4de2e20cb 100644
--- a/OpenCL/m18500_a1-pure.cl
+++ b/OpenCL/m18500_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
@@ -17,15 +18,15 @@
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m18500_mxx (KERN_ATTR_RULES ())
@@ -38,7 +39,7 @@ KERNEL_FQ void m18500_mxx (KERN_ATTR_RULES ())
   const u64 gid = get_global_id (0);
   const u64 lsz = get_local_size (0);
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -49,7 +50,7 @@ KERNEL_FQ void m18500_mxx (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -153,7 +154,7 @@ KERNEL_FQ void m18500_sxx (KERN_ATTR_RULES ())
   const u64 gid = get_global_id (0);
   const u64 lsz = get_local_size (0);
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -164,7 +165,7 @@ KERNEL_FQ void m18500_sxx (KERN_ATTR_RULES ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m18500_a3-pure.cl b/OpenCL/m18500_a3-pure.cl
index 46f8624f7..8eb1dac9d 100644
--- a/OpenCL/m18500_a3-pure.cl
+++ b/OpenCL/m18500_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
@@ -15,15 +16,15 @@
 #endif
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m18500_mxx (KERN_ATTR_VECTOR ())
@@ -36,7 +37,7 @@ KERNEL_FQ void m18500_mxx (KERN_ATTR_VECTOR ())
   const u64 gid = get_global_id (0);
   const u64 lsz = get_local_size (0);
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -47,7 +48,7 @@ KERNEL_FQ void m18500_mxx (KERN_ATTR_VECTOR ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -163,7 +164,7 @@ KERNEL_FQ void m18500_sxx (KERN_ATTR_VECTOR ())
   const u64 gid = get_global_id (0);
   const u64 lsz = get_local_size (0);
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -174,7 +175,7 @@ KERNEL_FQ void m18500_sxx (KERN_ATTR_VECTOR ())
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m18600-pure.cl b/OpenCL/m18600-pure.cl
index b939a4381..930b85f59 100644
--- a/OpenCL/m18600-pure.cl
+++ b/OpenCL/m18600-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
@@ -37,7 +38,7 @@ typedef struct odf11
 
 // http://www.schneier.com/code/constants.txt
 
-CONSTANT_AS u32a c_sbox0[256] =
+CONSTANT_VK u32a c_sbox0[256] =
 {
   0xd1310ba6, 0x98dfb5ac, 0x2ffd72db, 0xd01adfb7,
   0xb8e1afed, 0x6a267e96, 0xba7c9045, 0xf12c7f99,
@@ -105,7 +106,7 @@ CONSTANT_AS u32a c_sbox0[256] =
   0x53b02d5d, 0xa99f8fa1, 0x08ba4799, 0x6e85076a
 };
 
-CONSTANT_AS u32a c_sbox1[256] =
+CONSTANT_VK u32a c_sbox1[256] =
 {
   0x4b7a70e9, 0xb5b32944, 0xdb75092e, 0xc4192623,
   0xad6ea6b0, 0x49a7df7d, 0x9cee60b8, 0x8fedb266,
@@ -173,7 +174,7 @@ CONSTANT_AS u32a c_sbox1[256] =
   0x153e21e7, 0x8fb03d4a, 0xe6e39f2b, 0xdb83adf7
 };
 
-CONSTANT_AS u32a c_sbox2[256] =
+CONSTANT_VK u32a c_sbox2[256] =
 {
   0xe93d5a68, 0x948140f7, 0xf64c261c, 0x94692934,
   0x411520f7, 0x7602d4f7, 0xbcf46b2e, 0xd4a20068,
@@ -241,7 +242,7 @@ CONSTANT_AS u32a c_sbox2[256] =
   0xd79a3234, 0x92638212, 0x670efa8e, 0x406000e0
 };
 
-CONSTANT_AS u32a c_sbox3[256] =
+CONSTANT_VK u32a c_sbox3[256] =
 {
   0x3a39ce37, 0xd3faf5cf, 0xabc27737, 0x5ac52d1b,
   0x5cb0679e, 0x4fa33742, 0xd3822740, 0x99bc9bbe,
@@ -309,7 +310,7 @@ CONSTANT_AS u32a c_sbox3[256] =
   0xb74e6132, 0xce77e25b, 0x578fdfe3, 0x3ac372e6
 };
 
-CONSTANT_AS u32a c_pbox[18] =
+CONSTANT_VK u32a c_pbox[18] =
 {
   0x243f6a88, 0x85a308d3, 0x13198a2e, 0x03707344,
   0xa4093822, 0x299f31d0, 0x082efa98, 0xec4e6c89,
@@ -615,10 +616,10 @@ KERNEL_FQ void __attribute__((reqd_work_group_size(FIXED_LOCAL_SIZE, 1, 1))) m18
     P[i] = c_pbox[i] ^ ukey[i % 4];
   }
 
-  LOCAL_AS u32 S0_all[FIXED_LOCAL_SIZE][256];
-  LOCAL_AS u32 S1_all[FIXED_LOCAL_SIZE][256];
-  LOCAL_AS u32 S2_all[FIXED_LOCAL_SIZE][256];
-  LOCAL_AS u32 S3_all[FIXED_LOCAL_SIZE][256];
+  LOCAL_VK u32 S0_all[FIXED_LOCAL_SIZE][256];
+  LOCAL_VK u32 S1_all[FIXED_LOCAL_SIZE][256];
+  LOCAL_VK u32 S2_all[FIXED_LOCAL_SIZE][256];
+  LOCAL_VK u32 S3_all[FIXED_LOCAL_SIZE][256];
 
   LOCAL_AS u32 *S0 = S0_all[lid];
   LOCAL_AS u32 *S1 = S1_all[lid];
diff --git a/OpenCL/m18700_a0-optimized.cl b/OpenCL/m18700_a0-optimized.cl
index de8eabbe9..ba5ccff14 100644
--- a/OpenCL/m18700_a0-optimized.cl
+++ b/OpenCL/m18700_a0-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp_optimized.h"
 #include "inc_rp_optimized.cl"
diff --git a/OpenCL/m18700_a0-pure.cl b/OpenCL/m18700_a0-pure.cl
index a7276b9eb..b7ee45dae 100644
--- a/OpenCL/m18700_a0-pure.cl
+++ b/OpenCL/m18700_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
diff --git a/OpenCL/m18700_a1-optimized.cl b/OpenCL/m18700_a1-optimized.cl
index 215b0bcfc..18ea7ffbd 100644
--- a/OpenCL/m18700_a1-optimized.cl
+++ b/OpenCL/m18700_a1-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_simd.cl"
diff --git a/OpenCL/m18700_a1-pure.cl b/OpenCL/m18700_a1-pure.cl
index c80a904b7..1f69f8e90 100644
--- a/OpenCL/m18700_a1-pure.cl
+++ b/OpenCL/m18700_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_md5.cl"
diff --git a/OpenCL/m18700_a3-optimized.cl b/OpenCL/m18700_a3-optimized.cl
index 0df5cb0cc..9b09e3308 100644
--- a/OpenCL/m18700_a3-optimized.cl
+++ b/OpenCL/m18700_a3-optimized.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
diff --git a/OpenCL/m18700_a3-pure.cl b/OpenCL/m18700_a3-pure.cl
index 4f6221724..286656a84 100644
--- a/OpenCL/m18700_a3-pure.cl
+++ b/OpenCL/m18700_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
diff --git a/OpenCL/m18800-pure.cl b/OpenCL/m18800-pure.cl
index 8dfd54c30..e237b865a 100644
--- a/OpenCL/m18800-pure.cl
+++ b/OpenCL/m18800-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha256.cl"
diff --git a/OpenCL/m18900-pure.cl b/OpenCL/m18900-pure.cl
index 342a203a5..0eaa5d64d 100644
--- a/OpenCL/m18900-pure.cl
+++ b/OpenCL/m18900-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
@@ -244,17 +245,17 @@ KERNEL_FQ void m18900_comp (KERN_ATTR_TMPS_ESALT (android_backup_tmp_t, android_
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -271,7 +272,7 @@ KERNEL_FQ void m18900_comp (KERN_ATTR_TMPS_ESALT (android_backup_tmp_t, android_
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m19000-pure.cl b/OpenCL/m19000-pure.cl
index 499f5a458..d46951049 100644
--- a/OpenCL/m19000-pure.cl
+++ b/OpenCL/m19000-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_md5.cl"
diff --git a/OpenCL/m19100-pure.cl b/OpenCL/m19100-pure.cl
index 21fc4328e..98a3ae43b 100644
--- a/OpenCL/m19100-pure.cl
+++ b/OpenCL/m19100-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha256.cl"
diff --git a/OpenCL/m19200-pure.cl b/OpenCL/m19200-pure.cl
index 8d40173e7..a77742ae1 100644
--- a/OpenCL/m19200-pure.cl
+++ b/OpenCL/m19200-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha512.cl"
diff --git a/OpenCL/m19300_a0-pure.cl b/OpenCL/m19300_a0-pure.cl
index d0e46a0cc..eebb953d6 100644
--- a/OpenCL/m19300_a0-pure.cl
+++ b/OpenCL/m19300_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
diff --git a/OpenCL/m19300_a1-pure.cl b/OpenCL/m19300_a1-pure.cl
index ea565a259..0e11ddb18 100644
--- a/OpenCL/m19300_a1-pure.cl
+++ b/OpenCL/m19300_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m19300_a3-pure.cl b/OpenCL/m19300_a3-pure.cl
index 4b65dd687..fc715f841 100644
--- a/OpenCL/m19300_a3-pure.cl
+++ b/OpenCL/m19300_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
diff --git a/OpenCL/m19500_a0-pure.cl b/OpenCL/m19500_a0-pure.cl
index 9ec4dafb1..5ebfaea0c 100644
--- a/OpenCL/m19500_a0-pure.cl
+++ b/OpenCL/m19500_a0-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_rp.h"
 #include "inc_rp.cl"
@@ -26,15 +27,15 @@ typedef struct devise_hash
 } devise_hash_t;
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m19500_mxx (KERN_ATTR_RULES_ESALT (devise_hash_t))
@@ -51,7 +52,7 @@ KERNEL_FQ void m19500_mxx (KERN_ATTR_RULES_ESALT (devise_hash_t))
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -62,7 +63,7 @@ KERNEL_FQ void m19500_mxx (KERN_ATTR_RULES_ESALT (devise_hash_t))
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -186,7 +187,7 @@ KERNEL_FQ void m19500_sxx (KERN_ATTR_RULES_ESALT (devise_hash_t))
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -197,7 +198,7 @@ KERNEL_FQ void m19500_sxx (KERN_ATTR_RULES_ESALT (devise_hash_t))
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m19500_a1-pure.cl b/OpenCL/m19500_a1-pure.cl
index 9760a360a..464f534f4 100644
--- a/OpenCL/m19500_a1-pure.cl
+++ b/OpenCL/m19500_a1-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_scalar.cl"
 #include "inc_hash_sha1.cl"
@@ -24,15 +25,15 @@ typedef struct devise_hash
 } devise_hash_t;
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m19500_mxx (KERN_ATTR_ESALT (devise_hash_t))
@@ -49,7 +50,7 @@ KERNEL_FQ void m19500_mxx (KERN_ATTR_ESALT (devise_hash_t))
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -60,7 +61,7 @@ KERNEL_FQ void m19500_mxx (KERN_ATTR_ESALT (devise_hash_t))
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -180,7 +181,7 @@ KERNEL_FQ void m19500_sxx (KERN_ATTR_ESALT (devise_hash_t))
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -191,7 +192,7 @@ KERNEL_FQ void m19500_sxx (KERN_ATTR_ESALT (devise_hash_t))
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m19500_a3-pure.cl b/OpenCL/m19500_a3-pure.cl
index 3900c5ff9..a8f0fa8cd 100644
--- a/OpenCL/m19500_a3-pure.cl
+++ b/OpenCL/m19500_a3-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
@@ -24,15 +25,15 @@ typedef struct devise_hash
 } devise_hash_t;
 
 #if   VECT_SIZE == 1
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i)])
 #elif VECT_SIZE == 2
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
 #elif VECT_SIZE == 4
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
 #elif VECT_SIZE == 8
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
 #elif VECT_SIZE == 16
-#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
+#define uint_to_hex_lower8_le(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
 #endif
 
 KERNEL_FQ void m19500_mxx (KERN_ATTR_VECTOR_ESALT (devise_hash_t))
@@ -49,7 +50,7 @@ KERNEL_FQ void m19500_mxx (KERN_ATTR_VECTOR_ESALT (devise_hash_t))
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -60,7 +61,7 @@ KERNEL_FQ void m19500_mxx (KERN_ATTR_VECTOR_ESALT (devise_hash_t))
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
@@ -195,7 +196,7 @@ KERNEL_FQ void m19500_sxx (KERN_ATTR_VECTOR_ESALT (devise_hash_t))
    * bin2asc table
    */
 
-  LOCAL_AS u32 l_bin2asc[256];
+  LOCAL_VK u32 l_bin2asc[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -206,7 +207,7 @@ KERNEL_FQ void m19500_sxx (KERN_ATTR_VECTOR_ESALT (devise_hash_t))
                  | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   if (gid >= gid_max) return;
 
diff --git a/OpenCL/m19600-pure.cl b/OpenCL/m19600-pure.cl
index 265d7aee5..1403a7c4d 100644
--- a/OpenCL/m19600-pure.cl
+++ b/OpenCL/m19600-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
@@ -296,17 +297,17 @@ KERNEL_FQ void m19600_comp (KERN_ATTR_TMPS_ESALT (krb5tgs_17_tmp_t, krb5tgs_17_t
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -323,7 +324,7 @@ KERNEL_FQ void m19600_comp (KERN_ATTR_TMPS_ESALT (krb5tgs_17_tmp_t, krb5tgs_17_t
     s_td4[i] = td4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m19700-pure.cl b/OpenCL/m19700-pure.cl
index df9b79e6b..4c7d9b466 100644
--- a/OpenCL/m19700-pure.cl
+++ b/OpenCL/m19700-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
@@ -296,17 +297,17 @@ KERNEL_FQ void m19700_comp (KERN_ATTR_TMPS_ESALT (krb5tgs_18_tmp_t, krb5tgs_18_t
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -323,7 +324,7 @@ KERNEL_FQ void m19700_comp (KERN_ATTR_TMPS_ESALT (krb5tgs_18_tmp_t, krb5tgs_18_t
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m19800-pure.cl b/OpenCL/m19800-pure.cl
index a25f22da8..bcadc5a10 100644
--- a/OpenCL/m19800-pure.cl
+++ b/OpenCL/m19800-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
@@ -296,17 +297,17 @@ KERNEL_FQ void m19800_comp (KERN_ATTR_TMPS_ESALT (krb5pa_17_tmp_t, krb5pa_17_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -323,7 +324,7 @@ KERNEL_FQ void m19800_comp (KERN_ATTR_TMPS_ESALT (krb5pa_17_tmp_t, krb5pa_17_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m19900-pure.cl b/OpenCL/m19900-pure.cl
index 0d8cccc01..ed646ead9 100644
--- a/OpenCL/m19900-pure.cl
+++ b/OpenCL/m19900-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha1.cl"
@@ -297,17 +298,17 @@ KERNEL_FQ void m19900_comp (KERN_ATTR_TMPS_ESALT (krb5pa_18_tmp_t, krb5pa_18_t))
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -324,7 +325,7 @@ KERNEL_FQ void m19900_comp (KERN_ATTR_TMPS_ESALT (krb5pa_18_tmp_t, krb5pa_18_t))
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
diff --git a/OpenCL/m20011-pure.cl b/OpenCL/m20011-pure.cl
index 3fed10939..247191796 100644
--- a/OpenCL/m20011-pure.cl
+++ b/OpenCL/m20011-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha512.cl"
@@ -333,17 +334,17 @@ KERNEL_FQ void m20011_comp (KERN_ATTR_TMPS_ESALT (pbkdf2_sha512_tmp_t, diskcrypt
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -360,7 +361,7 @@ KERNEL_FQ void m20011_comp (KERN_ATTR_TMPS_ESALT (pbkdf2_sha512_tmp_t, diskcrypt
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -404,14 +405,6 @@ KERNEL_FQ void m20011_comp (KERN_ATTR_TMPS_ESALT (pbkdf2_sha512_tmp_t, diskcrypt
   ukey2[6] = hc_swap32_S (h32_from_64_S (tmps[gid].out[7]));
   ukey2[7] = hc_swap32_S (l32_from_64_S (tmps[gid].out[7]));
 
-  if (dcrp_verify_header_aes (digests_buf[digests_offset].digest_buf, ukey1, ukey2, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1)
-  {
-    if (atomic_inc (&hashes_shown[digests_offset]) == 0)
-    {
-      mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, digests_offset, gid, il_pos, 0, 0);
-    }
-  }
-
   if (dcrp_verify_header_serpent (digests_buf[digests_offset].digest_buf, ukey1, ukey2) == 1)
   {
     if (atomic_inc (&hashes_shown[digests_offset]) == 0)
@@ -427,4 +420,12 @@ KERNEL_FQ void m20011_comp (KERN_ATTR_TMPS_ESALT (pbkdf2_sha512_tmp_t, diskcrypt
       mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, digests_offset, gid, il_pos, 0, 0);
     }
   }
+
+  if (dcrp_verify_header_aes (digests_buf[digests_offset].digest_buf, ukey1, ukey2, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1)
+  {
+    if (atomic_inc (&hashes_shown[digests_offset]) == 0)
+    {
+      mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, digests_offset, gid, il_pos, 0, 0);
+    }
+  }
 }
diff --git a/OpenCL/m20012-pure.cl b/OpenCL/m20012-pure.cl
index 2a75acd23..823cec6e7 100644
--- a/OpenCL/m20012-pure.cl
+++ b/OpenCL/m20012-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha512.cl"
@@ -333,17 +334,17 @@ KERNEL_FQ void m20012_comp (KERN_ATTR_TMPS_ESALT (pbkdf2_sha512_tmp_t, diskcrypt
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -360,7 +361,7 @@ KERNEL_FQ void m20012_comp (KERN_ATTR_TMPS_ESALT (pbkdf2_sha512_tmp_t, diskcrypt
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -404,14 +405,6 @@ KERNEL_FQ void m20012_comp (KERN_ATTR_TMPS_ESALT (pbkdf2_sha512_tmp_t, diskcrypt
   ukey2[6] = hc_swap32_S (h32_from_64_S (tmps[gid].out[7]));
   ukey2[7] = hc_swap32_S (l32_from_64_S (tmps[gid].out[7]));
 
-  if (dcrp_verify_header_aes (digests_buf[digests_offset].digest_buf, ukey1, ukey2, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1)
-  {
-    if (atomic_inc (&hashes_shown[digests_offset]) == 0)
-    {
-      mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, digests_offset, gid, il_pos, 0, 0);
-    }
-  }
-
   if (dcrp_verify_header_serpent (digests_buf[digests_offset].digest_buf, ukey1, ukey2) == 1)
   {
     if (atomic_inc (&hashes_shown[digests_offset]) == 0)
@@ -428,6 +421,14 @@ KERNEL_FQ void m20012_comp (KERN_ATTR_TMPS_ESALT (pbkdf2_sha512_tmp_t, diskcrypt
     }
   }
 
+  if (dcrp_verify_header_aes (digests_buf[digests_offset].digest_buf, ukey1, ukey2, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1)
+  {
+    if (atomic_inc (&hashes_shown[digests_offset]) == 0)
+    {
+      mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, digests_offset, gid, il_pos, 0, 0);
+    }
+  }
+
   u32 ukey3[8];
 
   ukey3[0] = hc_swap32_S (h32_from_64_S (tmps[gid].out[ 8]));
@@ -450,14 +451,6 @@ KERNEL_FQ void m20012_comp (KERN_ATTR_TMPS_ESALT (pbkdf2_sha512_tmp_t, diskcrypt
   ukey4[6] = hc_swap32_S (h32_from_64_S (tmps[gid].out[15]));
   ukey4[7] = hc_swap32_S (l32_from_64_S (tmps[gid].out[15]));
 
-  if (dcrp_verify_header_aes_twofish (digests_buf[digests_offset].digest_buf, ukey1, ukey2, ukey3, ukey4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1)
-  {
-    if (atomic_inc (&hashes_shown[digests_offset]) == 0)
-    {
-      mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, digests_offset, gid, il_pos, 0, 0);
-    }
-  }
-
   if (dcrp_verify_header_serpent_aes (digests_buf[digests_offset].digest_buf, ukey1, ukey2, ukey3, ukey4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1)
   {
     if (atomic_inc (&hashes_shown[digests_offset]) == 0)
@@ -473,4 +466,12 @@ KERNEL_FQ void m20012_comp (KERN_ATTR_TMPS_ESALT (pbkdf2_sha512_tmp_t, diskcrypt
       mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, digests_offset, gid, il_pos, 0, 0);
     }
   }
+
+  if (dcrp_verify_header_aes_twofish (digests_buf[digests_offset].digest_buf, ukey1, ukey2, ukey3, ukey4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1)
+  {
+    if (atomic_inc (&hashes_shown[digests_offset]) == 0)
+    {
+      mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, digests_offset, gid, il_pos, 0, 0);
+    }
+  }
 }
diff --git a/OpenCL/m20013-pure.cl b/OpenCL/m20013-pure.cl
index f7f1239a9..de9a55c45 100644
--- a/OpenCL/m20013-pure.cl
+++ b/OpenCL/m20013-pure.cl
@@ -8,6 +8,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #include "inc_common.cl"
 #include "inc_simd.cl"
 #include "inc_hash_sha512.cl"
@@ -333,17 +334,17 @@ KERNEL_FQ void m20013_comp (KERN_ATTR_TMPS_ESALT (pbkdf2_sha512_tmp_t, diskcrypt
 
   #ifdef REAL_SHM
 
-  LOCAL_AS u32 s_td0[256];
-  LOCAL_AS u32 s_td1[256];
-  LOCAL_AS u32 s_td2[256];
-  LOCAL_AS u32 s_td3[256];
-  LOCAL_AS u32 s_td4[256];
+  LOCAL_VK u32 s_td0[256];
+  LOCAL_VK u32 s_td1[256];
+  LOCAL_VK u32 s_td2[256];
+  LOCAL_VK u32 s_td3[256];
+  LOCAL_VK u32 s_td4[256];
 
-  LOCAL_AS u32 s_te0[256];
-  LOCAL_AS u32 s_te1[256];
-  LOCAL_AS u32 s_te2[256];
-  LOCAL_AS u32 s_te3[256];
-  LOCAL_AS u32 s_te4[256];
+  LOCAL_VK u32 s_te0[256];
+  LOCAL_VK u32 s_te1[256];
+  LOCAL_VK u32 s_te2[256];
+  LOCAL_VK u32 s_te3[256];
+  LOCAL_VK u32 s_te4[256];
 
   for (u32 i = lid; i < 256; i += lsz)
   {
@@ -360,7 +361,7 @@ KERNEL_FQ void m20013_comp (KERN_ATTR_TMPS_ESALT (pbkdf2_sha512_tmp_t, diskcrypt
     s_te4[i] = te4[i];
   }
 
-  barrier (CLK_LOCAL_MEM_FENCE);
+  SYNC_THREADS ();
 
   #else
 
@@ -404,14 +405,6 @@ KERNEL_FQ void m20013_comp (KERN_ATTR_TMPS_ESALT (pbkdf2_sha512_tmp_t, diskcrypt
   ukey2[6] = hc_swap32_S (h32_from_64_S (tmps[gid].out[7]));
   ukey2[7] = hc_swap32_S (l32_from_64_S (tmps[gid].out[7]));
 
-  if (dcrp_verify_header_aes (digests_buf[digests_offset].digest_buf, ukey1, ukey2, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1)
-  {
-    if (atomic_inc (&hashes_shown[digests_offset]) == 0)
-    {
-      mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, digests_offset, gid, il_pos, 0, 0);
-    }
-  }
-
   if (dcrp_verify_header_serpent (digests_buf[digests_offset].digest_buf, ukey1, ukey2) == 1)
   {
     if (atomic_inc (&hashes_shown[digests_offset]) == 0)
@@ -428,6 +421,14 @@ KERNEL_FQ void m20013_comp (KERN_ATTR_TMPS_ESALT (pbkdf2_sha512_tmp_t, diskcrypt
     }
   }
 
+  if (dcrp_verify_header_aes (digests_buf[digests_offset].digest_buf, ukey1, ukey2, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1)
+  {
+    if (atomic_inc (&hashes_shown[digests_offset]) == 0)
+    {
+      mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, digests_offset, gid, il_pos, 0, 0);
+    }
+  }
+
   u32 ukey3[8];
 
   ukey3[0] = hc_swap32_S (h32_from_64_S (tmps[gid].out[ 8]));
@@ -450,14 +451,6 @@ KERNEL_FQ void m20013_comp (KERN_ATTR_TMPS_ESALT (pbkdf2_sha512_tmp_t, diskcrypt
   ukey4[6] = hc_swap32_S (h32_from_64_S (tmps[gid].out[15]));
   ukey4[7] = hc_swap32_S (l32_from_64_S (tmps[gid].out[15]));
 
-  if (dcrp_verify_header_aes_twofish (digests_buf[digests_offset].digest_buf, ukey1, ukey2, ukey3, ukey4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1)
-  {
-    if (atomic_inc (&hashes_shown[digests_offset]) == 0)
-    {
-      mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, digests_offset, gid, il_pos, 0, 0);
-    }
-  }
-
   if (dcrp_verify_header_serpent_aes (digests_buf[digests_offset].digest_buf, ukey1, ukey2, ukey3, ukey4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1)
   {
     if (atomic_inc (&hashes_shown[digests_offset]) == 0)
@@ -474,6 +467,14 @@ KERNEL_FQ void m20013_comp (KERN_ATTR_TMPS_ESALT (pbkdf2_sha512_tmp_t, diskcrypt
     }
   }
 
+  if (dcrp_verify_header_aes_twofish (digests_buf[digests_offset].digest_buf, ukey1, ukey2, ukey3, ukey4, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1)
+  {
+    if (atomic_inc (&hashes_shown[digests_offset]) == 0)
+    {
+      mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, digests_offset, gid, il_pos, 0, 0);
+    }
+  }
+
   u32 ukey5[8];
 
   ukey5[0] = hc_swap32_S (h32_from_64_S (tmps[gid].out[16]));
@@ -496,14 +497,6 @@ KERNEL_FQ void m20013_comp (KERN_ATTR_TMPS_ESALT (pbkdf2_sha512_tmp_t, diskcrypt
   ukey6[6] = hc_swap32_S (h32_from_64_S (tmps[gid].out[23]));
   ukey6[7] = hc_swap32_S (l32_from_64_S (tmps[gid].out[23]));
 
-  if (dcrp_verify_header_aes_twofish_serpent (digests_buf[digests_offset].digest_buf, ukey1, ukey2, ukey3, ukey4, ukey5, ukey6, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1)
-  {
-    if (atomic_inc (&hashes_shown[digests_offset]) == 0)
-    {
-      mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, digests_offset, gid, il_pos, 0, 0);
-    }
-  }
-
   if (dcrp_verify_header_serpent_twofish_aes (digests_buf[digests_offset].digest_buf, ukey1, ukey2, ukey3, ukey4, ukey5, ukey6, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1)
   {
     if (atomic_inc (&hashes_shown[digests_offset]) == 0)
@@ -511,4 +504,12 @@ KERNEL_FQ void m20013_comp (KERN_ATTR_TMPS_ESALT (pbkdf2_sha512_tmp_t, diskcrypt
       mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, digests_offset, gid, il_pos, 0, 0);
     }
   }
+
+  if (dcrp_verify_header_aes_twofish_serpent (digests_buf[digests_offset].digest_buf, ukey1, ukey2, ukey3, ukey4, ukey5, ukey6, s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4) == 1)
+  {
+    if (atomic_inc (&hashes_shown[digests_offset]) == 0)
+    {
+      mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, digests_offset, gid, il_pos, 0, 0);
+    }
+  }
 }
diff --git a/OpenCL/markov_be.cl b/OpenCL/markov_be.cl
index cdf716ab9..a69e5e691 100644
--- a/OpenCL/markov_be.cl
+++ b/OpenCL/markov_be.cl
@@ -6,6 +6,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #endif
 
 #define CHARSIZ 256
diff --git a/OpenCL/markov_le.cl b/OpenCL/markov_le.cl
index cb5a91321..f1feb7819 100644
--- a/OpenCL/markov_le.cl
+++ b/OpenCL/markov_le.cl
@@ -6,6 +6,7 @@
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
 #include "inc_types.h"
+#include "inc_platform.cl"
 #endif
 
 #define CHARSIZ 256
diff --git a/docs/changes.txt b/docs/changes.txt
index 33de0e704..00b39a086 100644
--- a/docs/changes.txt
+++ b/docs/changes.txt
@@ -57,10 +57,12 @@
 - OpenCL Runtime: Disable OpenCL kernel cache on Apple for Intel CPU (throws CL_BUILD_PROGRAM_FAILURE for no reason)
 - OpenCL Runtime: Improve ROCM detection and make sure to not confuse with recent AMDGPU drivers
 - OpenCL Runtime: Not using amd_bytealign (amd_bitalign is fine) on AMDGPU driver drastically reduces JiT segfaults
+- OpenCL Runtime: Unlocked maximum thread count
 - OpenCL Runtime: Update unstable mode warnings for Apple and AMDGPU drivers
 - OpenCL Runtime: Workaround JiT compiler error on AMDGPU driver compiling WPA-EAPOL-PBKDF2 OpenCL kernel
 - OpenCL Runtime: Workaround JiT compiler error on ROCM 2.3 driver if the 'inline' keyword is used in function declaration
 - OpenCL Runtime: Workaround memory allocation error on AMD driver on Windows leading to CL_MEM_OBJECT_ALLOCATION_FAILURE
+- Startup Screen: Provide an estimate of host memory requirements for the requested attack
 - Tuning Database: Updated hashcat.hctune with new models and refreshed vector width values
 - WPA/WPA2 cracking: In the potfile, replace password with PMK in order to detect already cracked networks across all WPA modes
 
@@ -75,6 +77,8 @@
 - Kernel Compile: Removed -cl-std= from all kernel build options since we're compatible to all OpenCL versions
 - Mode 16800/16801 hash format: Changed separator character from '*' to ':'
 - Requirements: Update runtime check for minimum NVIDIA driver version from 367.x to 418.56 or later
+- Requirements: Add new requirement for NVIDIA GPU: CUDA Toolkit (10.1 or later)
+- OpenCL Options: Removed --opencl-platforms filter in order to force backend device numbers to stay constant
 
 * changes v5.0.0 -> v5.1.0
 
diff --git a/docs/readme.txt b/docs/readme.txt
index 2614bdf0b..ae10eb231 100644
--- a/docs/readme.txt
+++ b/docs/readme.txt
@@ -6,7 +6,7 @@ AMD GPUs on Windows require "AMD Radeon Software Crimson Edition" (15.12 or late
 Intel CPUs require "OpenCL Runtime for Intel Core and Intel Xeon Processors" (16.1.1 or later)
 Intel GPUs on Linux require "OpenCL 2.0 GPU Driver Package for Linux" (2.0 or later)
 Intel GPUs on Windows require "OpenCL Driver for Intel Iris and Intel HD Graphics"
-NVIDIA GPUs require "NVIDIA Driver" (418.56 or later)
+NVIDIA GPUs require "NVIDIA Driver" (418.56 or later) and "CUDA Toolkit" (10.1 or later)
 
 ##
 ## Features
diff --git a/extra/tab_completion/hashcat.sh b/extra/tab_completion/hashcat.sh
index 2932ddad1..e44b6e0ac 100644
--- a/extra/tab_completion/hashcat.sh
+++ b/extra/tab_completion/hashcat.sh
@@ -126,7 +126,7 @@ _hashcat_get_permutations ()
   fi
 }
 
-_hashcat_opencl_devices ()
+_hashcat_backend_devices ()
 {
   local num_devices=0
 
@@ -180,7 +180,7 @@ _hashcat ()
   local HCCAPX_MESSAGE_PAIRS="0 1 2 3 4 5"
   local OUTFILE_FORMATS="1 2 3 4 5 6 7 8 9 10 11 12 13 14 15"
   local OPENCL_DEVICE_TYPES="1 2 3"
-  local OPENCL_VECTOR_WIDTH="1 2 4 8 16"
+  local BACKEND_VECTOR_WIDTH="1 2 4 8 16"
   local DEBUG_MODE="1 2 3 4"
   local WORKLOAD_PROFILE="1 2 3 4"
   local BRAIN_CLIENT_FEATURES="1 2 3"
@@ -189,8 +189,8 @@ _hashcat ()
   local BUILD_IN_CHARSETS='?l ?u ?d ?a ?b ?s ?h ?H'
 
   local SHORT_OPTS="-m -a -V -v -h -b -t -o -p -c -d -w -n -u -j -k -r -g -1 -2 -3 -4 -i -I -s -l -O -S -z"
-  local LONG_OPTS="--hash-type --attack-mode --version --help --quiet --benchmark --benchmark-all --hex-salt --hex-wordlist --hex-charset --force --status --status-json --status-timer --machine-readable --loopback --markov-hcstat2 --markov-disable --markov-classic --markov-threshold --runtime --session --speed-only --progress-only --restore --restore-file-path --restore-disable --outfile --outfile-format --outfile-autohex-disable --outfile-check-timer --outfile-check-dir --wordlist-autohex-disable --separator --show --left --username --remove --remove-timer --potfile-disable --potfile-path --debug-mode --debug-file --induction-dir --segment-size --bitmap-min --bitmap-max --cpu-affinity --example-hashes --opencl-info --opencl-devices --opencl-platforms --opencl-device-types --opencl-vector-width --workload-profile --kernel-accel --kernel-loops --kernel-threads --spin-damp --hwmon-disable --hwmon-temp-abort --skip --limit --keyspace --rule-left --rule-right --rules-file --generate-rules --generate-rules-func-min --generate-rules-func-max --generate-rules-seed --custom-charset1 --custom-charset2 --custom-charset3 --custom-charset4 --increment --increment-min --increment-max --logfile-disable --scrypt-tmto --keyboard-layout-mapping --truecrypt-keyfiles --veracrypt-keyfiles --veracrypt-pim-start --veracrypt-pim-stop --stdout --keep-guessing --hccapx-message-pair --nonce-error-corrections --encoding-from --encoding-to --optimized-kernel-enable --self-test-disable  --slow-candidates --brain-server --brain-client --brain-client-features --brain-host --brain-port --brain-session --brain-session-whitelist --brain-password"
-  local OPTIONS="-m -a -t -o -p -c -d -w -n -u -j -k -r -g -1 -2 -3 -4 -s -l --hash-type --attack-mode --status-timer --markov-hcstat2 --markov-threshold --runtime --session --timer --outfile --outfile-format --outfile-check-timer --outfile-check-dir --separator --remove-timer --potfile-path --restore-file-path --debug-mode --debug-file --induction-dir --segment-size --bitmap-min --bitmap-max --cpu-affinity --opencl-devices --opencl-platforms --opencl-device-types --opencl-vector-width --workload-profile --kernel-accel --kernel-loops --kernel-threads --spin-damp --hwmon-temp-abort --skip --limit --rule-left --rule-right --rules-file --generate-rules --generate-rules-func-min --generate-rules-func-max --generate-rules-seed --custom-charset1 --custom-charset2 --custom-charset3 --custom-charset4 --increment-min --increment-max --scrypt-tmto --keyboard-layout-mapping --truecrypt-keyfiles --veracrypt-keyfiles --veracrypt-pim-start --veracrypt-pim-stop --hccapx-message-pair --nonce-error-corrections --encoding-from --encoding-to --brain-client-features --brain-host --brain-password --brain-port --brain-session --brain-whitelist-session --stdin-timeout-abort"
+  local LONG_OPTS="--hash-type --attack-mode --version --help --quiet --benchmark --benchmark-all --hex-salt --hex-wordlist --hex-charset --force --status --status-json --status-timer --machine-readable --loopback --markov-hcstat2 --markov-disable --markov-classic --markov-threshold --runtime --session --speed-only --progress-only --restore --restore-file-path --restore-disable --outfile --outfile-format --outfile-autohex-disable --outfile-check-timer --outfile-check-dir --wordlist-autohex-disable --separator --show --left --username --remove --remove-timer --potfile-disable --potfile-path --debug-mode --debug-file --induction-dir --segment-size --bitmap-min --bitmap-max --cpu-affinity --example-hashes --backend-info --backend-devices --opencl-device-types --backend-vector-width --workload-profile --kernel-accel --kernel-loops --kernel-threads --spin-damp --hwmon-disable --hwmon-temp-abort --skip --limit --keyspace --rule-left --rule-right --rules-file --generate-rules --generate-rules-func-min --generate-rules-func-max --generate-rules-seed --custom-charset1 --custom-charset2 --custom-charset3 --custom-charset4 --increment --increment-min --increment-max --logfile-disable --scrypt-tmto --keyboard-layout-mapping --truecrypt-keyfiles --veracrypt-keyfiles --veracrypt-pim-start --veracrypt-pim-stop --stdout --keep-guessing --hccapx-message-pair --nonce-error-corrections --encoding-from --encoding-to --optimized-kernel-enable --self-test-disable  --slow-candidates --brain-server --brain-client --brain-client-features --brain-host --brain-port --brain-session --brain-session-whitelist --brain-password"
+  local OPTIONS="-m -a -t -o -p -c -d -w -n -u -j -k -r -g -1 -2 -3 -4 -s -l --hash-type --attack-mode --status-timer --markov-hcstat2 --markov-threshold --runtime --session --timer --outfile --outfile-format --outfile-check-timer --outfile-check-dir --separator --remove-timer --potfile-path --restore-file-path --debug-mode --debug-file --induction-dir --segment-size --bitmap-min --bitmap-max --cpu-affinity --backend-devices --opencl-device-types --backend-vector-width --workload-profile --kernel-accel --kernel-loops --kernel-threads --spin-damp --hwmon-temp-abort --skip --limit --rule-left --rule-right --rules-file --generate-rules --generate-rules-func-min --generate-rules-func-max --generate-rules-seed --custom-charset1 --custom-charset2 --custom-charset3 --custom-charset4 --increment-min --increment-max --scrypt-tmto --keyboard-layout-mapping --truecrypt-keyfiles --veracrypt-keyfiles --veracrypt-pim-start --veracrypt-pim-stop --hccapx-message-pair --nonce-error-corrections --encoding-from --encoding-to --brain-client-features --brain-host --brain-password --brain-port --brain-session --brain-whitelist-session --stdin-timeout-abort"
 
   COMPREPLY=()
   local cur="${COMP_WORDS[COMP_CWORD]}"
@@ -250,8 +250,8 @@ _hashcat ()
       return 0
       ;;
 
-     -d|--opencl-devices)
-      _hashcat_opencl_devices
+     -d|--backend-devices)
+      _hashcat_backend_devices
       local num_devices=${?}
 
       _hashcat_get_permutations ${num_devices}
@@ -265,36 +265,8 @@ _hashcat ()
       return 0
       ;;
 
-    --opencl-vector-width)
-      COMPREPLY=($(compgen -W "${OPENCL_VECTOR_WIDTH}" -- ${cur}))
-      return 0
-      ;;
-
-    --opencl-platforms)
-      local icd_list=$(ls -1 /etc/OpenCL/vendors/*.icd 2> /dev/null)
-
-      local architecture=$(getconf LONG_BIT 2> /dev/null)
-
-      if [ -z "${architecture}" ]; then
-        return 0
-      fi
-
-      # filter the icd_list (do not show 32 bit on 64bit systems and vice versa)
-
-      if [ "${architecture}" -eq 64 ]; then
-
-        icd_list=$(echo "${icd_list}" | grep -v "32.icd")
-
-      else
-
-        icd_list=$(echo "${icd_list}" | grep -v "64.icd")
-
-      fi
-
-      local number_icds=$(seq 1 $(echo "${icd_list}" | wc -l))
-
-      COMPREPLY=($(compgen -W "${number_icds}" -- ${cur}))
-
+    --backend-vector-width)
+      COMPREPLY=($(compgen -W "${BACKEND_VECTOR_WIDTH}" -- ${cur}))
       return 0
       ;;
 
@@ -471,13 +443,13 @@ _hashcat ()
       ;;
 
     -d*)
-      _hashcat_opencl_devices
+      _hashcat_backend_devices
       local num_devices=${?}
 
       _hashcat_get_permutations ${num_devices}
 
-      local opencl_devices_var="$(echo "  "${hashcat_devices_permutation} | sed 's/ / -d/g')"
-      COMPREPLY=($(compgen -W "${opencl_devices_var}" -- ${cur}))
+      local backend_devices_var="$(echo "  "${hashcat_devices_permutation} | sed 's/ / -d/g')"
+      COMPREPLY=($(compgen -W "${backend_devices_var}" -- ${cur}))
       return 0
       ;;
   esac
diff --git a/hashcat.hctune b/hashcat.hctune
index 3beea7ae3..f49236fce 100644
--- a/hashcat.hctune
+++ b/hashcat.hctune
@@ -228,7 +228,7 @@ TITAN_Xp                                        ALIAS_nv_sm50_or_higher
 TITAN_V                                         ALIAS_nv_sm50_or_higher
 TITAN_RTX                                       ALIAS_nv_sm50_or_higher
 
-Tegra_X1                                        ALIAS_nv_sm50_or_higher
+NVIDIA_Tegra_X1                                 ALIAS_nv_sm50_or_higher
 
 GeForce_910M                                    ALIAS_nv_sm50_or_higher
 GeForce_920M                                    ALIAS_nv_sm50_or_higher
@@ -312,10 +312,10 @@ ALIAS_nv_real_simd                              3       11900   2       A
 ALIAS_nv_real_simd                              3       13300   4       A       A
 ALIAS_nv_real_simd                              3       18700   8       A       A
 
-ALIAS_nv_sm50_or_higher                         3       0       4       A       A
-ALIAS_nv_sm50_or_higher                         3       10      4       A       A
-ALIAS_nv_sm50_or_higher                         3       11      4       A       A
-ALIAS_nv_sm50_or_higher                         3       12      4       A       A
+ALIAS_nv_sm50_or_higher                         3       0       8       A       A
+ALIAS_nv_sm50_or_higher                         3       10      8       A       A
+ALIAS_nv_sm50_or_higher                         3       11      8       A       A
+ALIAS_nv_sm50_or_higher                         3       12      8       A       A
 ALIAS_nv_sm50_or_higher                         3       20      4       A       A
 ALIAS_nv_sm50_or_higher                         3       21      4       A       A
 ALIAS_nv_sm50_or_higher                         3       22      4       A       A
@@ -323,18 +323,16 @@ ALIAS_nv_sm50_or_higher                         3       23      4       A
 ALIAS_nv_sm50_or_higher                         3       30      4       A       A
 ALIAS_nv_sm50_or_higher                         3       40      4       A       A
 ALIAS_nv_sm50_or_higher                         3       200     8       A       A
-ALIAS_nv_sm50_or_higher                         3       900     4       A       A
-ALIAS_nv_sm50_or_higher                         3       1000    4       A       A
-ALIAS_nv_sm50_or_higher                         3       1100    2       A       A
-ALIAS_nv_sm50_or_higher                         3       2400    4       A       A
+ALIAS_nv_sm50_or_higher                         3       900     8       A       A
+ALIAS_nv_sm50_or_higher                         3       1000    8       A       A
+ALIAS_nv_sm50_or_higher                         3       1100    4       A       A
+ALIAS_nv_sm50_or_higher                         3       2400    8       A       A
 ALIAS_nv_sm50_or_higher                         3       2410    4       A       A
 ALIAS_nv_sm50_or_higher                         3       3800    4       A       A
-ALIAS_nv_sm50_or_higher                         3       4800    4       A       A
-ALIAS_nv_sm50_or_higher                         3       5500    4       A       A
-ALIAS_nv_sm50_or_higher                         3       7300    2       A       A
-ALIAS_nv_sm50_or_higher                         3       8000    2       A       A
+ALIAS_nv_sm50_or_higher                         3       4800    8       A       A
+ALIAS_nv_sm50_or_higher                         3       5500    2       A       A
 ALIAS_nv_sm50_or_higher                         3       9900    4       A       A
-ALIAS_nv_sm50_or_higher                         3       16400   4       A       A
+ALIAS_nv_sm50_or_higher                         3       16400   8       A       A
 ALIAS_nv_sm50_or_higher                         3       18700   8       A       A
 
 ##
diff --git a/include/opencl.h b/include/backend.h
similarity index 52%
rename from include/opencl.h
rename to include/backend.h
index bd45111c2..e2ea51fb7 100644
--- a/include/opencl.h
+++ b/include/backend.h
@@ -3,8 +3,8 @@
  * License.....: MIT
  */
 
-#ifndef _OPENCL_H
-#define _OPENCL_H
+#ifndef _BACKEND_H
+#define _BACKEND_H
 
 #include <stdio.h>
 #include <errno.h>
@@ -22,8 +22,55 @@ static const char CL_VENDOR_MESA[]            = "Mesa";
 static const char CL_VENDOR_NV[]              = "NVIDIA Corporation";
 static const char CL_VENDOR_POCL[]            = "The pocl project";
 
-int  ocl_init  (hashcat_ctx_t *hashcat_ctx);
-void ocl_close (hashcat_ctx_t *hashcat_ctx);
+int  cuda_init   (hashcat_ctx_t *hashcat_ctx);
+void cuda_close  (hashcat_ctx_t *hashcat_ctx);
+
+int  nvrtc_init  (hashcat_ctx_t *hashcat_ctx);
+void nvrtc_close (hashcat_ctx_t *hashcat_ctx);
+
+int  ocl_init    (hashcat_ctx_t *hashcat_ctx);
+void ocl_close   (hashcat_ctx_t *hashcat_ctx);
+
+int hc_nvrtcCreateProgram        (hashcat_ctx_t *hashcat_ctx, nvrtcProgram *prog, const char *src, const char *name, int numHeaders, const char * const *headers, const char * const *includeNames);
+int hc_nvrtcDestroyProgram       (hashcat_ctx_t *hashcat_ctx, nvrtcProgram *prog);
+int hc_nvrtcCompileProgram       (hashcat_ctx_t *hashcat_ctx, nvrtcProgram prog, int numOptions, const char * const *options);
+int hc_nvrtcGetProgramLogSize    (hashcat_ctx_t *hashcat_ctx, nvrtcProgram prog, size_t *logSizeRet);
+int hc_nvrtcGetProgramLog        (hashcat_ctx_t *hashcat_ctx, nvrtcProgram prog, char *log);
+int hc_nvrtcGetPTXSize           (hashcat_ctx_t *hashcat_ctx, nvrtcProgram prog, size_t *ptxSizeRet);
+int hc_nvrtcGetPTX               (hashcat_ctx_t *hashcat_ctx, nvrtcProgram prog, char *ptx);
+
+int hc_cuCtxCreate               (hashcat_ctx_t *hashcat_ctx, CUcontext *pctx, unsigned int flags, CUdevice dev);
+int hc_cuCtxDestroy              (hashcat_ctx_t *hashcat_ctx, CUcontext ctx);
+int hc_cuCtxSetCurrent           (hashcat_ctx_t *hashcat_ctx, CUcontext ctx);
+int hc_cuCtxSetCacheConfig       (hashcat_ctx_t *hashcat_ctx, CUfunc_cache config);
+int hc_cuCtxSynchronize          (hashcat_ctx_t *hashcat_ctx);
+int hc_cuDeviceGetAttribute      (hashcat_ctx_t *hashcat_ctx, int *pi, CUdevice_attribute attrib, CUdevice dev);
+int hc_cuDeviceGetCount          (hashcat_ctx_t *hashcat_ctx, int *count);
+int hc_cuDeviceGet               (hashcat_ctx_t *hashcat_ctx, CUdevice *device, int ordinal);
+int hc_cuDeviceGetName           (hashcat_ctx_t *hashcat_ctx, char *name, int len, CUdevice dev);
+int hc_cuDeviceTotalMem          (hashcat_ctx_t *hashcat_ctx, size_t *bytes, CUdevice dev);
+int hc_cuDriverGetVersion        (hashcat_ctx_t *hashcat_ctx, int *driverVersion);
+int hc_cuEventCreate             (hashcat_ctx_t *hashcat_ctx, CUevent *phEvent, unsigned int Flags);
+int hc_cuEventDestroy            (hashcat_ctx_t *hashcat_ctx, CUevent hEvent);
+int hc_cuEventElapsedTime        (hashcat_ctx_t *hashcat_ctx, float *pMilliseconds, CUevent hStart, CUevent hEnd);
+int hc_cuEventQuery              (hashcat_ctx_t *hashcat_ctx, CUevent hEvent);
+int hc_cuEventRecord             (hashcat_ctx_t *hashcat_ctx, CUevent hEvent, CUstream hStream);
+int hc_cuEventSynchronize        (hashcat_ctx_t *hashcat_ctx, CUevent hEvent);
+int hc_cuFuncGetAttribute        (hashcat_ctx_t *hashcat_ctx, int *pi, CUfunction_attribute attrib, CUfunction hfunc);
+int hc_cuFuncSetAttribute        (hashcat_ctx_t *hashcat_ctx, CUfunction hfunc, CUfunction_attribute attrib, int value);
+int hc_cuInit                    (hashcat_ctx_t *hashcat_ctx, unsigned int Flags);
+int hc_cuLaunchKernel            (hashcat_ctx_t *hashcat_ctx, CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra);
+int hc_cuMemAlloc                (hashcat_ctx_t *hashcat_ctx, CUdeviceptr *dptr, size_t bytesize);
+int hc_cuMemcpyDtoD              (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount);
+int hc_cuMemcpyDtoH              (hashcat_ctx_t *hashcat_ctx, void *dstHost, CUdeviceptr srcDevice, size_t ByteCount);
+int hc_cuMemcpyHtoD              (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount);
+int hc_cuMemFree                 (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dptr);
+int hc_cuModuleGetFunction       (hashcat_ctx_t *hashcat_ctx, CUfunction *hfunc, CUmodule hmod, const char *name);
+int hc_cuModuleLoadDataEx        (hashcat_ctx_t *hashcat_ctx, CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues);
+int hc_cuModuleUnload            (hashcat_ctx_t *hashcat_ctx, CUmodule hmod);
+int hc_cuStreamCreate            (hashcat_ctx_t *hashcat_ctx, CUstream *phStream, unsigned int Flags);
+int hc_cuStreamDestroy           (hashcat_ctx_t *hashcat_ctx, CUstream hStream);
+int hc_cuStreamSynchronize       (hashcat_ctx_t *hashcat_ctx, CUstream hStream);
 
 int hc_clBuildProgram            (hashcat_ctx_t *hashcat_ctx, cl_program program, cl_uint num_devices, const cl_device_id *device_list, const char *options, void (CL_CALLBACK *pfn_notify) (cl_program program, void *user_data), void *user_data);
 int hc_clCreateBuffer            (hashcat_ctx_t *hashcat_ctx, cl_context context, cl_mem_flags flags, size_t size, void *host_ptr, cl_mem *mem);
@@ -64,16 +111,21 @@ int choose_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param,
 
 void rebuild_pws_compressed_append (hc_device_param_t *device_param, const u64 pws_cnt, const u8 chr);
 
-int run_kernel            (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u32 kern_run, const u64 num, const u32 event_update, const u32 iteration);
-int run_kernel_mp         (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u32 kern_run, const u64 num);
-int run_kernel_tm         (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param);
-int run_kernel_amp        (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u64 num);
-int run_kernel_atinit     (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, cl_mem buf, const u64 num);
-int run_kernel_memset     (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, cl_mem buf, const u32 value, const u64 size);
-int run_kernel_bzero      (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, cl_mem buf, const u64 size);
-int run_kernel_decompress (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u64 num);
-int run_copy              (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u64 pws_cnt);
-int run_cracker           (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u64 pws_cnt);
+int run_cuda_kernel_atinit    (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, CUdeviceptr buf, const u64 num);
+int run_cuda_kernel_memset    (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, CUdeviceptr buf, const u32 value, const u64 size);
+int run_cuda_kernel_bzero     (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, CUdeviceptr buf, const u64 size);
+
+int run_opencl_kernel_atinit  (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, cl_mem buf, const u64 num);
+int run_opencl_kernel_memset  (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, cl_mem buf, const u32 value, const u64 size);
+int run_opencl_kernel_bzero   (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, cl_mem buf, const u64 size);
+
+int run_kernel                (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u32 kern_run, const u64 num, const u32 event_update, const u32 iteration);
+int run_kernel_mp             (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u32 kern_run, const u64 num);
+int run_kernel_tm             (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param);
+int run_kernel_amp            (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u64 num);
+int run_kernel_decompress     (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u64 num);
+int run_copy                  (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u64 pws_cnt);
+int run_cracker               (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u64 pws_cnt);
 
 void generate_source_kernel_filename     (const bool slow_candidates, const u32 attack_exec, const u32 attack_kern, const u32 kern_type, const u32 opti_type, char *shared_dir, char *source_file);
 void generate_cached_kernel_filename     (const bool slow_candidates, const u32 attack_exec, const u32 attack_kern, const u32 kern_type, const u32 opti_type, char *profile_dir, const char *device_name_chksum, char *cached_file);
@@ -82,20 +134,20 @@ void generate_cached_kernel_mp_filename  (const u32 opti_type, const u64 opts_ty
 void generate_source_kernel_amp_filename (const u32 attack_kern, char *shared_dir, char *source_file);
 void generate_cached_kernel_amp_filename (const u32 attack_kern, char *profile_dir, const char *device_name_chksum, char *cached_file);
 
-int  opencl_ctx_init                  (hashcat_ctx_t *hashcat_ctx);
-void opencl_ctx_destroy               (hashcat_ctx_t *hashcat_ctx);
+int  backend_ctx_init                  (hashcat_ctx_t *hashcat_ctx);
+void backend_ctx_destroy               (hashcat_ctx_t *hashcat_ctx);
 
-int  opencl_ctx_devices_init          (hashcat_ctx_t *hashcat_ctx, const int comptime);
-void opencl_ctx_devices_destroy       (hashcat_ctx_t *hashcat_ctx);
-void opencl_ctx_devices_sync_tuning   (hashcat_ctx_t *hashcat_ctx);
-void opencl_ctx_devices_update_power  (hashcat_ctx_t *hashcat_ctx);
-void opencl_ctx_devices_kernel_loops  (hashcat_ctx_t *hashcat_ctx);
+int  backend_ctx_devices_init          (hashcat_ctx_t *hashcat_ctx, const int comptime);
+void backend_ctx_devices_destroy       (hashcat_ctx_t *hashcat_ctx);
+void backend_ctx_devices_sync_tuning   (hashcat_ctx_t *hashcat_ctx);
+void backend_ctx_devices_update_power  (hashcat_ctx_t *hashcat_ctx);
+void backend_ctx_devices_kernel_loops  (hashcat_ctx_t *hashcat_ctx);
 
-int  opencl_session_begin             (hashcat_ctx_t *hashcat_ctx);
-void opencl_session_destroy           (hashcat_ctx_t *hashcat_ctx);
-void opencl_session_reset             (hashcat_ctx_t *hashcat_ctx);
-int  opencl_session_update_combinator (hashcat_ctx_t *hashcat_ctx);
-int  opencl_session_update_mp         (hashcat_ctx_t *hashcat_ctx);
-int  opencl_session_update_mp_rl      (hashcat_ctx_t *hashcat_ctx, const u32 css_cnt_l, const u32 css_cnt_r);
+int  backend_session_begin             (hashcat_ctx_t *hashcat_ctx);
+void backend_session_destroy           (hashcat_ctx_t *hashcat_ctx);
+void backend_session_reset             (hashcat_ctx_t *hashcat_ctx);
+int  backend_session_update_combinator (hashcat_ctx_t *hashcat_ctx);
+int  backend_session_update_mp         (hashcat_ctx_t *hashcat_ctx);
+int  backend_session_update_mp_rl      (hashcat_ctx_t *hashcat_ctx, const u32 css_cnt_l, const u32 css_cnt_r);
 
-#endif // _OPENCL_H
+#endif // _BACKEND_H
diff --git a/include/ext_cuda.h b/include/ext_cuda.h
new file mode 100644
index 000000000..49257acbb
--- /dev/null
+++ b/include/ext_cuda.h
@@ -0,0 +1,1078 @@
+/**
+ * Author......: See docs/credits.txt
+ * License.....: MIT
+ */
+
+#ifndef _EXT_CUDA_H
+#define _EXT_CUDA_H
+
+/**
+ * from cuda.h (/usr/local/cuda-10.1/targets/x86_64-linux/include/cuda.h)
+ */
+
+#define __CUDA_API_VERSION 10010
+
+/**
+ * CUDA device pointer
+ * CUdeviceptr is defined as an unsigned integer type whose size matches the size of a pointer on the target platform.
+ */
+#if __CUDA_API_VERSION >= 3020
+
+#if defined(_WIN64) || defined(__LP64__)
+typedef unsigned long long CUdeviceptr;
+#else
+typedef unsigned int CUdeviceptr;
+#endif
+
+#endif /* __CUDA_API_VERSION >= 3020 */
+
+typedef int CUdevice;                                     /**< CUDA device */
+typedef struct CUctx_st *CUcontext;                       /**< CUDA context */
+typedef struct CUevent_st *CUevent;                       /**< CUDA event */
+typedef struct CUfunc_st *CUfunction;                     /**< CUDA function */
+typedef struct CUmod_st *CUmodule;                        /**< CUDA module */
+typedef struct CUstream_st *CUstream;                     /**< CUDA stream */
+
+typedef enum cudaError_enum {
+    /**
+     * The API call returned with no errors. In the case of query calls, this
+     * also means that the operation being queried is complete (see
+     * ::cuEventQuery() and ::cuStreamQuery()).
+     */
+    CUDA_SUCCESS                              = 0,
+
+    /**
+     * This indicates that one or more of the parameters passed to the API call
+     * is not within an acceptable range of values.
+     */
+    CUDA_ERROR_INVALID_VALUE                  = 1,
+
+    /**
+     * The API call failed because it was unable to allocate enough memory to
+     * perform the requested operation.
+     */
+    CUDA_ERROR_OUT_OF_MEMORY                  = 2,
+
+    /**
+     * This indicates that the CUDA driver has not been initialized with
+     * ::cuInit() or that initialization has failed.
+     */
+    CUDA_ERROR_NOT_INITIALIZED                = 3,
+
+    /**
+     * This indicates that the CUDA driver is in the process of shutting down.
+     */
+    CUDA_ERROR_DEINITIALIZED                  = 4,
+
+    /**
+     * This indicates profiler is not initialized for this run. This can
+     * happen when the application is running with external profiling tools
+     * like visual profiler.
+     */
+    CUDA_ERROR_PROFILER_DISABLED              = 5,
+
+    /**
+     * \deprecated
+     * This error return is deprecated as of CUDA 5.0. It is no longer an error
+     * to attempt to enable/disable the profiling via ::cuProfilerStart or
+     * ::cuProfilerStop without initialization.
+     */
+    CUDA_ERROR_PROFILER_NOT_INITIALIZED       = 6,
+
+    /**
+     * \deprecated
+     * This error return is deprecated as of CUDA 5.0. It is no longer an error
+     * to call cuProfilerStart() when profiling is already enabled.
+     */
+    CUDA_ERROR_PROFILER_ALREADY_STARTED       = 7,
+
+    /**
+     * \deprecated
+     * This error return is deprecated as of CUDA 5.0. It is no longer an error
+     * to call cuProfilerStop() when profiling is already disabled.
+     */
+    CUDA_ERROR_PROFILER_ALREADY_STOPPED       = 8,
+
+    /**
+     * This indicates that no CUDA-capable devices were detected by the installed
+     * CUDA driver.
+     */
+    CUDA_ERROR_NO_DEVICE                      = 100,
+
+    /**
+     * This indicates that the device ordinal supplied by the user does not
+     * correspond to a valid CUDA device.
+     */
+    CUDA_ERROR_INVALID_DEVICE                 = 101,
+
+
+    /**
+     * This indicates that the device kernel image is invalid. This can also
+     * indicate an invalid CUDA module.
+     */
+    CUDA_ERROR_INVALID_IMAGE                  = 200,
+
+    /**
+     * This most frequently indicates that there is no context bound to the
+     * current thread. This can also be returned if the context passed to an
+     * API call is not a valid handle (such as a context that has had
+     * ::cuCtxDestroy() invoked on it). This can also be returned if a user
+     * mixes different API versions (i.e. 3010 context with 3020 API calls).
+     * See ::cuCtxGetApiVersion() for more details.
+     */
+    CUDA_ERROR_INVALID_CONTEXT                = 201,
+
+    /**
+     * This indicated that the context being supplied as a parameter to the
+     * API call was already the active context.
+     * \deprecated
+     * This error return is deprecated as of CUDA 3.2. It is no longer an
+     * error to attempt to push the active context via ::cuCtxPushCurrent().
+     */
+    CUDA_ERROR_CONTEXT_ALREADY_CURRENT        = 202,
+
+    /**
+     * This indicates that a map or register operation has failed.
+     */
+    CUDA_ERROR_MAP_FAILED                     = 205,
+
+    /**
+     * This indicates that an unmap or unregister operation has failed.
+     */
+    CUDA_ERROR_UNMAP_FAILED                   = 206,
+
+    /**
+     * This indicates that the specified array is currently mapped and thus
+     * cannot be destroyed.
+     */
+    CUDA_ERROR_ARRAY_IS_MAPPED                = 207,
+
+    /**
+     * This indicates that the resource is already mapped.
+     */
+    CUDA_ERROR_ALREADY_MAPPED                 = 208,
+
+    /**
+     * This indicates that there is no kernel image available that is suitable
+     * for the device. This can occur when a user specifies code generation
+     * options for a particular CUDA source file that do not include the
+     * corresponding device configuration.
+     */
+    CUDA_ERROR_NO_BINARY_FOR_GPU              = 209,
+
+    /**
+     * This indicates that a resource has already been acquired.
+     */
+    CUDA_ERROR_ALREADY_ACQUIRED               = 210,
+
+    /**
+     * This indicates that a resource is not mapped.
+     */
+    CUDA_ERROR_NOT_MAPPED                     = 211,
+
+    /**
+     * This indicates that a mapped resource is not available for access as an
+     * array.
+     */
+    CUDA_ERROR_NOT_MAPPED_AS_ARRAY            = 212,
+
+    /**
+     * This indicates that a mapped resource is not available for access as a
+     * pointer.
+     */
+    CUDA_ERROR_NOT_MAPPED_AS_POINTER          = 213,
+
+    /**
+     * This indicates that an uncorrectable ECC error was detected during
+     * execution.
+     */
+    CUDA_ERROR_ECC_UNCORRECTABLE              = 214,
+
+    /**
+     * This indicates that the ::CUlimit passed to the API call is not
+     * supported by the active device.
+     */
+    CUDA_ERROR_UNSUPPORTED_LIMIT              = 215,
+
+    /**
+     * This indicates that the ::CUcontext passed to the API call can
+     * only be bound to a single CPU thread at a time but is already
+     * bound to a CPU thread.
+     */
+    CUDA_ERROR_CONTEXT_ALREADY_IN_USE         = 216,
+
+    /**
+     * This indicates that peer access is not supported across the given
+     * devices.
+     */
+    CUDA_ERROR_PEER_ACCESS_UNSUPPORTED        = 217,
+
+    /**
+     * This indicates that a PTX JIT compilation failed.
+     */
+    CUDA_ERROR_INVALID_PTX                    = 218,
+
+    /**
+     * This indicates an error with OpenGL or DirectX context.
+     */
+    CUDA_ERROR_INVALID_GRAPHICS_CONTEXT       = 219,
+
+    /**
+    * This indicates that an uncorrectable NVLink error was detected during the
+    * execution.
+    */
+    CUDA_ERROR_NVLINK_UNCORRECTABLE           = 220,
+
+    /**
+    * This indicates that the PTX JIT compiler library was not found.
+    */
+    CUDA_ERROR_JIT_COMPILER_NOT_FOUND         = 221,
+
+    /**
+     * This indicates that the device kernel source is invalid.
+     */
+    CUDA_ERROR_INVALID_SOURCE                 = 300,
+
+    /**
+     * This indicates that the file specified was not found.
+     */
+    CUDA_ERROR_FILE_NOT_FOUND                 = 301,
+
+    /**
+     * This indicates that a link to a shared object failed to resolve.
+     */
+    CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302,
+
+    /**
+     * This indicates that initialization of a shared object failed.
+     */
+    CUDA_ERROR_SHARED_OBJECT_INIT_FAILED      = 303,
+
+    /**
+     * This indicates that an OS call failed.
+     */
+    CUDA_ERROR_OPERATING_SYSTEM               = 304,
+
+    /**
+     * This indicates that a resource handle passed to the API call was not
+     * valid. Resource handles are opaque types like ::CUstream and ::CUevent.
+     */
+    CUDA_ERROR_INVALID_HANDLE                 = 400,
+
+    /**
+     * This indicates that a resource required by the API call is not in a
+     * valid state to perform the requested operation.
+     */
+    CUDA_ERROR_ILLEGAL_STATE                  = 401,
+
+    /**
+     * This indicates that a named symbol was not found. Examples of symbols
+     * are global/constant variable names, texture names, and surface names.
+     */
+    CUDA_ERROR_NOT_FOUND                      = 500,
+
+    /**
+     * This indicates that asynchronous operations issued previously have not
+     * completed yet. This result is not actually an error, but must be indicated
+     * differently than ::CUDA_SUCCESS (which indicates completion). Calls that
+     * may return this value include ::cuEventQuery() and ::cuStreamQuery().
+     */
+    CUDA_ERROR_NOT_READY                      = 600,
+
+    /**
+     * While executing a kernel, the device encountered a
+     * load or store instruction on an invalid memory address.
+     * This leaves the process in an inconsistent state and any further CUDA work
+     * will return the same error. To continue using CUDA, the process must be terminated
+     * and relaunched.
+     */
+    CUDA_ERROR_ILLEGAL_ADDRESS                = 700,
+
+    /**
+     * This indicates that a launch did not occur because it did not have
+     * appropriate resources. This error usually indicates that the user has
+     * attempted to pass too many arguments to the device kernel, or the
+     * kernel launch specifies too many threads for the kernel's register
+     * count. Passing arguments of the wrong size (i.e. a 64-bit pointer
+     * when a 32-bit int is expected) is equivalent to passing too many
+     * arguments and can also result in this error.
+     */
+    CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES        = 701,
+
+    /**
+     * This indicates that the device kernel took too long to execute. This can
+     * only occur if timeouts are enabled - see the device attribute
+     * ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT for more information.
+     * This leaves the process in an inconsistent state and any further CUDA work
+     * will return the same error. To continue using CUDA, the process must be terminated
+     * and relaunched.
+     */
+    CUDA_ERROR_LAUNCH_TIMEOUT                 = 702,
+
+    /**
+     * This error indicates a kernel launch that uses an incompatible texturing
+     * mode.
+     */
+    CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING  = 703,
+
+    /**
+     * This error indicates that a call to ::cuCtxEnablePeerAccess() is
+     * trying to re-enable peer access to a context which has already
+     * had peer access to it enabled.
+     */
+    CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED    = 704,
+
+    /**
+     * This error indicates that ::cuCtxDisablePeerAccess() is
+     * trying to disable peer access which has not been enabled yet
+     * via ::cuCtxEnablePeerAccess().
+     */
+    CUDA_ERROR_PEER_ACCESS_NOT_ENABLED        = 705,
+
+    /**
+     * This error indicates that the primary context for the specified device
+     * has already been initialized.
+     */
+    CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE         = 708,
+
+    /**
+     * This error indicates that the context current to the calling thread
+     * has been destroyed using ::cuCtxDestroy, or is a primary context which
+     * has not yet been initialized.
+     */
+    CUDA_ERROR_CONTEXT_IS_DESTROYED           = 709,
+
+    /**
+     * A device-side assert triggered during kernel execution. The context
+     * cannot be used anymore, and must be destroyed. All existing device
+     * memory allocations from this context are invalid and must be
+     * reconstructed if the program is to continue using CUDA.
+     */
+    CUDA_ERROR_ASSERT                         = 710,
+
+    /**
+     * This error indicates that the hardware resources required to enable
+     * peer access have been exhausted for one or more of the devices
+     * passed to ::cuCtxEnablePeerAccess().
+     */
+    CUDA_ERROR_TOO_MANY_PEERS                 = 711,
+
+    /**
+     * This error indicates that the memory range passed to ::cuMemHostRegister()
+     * has already been registered.
+     */
+    CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED = 712,
+
+    /**
+     * This error indicates that the pointer passed to ::cuMemHostUnregister()
+     * does not correspond to any currently registered memory region.
+     */
+    CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED     = 713,
+
+    /**
+     * While executing a kernel, the device encountered a stack error.
+     * This can be due to stack corruption or exceeding the stack size limit.
+     * This leaves the process in an inconsistent state and any further CUDA work
+     * will return the same error. To continue using CUDA, the process must be terminated
+     * and relaunched.
+     */
+    CUDA_ERROR_HARDWARE_STACK_ERROR           = 714,
+
+    /**
+     * While executing a kernel, the device encountered an illegal instruction.
+     * This leaves the process in an inconsistent state and any further CUDA work
+     * will return the same error. To continue using CUDA, the process must be terminated
+     * and relaunched.
+     */
+    CUDA_ERROR_ILLEGAL_INSTRUCTION            = 715,
+
+    /**
+     * While executing a kernel, the device encountered a load or store instruction
+     * on a memory address which is not aligned.
+     * This leaves the process in an inconsistent state and any further CUDA work
+     * will return the same error. To continue using CUDA, the process must be terminated
+     * and relaunched.
+     */
+    CUDA_ERROR_MISALIGNED_ADDRESS             = 716,
+
+    /**
+     * While executing a kernel, the device encountered an instruction
+     * which can only operate on memory locations in certain address spaces
+     * (global, shared, or local), but was supplied a memory address not
+     * belonging to an allowed address space.
+     * This leaves the process in an inconsistent state and any further CUDA work
+     * will return the same error. To continue using CUDA, the process must be terminated
+     * and relaunched.
+     */
+    CUDA_ERROR_INVALID_ADDRESS_SPACE          = 717,
+
+    /**
+     * While executing a kernel, the device program counter wrapped its address space.
+     * This leaves the process in an inconsistent state and any further CUDA work
+     * will return the same error. To continue using CUDA, the process must be terminated
+     * and relaunched.
+     */
+    CUDA_ERROR_INVALID_PC                     = 718,
+
+    /**
+     * An exception occurred on the device while executing a kernel. Common
+     * causes include dereferencing an invalid device pointer and accessing
+     * out of bounds shared memory. Less common cases can be system specific - more
+     * information about these cases can be found in the system specific user guide.
+     * This leaves the process in an inconsistent state and any further CUDA work
+     * will return the same error. To continue using CUDA, the process must be terminated
+     * and relaunched.
+     */
+    CUDA_ERROR_LAUNCH_FAILED                  = 719,
+
+    /**
+     * This error indicates that the number of blocks launched per grid for a kernel that was
+     * launched via either ::cuLaunchCooperativeKernel or ::cuLaunchCooperativeKernelMultiDevice
+     * exceeds the maximum number of blocks as allowed by ::cuOccupancyMaxActiveBlocksPerMultiprocessor
+     * or ::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags times the number of multiprocessors
+     * as specified by the device attribute ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT.
+     */
+    CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE   = 720,
+
+    /**
+     * This error indicates that the attempted operation is not permitted.
+     */
+    CUDA_ERROR_NOT_PERMITTED                  = 800,
+
+    /**
+     * This error indicates that the attempted operation is not supported
+     * on the current system or device.
+     */
+    CUDA_ERROR_NOT_SUPPORTED                  = 801,
+
+    /**
+     * This error indicates that the system is not yet ready to start any CUDA
+     * work.  To continue using CUDA, verify the system configuration is in a
+     * valid state and all required driver daemons are actively running.
+     * More information about this error can be found in the system specific
+     * user guide.
+     */
+    CUDA_ERROR_SYSTEM_NOT_READY               = 802,
+
+    /**
+     * This error indicates that there is a mismatch between the versions of
+     * the display driver and the CUDA driver. Refer to the compatibility documentation
+     * for supported versions.
+     */
+    CUDA_ERROR_SYSTEM_DRIVER_MISMATCH         = 803,
+
+    /**
+     * This error indicates that the system was upgraded to run with forward compatibility
+     * but the visible hardware detected by CUDA does not support this configuration.
+     * Refer to the compatibility documentation for the supported hardware matrix or ensure
+     * that only supported hardware is visible during initialization via the CUDA_VISIBLE_DEVICES
+     * environment variable.
+     */
+    CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE = 804,
+
+    /**
+     * This error indicates that the operation is not permitted when
+     * the stream is capturing.
+     */
+    CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED     = 900,
+
+    /**
+     * This error indicates that the current capture sequence on the stream
+     * has been invalidated due to a previous error.
+     */
+    CUDA_ERROR_STREAM_CAPTURE_INVALIDATED     = 901,
+
+    /**
+     * This error indicates that the operation would have resulted in a merge
+     * of two independent capture sequences.
+     */
+    CUDA_ERROR_STREAM_CAPTURE_MERGE           = 902,
+
+    /**
+     * This error indicates that the capture was not initiated in this stream.
+     */
+    CUDA_ERROR_STREAM_CAPTURE_UNMATCHED       = 903,
+
+    /**
+     * This error indicates that the capture sequence contains a fork that was
+     * not joined to the primary stream.
+     */
+    CUDA_ERROR_STREAM_CAPTURE_UNJOINED        = 904,
+
+    /**
+     * This error indicates that a dependency would have been created which
+     * crosses the capture sequence boundary. Only implicit in-stream ordering
+     * dependencies are allowed to cross the boundary.
+     */
+    CUDA_ERROR_STREAM_CAPTURE_ISOLATION       = 905,
+
+    /**
+     * This error indicates a disallowed implicit dependency on a current capture
+     * sequence from cudaStreamLegacy.
+     */
+    CUDA_ERROR_STREAM_CAPTURE_IMPLICIT        = 906,
+
+    /**
+     * This error indicates that the operation is not permitted on an event which
+     * was last recorded in a capturing stream.
+     */
+    CUDA_ERROR_CAPTURED_EVENT                 = 907,
+
+    /**
+     * A stream capture sequence not initiated with the ::CU_STREAM_CAPTURE_MODE_RELAXED
+     * argument to ::cuStreamBeginCapture was passed to ::cuStreamEndCapture in a
+     * different thread.
+     */
+    CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD    = 908,
+
+    /**
+     * This indicates that an unknown internal error has occurred.
+     */
+    CUDA_ERROR_UNKNOWN                        = 999
+} CUresult;
+
+/**
+ * Online compiler and linker options
+ */
+typedef enum CUjit_option_enum
+{
+    /**
+     * Max number of registers that a thread may use.\n
+     * Option type: unsigned int\n
+     * Applies to: compiler only
+     */
+    CU_JIT_MAX_REGISTERS = 0,
+
+    /**
+     * IN: Specifies minimum number of threads per block to target compilation
+     * for\n
+     * OUT: Returns the number of threads the compiler actually targeted.
+     * This restricts the resource utilization fo the compiler (e.g. max
+     * registers) such that a block with the given number of threads should be
+     * able to launch based on register limitations. Note, this option does not
+     * currently take into account any other resource limitations, such as
+     * shared memory utilization.\n
+     * Cannot be combined with ::CU_JIT_TARGET.\n
+     * Option type: unsigned int\n
+     * Applies to: compiler only
+     */
+    CU_JIT_THREADS_PER_BLOCK,
+
+    /**
+     * Overwrites the option value with the total wall clock time, in
+     * milliseconds, spent in the compiler and linker\n
+     * Option type: float\n
+     * Applies to: compiler and linker
+     */
+    CU_JIT_WALL_TIME,
+
+    /**
+     * Pointer to a buffer in which to print any log messages
+     * that are informational in nature (the buffer size is specified via
+     * option ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES)\n
+     * Option type: char *\n
+     * Applies to: compiler and linker
+     */
+    CU_JIT_INFO_LOG_BUFFER,
+
+    /**
+     * IN: Log buffer size in bytes.  Log messages will be capped at this size
+     * (including null terminator)\n
+     * OUT: Amount of log buffer filled with messages\n
+     * Option type: unsigned int\n
+     * Applies to: compiler and linker
+     */
+    CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES,
+
+    /**
+     * Pointer to a buffer in which to print any log messages that
+     * reflect errors (the buffer size is specified via option
+     * ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES)\n
+     * Option type: char *\n
+     * Applies to: compiler and linker
+     */
+    CU_JIT_ERROR_LOG_BUFFER,
+
+    /**
+     * IN: Log buffer size in bytes.  Log messages will be capped at this size
+     * (including null terminator)\n
+     * OUT: Amount of log buffer filled with messages\n
+     * Option type: unsigned int\n
+     * Applies to: compiler and linker
+     */
+    CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES,
+
+    /**
+     * Level of optimizations to apply to generated code (0 - 4), with 4
+     * being the default and highest level of optimizations.\n
+     * Option type: unsigned int\n
+     * Applies to: compiler only
+     */
+    CU_JIT_OPTIMIZATION_LEVEL,
+
+    /**
+     * No option value required. Determines the target based on the current
+     * attached context (default)\n
+     * Option type: No option value needed\n
+     * Applies to: compiler and linker
+     */
+    CU_JIT_TARGET_FROM_CUCONTEXT,
+
+    /**
+     * Target is chosen based on supplied ::CUjit_target.  Cannot be
+     * combined with ::CU_JIT_THREADS_PER_BLOCK.\n
+     * Option type: unsigned int for enumerated type ::CUjit_target\n
+     * Applies to: compiler and linker
+     */
+    CU_JIT_TARGET,
+
+    /**
+     * Specifies choice of fallback strategy if matching cubin is not found.
+     * Choice is based on supplied ::CUjit_fallback.  This option cannot be
+     * used with cuLink* APIs as the linker requires exact matches.\n
+     * Option type: unsigned int for enumerated type ::CUjit_fallback\n
+     * Applies to: compiler only
+     */
+    CU_JIT_FALLBACK_STRATEGY,
+
+    /**
+     * Specifies whether to create debug information in output (-g)
+     * (0: false, default)\n
+     * Option type: int\n
+     * Applies to: compiler and linker
+     */
+    CU_JIT_GENERATE_DEBUG_INFO,
+
+    /**
+     * Generate verbose log messages (0: false, default)\n
+     * Option type: int\n
+     * Applies to: compiler and linker
+     */
+    CU_JIT_LOG_VERBOSE,
+
+    /**
+     * Generate line number information (-lineinfo) (0: false, default)\n
+     * Option type: int\n
+     * Applies to: compiler only
+     */
+    CU_JIT_GENERATE_LINE_INFO,
+
+    /**
+     * Specifies whether to enable caching explicitly (-dlcm) \n
+     * Choice is based on supplied ::CUjit_cacheMode_enum.\n
+     * Option type: unsigned int for enumerated type ::CUjit_cacheMode_enum\n
+     * Applies to: compiler only
+     */
+    CU_JIT_CACHE_MODE,
+
+    /**
+     * The below jit options are used for internal purposes only, in this version of CUDA
+     */
+    CU_JIT_NEW_SM3X_OPT,
+    CU_JIT_FAST_COMPILE,
+
+    /**
+     * Array of device symbol names that will be relocated to the corresponing
+     * host addresses stored in ::CU_JIT_GLOBAL_SYMBOL_ADDRESSES.\n
+     * Must contain ::CU_JIT_GLOBAL_SYMBOL_COUNT entries.\n
+     * When loding a device module, driver will relocate all encountered
+     * unresolved symbols to the host addresses.\n
+     * It is only allowed to register symbols that correspond to unresolved
+     * global variables.\n
+     * It is illegal to register the same device symbol at multiple addresses.\n
+     * Option type: const char **\n
+     * Applies to: dynamic linker only
+     */
+    CU_JIT_GLOBAL_SYMBOL_NAMES,
+
+    /**
+     * Array of host addresses that will be used to relocate corresponding
+     * device symbols stored in ::CU_JIT_GLOBAL_SYMBOL_NAMES.\n
+     * Must contain ::CU_JIT_GLOBAL_SYMBOL_COUNT entries.\n
+     * Option type: void **\n
+     * Applies to: dynamic linker only
+     */
+    CU_JIT_GLOBAL_SYMBOL_ADDRESSES,
+
+    /**
+     * Number of entries in ::CU_JIT_GLOBAL_SYMBOL_NAMES and
+     * ::CU_JIT_GLOBAL_SYMBOL_ADDRESSES arrays.\n
+     * Option type: unsigned int\n
+     * Applies to: dynamic linker only
+     */
+    CU_JIT_GLOBAL_SYMBOL_COUNT,
+
+    CU_JIT_NUM_OPTIONS
+
+} CUjit_option;
+
+/**
+ * Device properties
+ */
+typedef enum CUdevice_attribute_enum {
+    CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1,              /**< Maximum number of threads per block */
+    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2,                    /**< Maximum block dimension X */
+    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3,                    /**< Maximum block dimension Y */
+    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4,                    /**< Maximum block dimension Z */
+    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5,                     /**< Maximum grid dimension X */
+    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6,                     /**< Maximum grid dimension Y */
+    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7,                     /**< Maximum grid dimension Z */
+    CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8,        /**< Maximum shared memory available per block in bytes */
+    CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = 8,            /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK */
+    CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9,              /**< Memory available on device for __constant__ variables in a CUDA C kernel in bytes */
+    CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10,                         /**< Warp size in threads */
+    CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11,                         /**< Maximum pitch in bytes allowed by memory copies */
+    CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12,           /**< Maximum number of 32-bit registers available per block */
+    CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12,               /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK */
+    CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13,                        /**< Typical clock frequency in kilohertz */
+    CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14,                 /**< Alignment requirement for textures */
+    CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15,                       /**< Device can possibly copy memory and execute a kernel concurrently. Deprecated. Use instead CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT. */
+    CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16,              /**< Number of multiprocessors on device */
+    CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17,               /**< Specifies whether there is a run time limit on kernels */
+    CU_DEVICE_ATTRIBUTE_INTEGRATED = 18,                        /**< Device is integrated with host memory */
+    CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19,               /**< Device can map host memory into CUDA address space */
+    CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20,                      /**< Compute mode (See ::CUcomputemode for details) */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH = 21,           /**< Maximum 1D texture width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 22,           /**< Maximum 2D texture width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 23,          /**< Maximum 2D texture height */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 24,           /**< Maximum 3D texture width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 25,          /**< Maximum 3D texture height */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 26,           /**< Maximum 3D texture depth */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH = 27,   /**< Maximum 2D layered texture width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT = 28,  /**< Maximum 2D layered texture height */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS = 29,  /**< Maximum layers in a 2D layered texture */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH = 27,     /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT = 28,    /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES = 29, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS */
+    CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30,                 /**< Alignment requirement for surfaces */
+    CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31,                /**< Device can possibly execute multiple kernels concurrently */
+    CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32,                       /**< Device has ECC support enabled */
+    CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33,                        /**< PCI bus ID of the device */
+    CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34,                     /**< PCI device ID of the device */
+    CU_DEVICE_ATTRIBUTE_TCC_DRIVER = 35,                        /**< Device is using TCC driver model */
+    CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36,                 /**< Peak memory clock frequency in kilohertz */
+    CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH = 37,           /**< Global memory bus width in bits */
+    CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 38,                     /**< Size of L2 cache in bytes */
+    CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39,    /**< Maximum resident threads per multiprocessor */
+    CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40,                /**< Number of asynchronous engines */
+    CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41,                /**< Device shares a unified address space with the host */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH = 42,   /**< Maximum 1D layered texture width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS = 43,  /**< Maximum layers in a 1D layered texture */
+    CU_DEVICE_ATTRIBUTE_CAN_TEX2D_GATHER = 44,                  /**< Deprecated, do not use. */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH = 45,    /**< Maximum 2D texture width if CUDA_ARRAY3D_TEXTURE_GATHER is set */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT = 46,   /**< Maximum 2D texture height if CUDA_ARRAY3D_TEXTURE_GATHER is set */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE = 47, /**< Alternate maximum 3D texture width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE = 48,/**< Alternate maximum 3D texture height */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE = 49, /**< Alternate maximum 3D texture depth */
+    CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID = 50,                     /**< PCI domain ID of the device */
+    CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT = 51,           /**< Pitch alignment requirement for textures */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH = 52,      /**< Maximum cubemap texture width/height */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH = 53,  /**< Maximum cubemap layered texture width/height */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS = 54, /**< Maximum layers in a cubemap layered texture */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH = 55,           /**< Maximum 1D surface width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH = 56,           /**< Maximum 2D surface width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT = 57,          /**< Maximum 2D surface height */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH = 58,           /**< Maximum 3D surface width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT = 59,          /**< Maximum 3D surface height */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH = 60,           /**< Maximum 3D surface depth */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH = 61,   /**< Maximum 1D layered surface width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS = 62,  /**< Maximum layers in a 1D layered surface */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH = 63,   /**< Maximum 2D layered surface width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT = 64,  /**< Maximum 2D layered surface height */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS = 65,  /**< Maximum layers in a 2D layered surface */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH = 66,      /**< Maximum cubemap surface width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH = 67,  /**< Maximum cubemap layered surface width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS = 68, /**< Maximum layers in a cubemap layered surface */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH = 69,    /**< Maximum 1D linear texture width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH = 70,    /**< Maximum 2D linear texture width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT = 71,   /**< Maximum 2D linear texture height */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH = 72,    /**< Maximum 2D linear texture pitch in bytes */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH = 73, /**< Maximum mipmapped 2D texture width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT = 74,/**< Maximum mipmapped 2D texture height */
+    CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75,          /**< Major compute capability version number */
+    CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76,          /**< Minor compute capability version number */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH = 77, /**< Maximum mipmapped 1D texture width */
+    CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED = 78,       /**< Device supports stream priorities */
+    CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED = 79,         /**< Device supports caching globals in L1 */
+    CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED = 80,          /**< Device supports caching locals in L1 */
+    CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR = 81,  /**< Maximum shared memory available per multiprocessor in bytes */
+    CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82,  /**< Maximum number of 32-bit registers available per multiprocessor */
+    CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY = 83,                    /**< Device can allocate managed memory on this system */
+    CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD = 84,                    /**< Device is on a multi-GPU board */
+    CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID = 85,           /**< Unique id for a group of devices on the same multi-GPU board */
+    CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED = 86,       /**< Link between the device and the host supports native atomic operations (this is a placeholder attribute, and is not supported on any current hardware)*/
+    CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO = 87,  /**< Ratio of single precision performance (in floating-point operations per second) to double precision performance */
+    CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS = 88,            /**< Device supports coherently accessing pageable memory without calling cudaHostRegister on it */
+    CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS = 89,         /**< Device can coherently access managed memory concurrently with the CPU */
+    CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED = 90,      /**< Device supports compute preemption. */
+    CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM = 91, /**< Device can access host registered memory at the same virtual address as the CPU */
+    CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS = 92,            /**< ::cuStreamBatchMemOp and related APIs are supported. */
+    CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS = 93,     /**< 64-bit operations are supported in ::cuStreamBatchMemOp and related APIs. */
+    CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR = 94,     /**< ::CU_STREAM_WAIT_VALUE_NOR is supported. */
+    CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH = 95,                /**< Device supports launching cooperative kernels via ::cuLaunchCooperativeKernel */
+    CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH = 96,   /**< Device can participate in cooperative kernels launched via ::cuLaunchCooperativeKernelMultiDevice */
+    CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN = 97, /**< Maximum optin shared memory per block */
+    CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES = 98,           /**< Both the ::CU_STREAM_WAIT_VALUE_FLUSH flag and the ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES MemOp are supported on the device. See \ref CUDA_MEMOP for additional details. */
+    CU_DEVICE_ATTRIBUTE_HOST_REGISTER_SUPPORTED = 99,           /**< Device supports host memory registration via ::cudaHostRegister. */
+    CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES = 100, /**< Device accesses pageable memory via the host's page tables. */
+    CU_DEVICE_ATTRIBUTE_DIRECT_MANAGED_MEM_ACCESS_FROM_HOST = 101, /**< The host can directly access managed memory on the device without migration. */
+    CU_DEVICE_ATTRIBUTE_MAX
+} CUdevice_attribute;
+
+/**
+ * Function cache configurations
+ */
+typedef enum CUfunc_cache_enum {
+    CU_FUNC_CACHE_PREFER_NONE    = 0x00, /**< no preference for shared memory or L1 (default) */
+    CU_FUNC_CACHE_PREFER_SHARED  = 0x01, /**< prefer larger shared memory and smaller L1 cache */
+    CU_FUNC_CACHE_PREFER_L1      = 0x02, /**< prefer larger L1 cache and smaller shared memory */
+    CU_FUNC_CACHE_PREFER_EQUAL   = 0x03  /**< prefer equal sized L1 cache and shared memory */
+} CUfunc_cache;
+
+/**
+ * Shared memory configurations
+ */
+typedef enum CUsharedconfig_enum {
+    CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE    = 0x00, /**< set default shared memory bank size */
+    CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE  = 0x01, /**< set shared memory bank width to four bytes */
+    CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE = 0x02  /**< set shared memory bank width to eight bytes */
+} CUsharedconfig;
+
+/**
+ * Function properties
+ */
+typedef enum CUfunction_attribute_enum {
+    /**
+     * The maximum number of threads per block, beyond which a launch of the
+     * function would fail. This number depends on both the function and the
+     * device on which the function is currently loaded.
+     */
+    CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0,
+
+    /**
+     * The size in bytes of statically-allocated shared memory required by
+     * this function. This does not include dynamically-allocated shared
+     * memory requested by the user at runtime.
+     */
+    CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 1,
+
+    /**
+     * The size in bytes of user-allocated constant memory required by this
+     * function.
+     */
+    CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES = 2,
+
+    /**
+     * The size in bytes of local memory used by each thread of this function.
+     */
+    CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES = 3,
+
+    /**
+     * The number of registers used by each thread of this function.
+     */
+    CU_FUNC_ATTRIBUTE_NUM_REGS = 4,
+
+    /**
+     * The PTX virtual architecture version for which the function was
+     * compiled. This value is the major PTX version * 10 + the minor PTX
+     * version, so a PTX version 1.3 function would return the value 13.
+     * Note that this may return the undefined value of 0 for cubins
+     * compiled prior to CUDA 3.0.
+     */
+    CU_FUNC_ATTRIBUTE_PTX_VERSION = 5,
+
+    /**
+     * The binary architecture version for which the function was compiled.
+     * This value is the major binary version * 10 + the minor binary version,
+     * so a binary version 1.3 function would return the value 13. Note that
+     * this will return a value of 10 for legacy cubins that do not have a
+     * properly-encoded binary architecture version.
+     */
+    CU_FUNC_ATTRIBUTE_BINARY_VERSION = 6,
+
+    /**
+     * The attribute to indicate whether the function has been compiled with
+     * user specified option "-Xptxas --dlcm=ca" set .
+     */
+    CU_FUNC_ATTRIBUTE_CACHE_MODE_CA = 7,
+
+    /**
+     * The maximum size in bytes of dynamically-allocated shared memory that can be used by
+     * this function. If the user-specified dynamic shared memory size is larger than this
+     * value, the launch will fail.
+     * See ::cuFuncSetAttribute
+     */
+    CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES = 8,
+
+    /**
+     * On devices where the L1 cache and shared memory use the same hardware resources,
+     * this sets the shared memory carveout preference, in percent of the total shared memory.
+     * Refer to ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR.
+     * This is only a hint, and the driver can choose a different ratio if required to execute the function.
+     * See ::cuFuncSetAttribute
+     */
+    CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT = 9,
+
+    CU_FUNC_ATTRIBUTE_MAX
+} CUfunction_attribute;
+
+/**
+ * Context creation flags
+ */
+typedef enum CUctx_flags_enum {
+    CU_CTX_SCHED_AUTO          = 0x00, /**< Automatic scheduling */
+    CU_CTX_SCHED_SPIN          = 0x01, /**< Set spin as default scheduling */
+    CU_CTX_SCHED_YIELD         = 0x02, /**< Set yield as default scheduling */
+    CU_CTX_SCHED_BLOCKING_SYNC = 0x04, /**< Set blocking synchronization as default scheduling */
+    CU_CTX_BLOCKING_SYNC       = 0x04, /**< Set blocking synchronization as default scheduling
+                                         *  \deprecated This flag was deprecated as of CUDA 4.0
+                                         *  and was replaced with ::CU_CTX_SCHED_BLOCKING_SYNC. */
+    CU_CTX_SCHED_MASK          = 0x07,
+    CU_CTX_MAP_HOST            = 0x08, /**< Support mapped pinned allocations */
+    CU_CTX_LMEM_RESIZE_TO_MAX  = 0x10, /**< Keep local memory allocation after launch */
+    CU_CTX_FLAGS_MASK          = 0x1f
+} CUctx_flags;
+
+/**
+ * Stream creation flags
+ */
+typedef enum CUstream_flags_enum {
+    CU_STREAM_DEFAULT      = 0x0, /**< Default stream flag */
+    CU_STREAM_NON_BLOCKING = 0x1  /**< Stream does not synchronize with stream 0 (the NULL stream) */
+} CUstream_flags;
+
+/**
+ * Event creation flags
+ */
+typedef enum CUevent_flags_enum {
+    CU_EVENT_DEFAULT        = 0x0, /**< Default event flag */
+    CU_EVENT_BLOCKING_SYNC  = 0x1, /**< Event uses blocking synchronization */
+    CU_EVENT_DISABLE_TIMING = 0x2, /**< Event will not record timing data */
+    CU_EVENT_INTERPROCESS   = 0x4  /**< Event is suitable for interprocess use. CU_EVENT_DISABLE_TIMING must be set */
+} CUevent_flags;
+
+#ifdef _WIN32
+#define CUDAAPI __stdcall
+#else
+#define CUDAAPI
+#endif
+
+#define CUDA_API_CALL CUDAAPI
+
+typedef CUresult (CUDA_API_CALL *CUDA_CUCTXCREATE)              (CUcontext *, unsigned int, CUdevice);
+typedef CUresult (CUDA_API_CALL *CUDA_CUCTXDESTROY)             (CUcontext);
+typedef CUresult (CUDA_API_CALL *CUDA_CUCTXGETCACHECONFIG)      (CUfunc_cache *);
+typedef CUresult (CUDA_API_CALL *CUDA_CUCTXGETCURRENT)          (CUcontext *);
+typedef CUresult (CUDA_API_CALL *CUDA_CUCTXGETSHAREDMEMCONFIG)  (CUsharedconfig *);
+typedef CUresult (CUDA_API_CALL *CUDA_CUCTXPOPCURRENT)          (CUcontext *);
+typedef CUresult (CUDA_API_CALL *CUDA_CUCTXPUSHCURRENT)         (CUcontext);
+typedef CUresult (CUDA_API_CALL *CUDA_CUCTXSETCACHECONFIG)      (CUfunc_cache);
+typedef CUresult (CUDA_API_CALL *CUDA_CUCTXSETCURRENT)          (CUcontext);
+typedef CUresult (CUDA_API_CALL *CUDA_CUCTXSETSHAREDMEMCONFIG)  (CUsharedconfig);
+typedef CUresult (CUDA_API_CALL *CUDA_CUCTXSYNCHRONIZE)         ();
+typedef CUresult (CUDA_API_CALL *CUDA_CUDEVICEGETATTRIBUTE)     (int *, CUdevice_attribute, CUdevice);
+typedef CUresult (CUDA_API_CALL *CUDA_CUDEVICEGETCOUNT)         (int *);
+typedef CUresult (CUDA_API_CALL *CUDA_CUDEVICEGET)              (CUdevice *, int);
+typedef CUresult (CUDA_API_CALL *CUDA_CUDEVICEGETNAME)          (char *, int, CUdevice);
+typedef CUresult (CUDA_API_CALL *CUDA_CUDEVICETOTALMEM)         (size_t *, CUdevice);
+typedef CUresult (CUDA_API_CALL *CUDA_CUDRIVERGETVERSION)       (int *);
+typedef CUresult (CUDA_API_CALL *CUDA_CUEVENTCREATE)            (CUevent *, unsigned int);
+typedef CUresult (CUDA_API_CALL *CUDA_CUEVENTDESTROY)           (CUevent);
+typedef CUresult (CUDA_API_CALL *CUDA_CUEVENTELAPSEDTIME)       (float *, CUevent, CUevent);
+typedef CUresult (CUDA_API_CALL *CUDA_CUEVENTQUERY)             (CUevent);
+typedef CUresult (CUDA_API_CALL *CUDA_CUEVENTRECORD)            (CUevent, CUstream);
+typedef CUresult (CUDA_API_CALL *CUDA_CUEVENTSYNCHRONIZE)       (CUevent);
+typedef CUresult (CUDA_API_CALL *CUDA_CUFUNCGETATTRIBUTE)       (int *, CUfunction_attribute, CUfunction);
+typedef CUresult (CUDA_API_CALL *CUDA_CUFUNCSETATTRIBUTE)       (CUfunction, CUfunction_attribute, int);
+typedef CUresult (CUDA_API_CALL *CUDA_CUFUNCSETCACHECONFIG)     (CUfunction, CUfunc_cache);
+typedef CUresult (CUDA_API_CALL *CUDA_CUFUNCSETSHAREDMEMCONFIG) (CUfunction, CUsharedconfig);
+typedef CUresult (CUDA_API_CALL *CUDA_CUGETERRORNAME)           (CUresult, const char **);
+typedef CUresult (CUDA_API_CALL *CUDA_CUGETERRORSTRING)         (CUresult, const char **);
+typedef CUresult (CUDA_API_CALL *CUDA_CUINIT)                   (unsigned int);
+typedef CUresult (CUDA_API_CALL *CUDA_CULAUNCHKERNEL)           (CUfunction, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, CUstream, void **, void **);
+typedef CUresult (CUDA_API_CALL *CUDA_CUMEMALLOC)               (CUdeviceptr *, size_t);
+typedef CUresult (CUDA_API_CALL *CUDA_CUMEMALLOCHOST)           (void **, size_t);
+typedef CUresult (CUDA_API_CALL *CUDA_CUMEMCPYDTOD)             (CUdeviceptr, CUdeviceptr, size_t);
+typedef CUresult (CUDA_API_CALL *CUDA_CUMEMCPYDTOH)             (void *, CUdeviceptr, size_t);
+typedef CUresult (CUDA_API_CALL *CUDA_CUMEMCPYHTOD)             (CUdeviceptr, const void *, size_t);
+typedef CUresult (CUDA_API_CALL *CUDA_CUMEMFREE)                (CUdeviceptr);
+typedef CUresult (CUDA_API_CALL *CUDA_CUMEMFREEHOST)            (void *);
+typedef CUresult (CUDA_API_CALL *CUDA_CUMEMGETINFO)             (size_t *, size_t *);
+typedef CUresult (CUDA_API_CALL *CUDA_CUMEMSETD32)              (CUdeviceptr, unsigned int, size_t);
+typedef CUresult (CUDA_API_CALL *CUDA_CUMEMSETD8)               (CUdeviceptr, unsigned char, size_t);
+typedef CUresult (CUDA_API_CALL *CUDA_CUMODULEGETFUNCTION)      (CUfunction *, CUmodule, const char *);
+typedef CUresult (CUDA_API_CALL *CUDA_CUMODULEGETGLOBAL)        (CUdeviceptr *, size_t *, CUmodule, const char *);
+typedef CUresult (CUDA_API_CALL *CUDA_CUMODULELOAD)             (CUmodule *, const char *);
+typedef CUresult (CUDA_API_CALL *CUDA_CUMODULELOADDATA)         (CUmodule *, const void *);
+typedef CUresult (CUDA_API_CALL *CUDA_CUMODULELOADDATAEX)       (CUmodule *, const void *, unsigned int, CUjit_option *, void **);
+typedef CUresult (CUDA_API_CALL *CUDA_CUMODULEUNLOAD)           (CUmodule);
+typedef CUresult (CUDA_API_CALL *CUDA_CUPROFILERSTART)          ();
+typedef CUresult (CUDA_API_CALL *CUDA_CUPROFILERSTOP)           ();
+typedef CUresult (CUDA_API_CALL *CUDA_CUSTREAMCREATE)           (CUstream *, unsigned int);
+typedef CUresult (CUDA_API_CALL *CUDA_CUSTREAMDESTROY)          (CUstream);
+typedef CUresult (CUDA_API_CALL *CUDA_CUSTREAMSYNCHRONIZE)      (CUstream);
+typedef CUresult (CUDA_API_CALL *CUDA_CUSTREAMWAITEVENT)        (CUstream, CUevent, unsigned int);
+
+typedef struct hc_cuda_lib
+{
+  hc_dynlib_t lib;
+
+  CUDA_CUCTXCREATE              cuCtxCreate;
+  CUDA_CUCTXDESTROY             cuCtxDestroy;
+  CUDA_CUCTXGETCACHECONFIG      cuCtxGetCacheConfig;
+  CUDA_CUCTXGETCURRENT          cuCtxGetCurrent;
+  CUDA_CUCTXGETSHAREDMEMCONFIG  cuCtxGetSharedMemConfig;
+  CUDA_CUCTXPOPCURRENT          cuCtxPopCurrent;
+  CUDA_CUCTXPUSHCURRENT         cuCtxPushCurrent;
+  CUDA_CUCTXSETCACHECONFIG      cuCtxSetCacheConfig;
+  CUDA_CUCTXSETCURRENT          cuCtxSetCurrent;
+  CUDA_CUCTXSETSHAREDMEMCONFIG  cuCtxSetSharedMemConfig;
+  CUDA_CUCTXSYNCHRONIZE         cuCtxSynchronize;
+  CUDA_CUDEVICEGETATTRIBUTE     cuDeviceGetAttribute;
+  CUDA_CUDEVICEGETCOUNT         cuDeviceGetCount;
+  CUDA_CUDEVICEGET              cuDeviceGet;
+  CUDA_CUDEVICEGETNAME          cuDeviceGetName;
+  CUDA_CUDEVICETOTALMEM         cuDeviceTotalMem;
+  CUDA_CUDRIVERGETVERSION       cuDriverGetVersion;
+  CUDA_CUEVENTCREATE            cuEventCreate;
+  CUDA_CUEVENTDESTROY           cuEventDestroy;
+  CUDA_CUEVENTELAPSEDTIME       cuEventElapsedTime;
+  CUDA_CUEVENTQUERY             cuEventQuery;
+  CUDA_CUEVENTRECORD            cuEventRecord;
+  CUDA_CUEVENTSYNCHRONIZE       cuEventSynchronize;
+  CUDA_CUFUNCGETATTRIBUTE       cuFuncGetAttribute;
+  CUDA_CUFUNCSETATTRIBUTE       cuFuncSetAttribute;
+  CUDA_CUFUNCSETCACHECONFIG     cuFuncSetCacheConfig;
+  CUDA_CUFUNCSETSHAREDMEMCONFIG cuFuncSetSharedMemConfig;
+  CUDA_CUGETERRORNAME           cuGetErrorName;
+  CUDA_CUGETERRORSTRING         cuGetErrorString;
+  CUDA_CUINIT                   cuInit;
+  CUDA_CULAUNCHKERNEL           cuLaunchKernel;
+  CUDA_CUMEMALLOC               cuMemAlloc;
+  CUDA_CUMEMALLOCHOST           cuMemAllocHost;
+  CUDA_CUMEMCPYDTOD             cuMemcpyDtoD;
+  CUDA_CUMEMCPYDTOH             cuMemcpyDtoH;
+  CUDA_CUMEMCPYHTOD             cuMemcpyHtoD;
+  CUDA_CUMEMFREE                cuMemFree;
+  CUDA_CUMEMFREEHOST            cuMemFreeHost;
+  CUDA_CUMEMGETINFO             cuMemGetInfo;
+  CUDA_CUMEMSETD32              cuMemsetD32;
+  CUDA_CUMEMSETD8               cuMemsetD8;
+  CUDA_CUMODULEGETFUNCTION      cuModuleGetFunction;
+  CUDA_CUMODULEGETGLOBAL        cuModuleGetGlobal;
+  CUDA_CUMODULELOAD             cuModuleLoad;
+  CUDA_CUMODULELOADDATA         cuModuleLoadData;
+  CUDA_CUMODULELOADDATAEX       cuModuleLoadDataEx;
+  CUDA_CUMODULEUNLOAD           cuModuleUnload;
+  CUDA_CUPROFILERSTART          cuProfilerStart;
+  CUDA_CUPROFILERSTOP           cuProfilerStop;
+  CUDA_CUSTREAMCREATE           cuStreamCreate;
+  CUDA_CUSTREAMDESTROY          cuStreamDestroy;
+  CUDA_CUSTREAMSYNCHRONIZE      cuStreamSynchronize;
+  CUDA_CUSTREAMWAITEVENT        cuStreamWaitEvent;
+
+} hc_cuda_lib_t;
+
+typedef hc_cuda_lib_t CUDA_PTR;
+
+#endif // _EXT_CUDA_H
diff --git a/include/ext_nvrtc.h b/include/ext_nvrtc.h
new file mode 100644
index 000000000..7bbbbd15a
--- /dev/null
+++ b/include/ext_nvrtc.h
@@ -0,0 +1,85 @@
+/**
+ * Author......: See docs/credits.txt
+ * License.....: MIT
+ */
+
+#ifndef _EXT_NVRTC_H
+#define _EXT_NVRTC_H
+
+/**
+ * from cuda.h (/usr/local/cuda-10.1/targets/x86_64-linux/include/nvrtc.h)
+ */
+
+/**
+ * \ingroup error
+ * \brief   The enumerated type nvrtcResult defines API call result codes.
+ *          NVRTC API functions return nvrtcResult to indicate the call
+ *          result.
+ */
+typedef enum {
+  NVRTC_SUCCESS = 0,
+  NVRTC_ERROR_OUT_OF_MEMORY = 1,
+  NVRTC_ERROR_PROGRAM_CREATION_FAILURE = 2,
+  NVRTC_ERROR_INVALID_INPUT = 3,
+  NVRTC_ERROR_INVALID_PROGRAM = 4,
+  NVRTC_ERROR_INVALID_OPTION = 5,
+  NVRTC_ERROR_COMPILATION = 6,
+  NVRTC_ERROR_BUILTIN_OPERATION_FAILURE = 7,
+  NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION = 8,
+  NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION = 9,
+  NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID = 10,
+  NVRTC_ERROR_INTERNAL_ERROR = 11
+} nvrtcResult;
+
+/**
+ * \ingroup compilation
+ * \brief   nvrtcProgram is the unit of compilation, and an opaque handle for
+ *          a program.
+ *
+ * To compile a CUDA program string, an instance of nvrtcProgram must be
+ * created first with ::nvrtcCreateProgram, then compiled with
+ * ::nvrtcCompileProgram.
+ */
+typedef struct _nvrtcProgram *nvrtcProgram;
+
+#ifdef _WIN32
+#define NVRTCAPI __stdcall
+#else
+#define NVRTCAPI
+#endif
+
+#define NVRTC_API_CALL NVRTCAPI
+
+typedef nvrtcResult  (NVRTC_API_CALL *NVRTC_NVRTCADDNAMEEXPRESSION)  (nvrtcProgram, const char * const);
+typedef nvrtcResult  (NVRTC_API_CALL *NVRTC_NVRTCCOMPILEPROGRAM)     (nvrtcProgram, int, const char * const *);
+typedef nvrtcResult  (NVRTC_API_CALL *NVRTC_NVRTCCREATEPROGRAM)      (nvrtcProgram *, const char *, const char *, int, const char * const *, const char * const *);
+typedef nvrtcResult  (NVRTC_API_CALL *NVRTC_NVRTCDESTROYPROGRAM)     (nvrtcProgram *);
+typedef nvrtcResult  (NVRTC_API_CALL *NVRTC_NVRTCGETLOWEREDNAME)     (nvrtcProgram, const char * const, const char **);
+typedef nvrtcResult  (NVRTC_API_CALL *NVRTC_NVRTCGETPTX)             (nvrtcProgram, char *);
+typedef nvrtcResult  (NVRTC_API_CALL *NVRTC_NVRTCGETPTXSIZE)         (nvrtcProgram, size_t *);
+typedef nvrtcResult  (NVRTC_API_CALL *NVRTC_NVRTCGETPROGRAMLOG)      (nvrtcProgram, char *);
+typedef nvrtcResult  (NVRTC_API_CALL *NVRTC_NVRTCGETPROGRAMLOGSIZE)  (nvrtcProgram, size_t *);
+typedef const char * (NVRTC_API_CALL *NVRTC_NVRTCGETERRORSTRING)     (nvrtcResult);
+
+typedef struct hc_nvrtc_lib
+{
+  hc_dynlib_t lib;
+
+  NVRTC_NVRTCADDNAMEEXPRESSION  nvrtcAddNameExpression;
+  NVRTC_NVRTCCOMPILEPROGRAM     nvrtcCompileProgram;
+  NVRTC_NVRTCCREATEPROGRAM      nvrtcCreateProgram;
+  NVRTC_NVRTCDESTROYPROGRAM     nvrtcDestroyProgram;
+  NVRTC_NVRTCGETLOWEREDNAME     nvrtcGetLoweredName;
+  NVRTC_NVRTCGETPTX             nvrtcGetPTX;
+  NVRTC_NVRTCGETPTXSIZE         nvrtcGetPTXSize;
+  NVRTC_NVRTCGETPROGRAMLOG      nvrtcGetProgramLog;
+  NVRTC_NVRTCGETPROGRAMLOGSIZE  nvrtcGetProgramLogSize;
+  NVRTC_NVRTCGETERRORSTRING     nvrtcGetErrorString;
+
+} hc_nvrtc_lib_t;
+
+typedef hc_nvrtc_lib_t NVRTC_PTR;
+
+int nvrtc_make_options_array_from_string (char *string, char **options);
+
+#endif // _EXT_NVRTC_H
diff --git a/include/hwmon.h b/include/hwmon.h
index e3f64ec7d..7976cd9ac 100644
--- a/include/hwmon.h
+++ b/include/hwmon.h
@@ -11,16 +11,16 @@
 #ifndef _HWMON_H
 #define _HWMON_H
 
-int hm_get_threshold_slowdown_with_device_id (hashcat_ctx_t *hashcat_ctx, const u32 device_id);
-int hm_get_threshold_shutdown_with_device_id (hashcat_ctx_t *hashcat_ctx, const u32 device_id);
-int hm_get_temperature_with_device_id        (hashcat_ctx_t *hashcat_ctx, const u32 device_id);
-int hm_get_fanpolicy_with_device_id          (hashcat_ctx_t *hashcat_ctx, const u32 device_id);
-int hm_get_fanspeed_with_device_id           (hashcat_ctx_t *hashcat_ctx, const u32 device_id);
-int hm_get_buslanes_with_device_id           (hashcat_ctx_t *hashcat_ctx, const u32 device_id);
-int hm_get_utilization_with_device_id        (hashcat_ctx_t *hashcat_ctx, const u32 device_id);
-int hm_get_memoryspeed_with_device_id        (hashcat_ctx_t *hashcat_ctx, const u32 device_id);
-int hm_get_corespeed_with_device_id          (hashcat_ctx_t *hashcat_ctx, const u32 device_id);
-int hm_get_throttle_with_device_id           (hashcat_ctx_t *hashcat_ctx, const u32 device_id);
+int hm_get_threshold_slowdown_with_devices_idx (hashcat_ctx_t *hashcat_ctx, const int backend_device_idx);
+int hm_get_threshold_shutdown_with_devices_idx (hashcat_ctx_t *hashcat_ctx, const int backend_device_idx);
+int hm_get_temperature_with_devices_idx        (hashcat_ctx_t *hashcat_ctx, const int backend_device_idx);
+int hm_get_fanpolicy_with_devices_idx          (hashcat_ctx_t *hashcat_ctx, const int backend_device_idx);
+int hm_get_fanspeed_with_devices_idx           (hashcat_ctx_t *hashcat_ctx, const int backend_device_idx);
+int hm_get_buslanes_with_devices_idx           (hashcat_ctx_t *hashcat_ctx, const int backend_device_idx);
+int hm_get_utilization_with_devices_idx        (hashcat_ctx_t *hashcat_ctx, const int backend_device_idx);
+int hm_get_memoryspeed_with_devices_idx        (hashcat_ctx_t *hashcat_ctx, const int backend_device_idx);
+int hm_get_corespeed_with_devices_idx          (hashcat_ctx_t *hashcat_ctx, const int backend_device_idx);
+int hm_get_throttle_with_devices_idx           (hashcat_ctx_t *hashcat_ctx, const int backend_device_idx);
 
 int  hwmon_ctx_init    (hashcat_ctx_t *hashcat_ctx);
 void hwmon_ctx_destroy (hashcat_ctx_t *hashcat_ctx);
diff --git a/include/status.h b/include/status.h
index 15efe5d01..2727d6172 100644
--- a/include/status.h
+++ b/include/status.h
@@ -19,8 +19,8 @@ void format_speed_display_1k (double val,    char *buf, size_t len);
 
 int         status_get_device_info_cnt                (const hashcat_ctx_t *hashcat_ctx);
 int         status_get_device_info_active             (const hashcat_ctx_t *hashcat_ctx);
-bool        status_get_skipped_dev                    (const hashcat_ctx_t *hashcat_ctx, const int device_id);
-bool        status_get_skipped_warning_dev            (const hashcat_ctx_t *hashcat_ctx, const int device_id);
+bool        status_get_skipped_dev                    (const hashcat_ctx_t *hashcat_ctx, const int backend_devices_idx);
+bool        status_get_skipped_warning_dev            (const hashcat_ctx_t *hashcat_ctx, const int backend_devices_idx);
 char       *status_get_session                        (const hashcat_ctx_t *hashcat_ctx);
 const char *status_get_status_string                  (const hashcat_ctx_t *hashcat_ctx);
 int         status_get_status_number                  (const hashcat_ctx_t *hashcat_ctx);
@@ -35,7 +35,7 @@ int         status_get_guess_mod_count                (const hashcat_ctx_t *hash
 double      status_get_guess_mod_percent              (const hashcat_ctx_t *hashcat_ctx);
 char       *status_get_guess_charset                  (const hashcat_ctx_t *hashcat_ctx);
 int         status_get_guess_mask_length              (const hashcat_ctx_t *hashcat_ctx);
-char       *status_get_guess_candidates_dev           (const hashcat_ctx_t *hashcat_ctx, const int device_id);
+char       *status_get_guess_candidates_dev           (const hashcat_ctx_t *hashcat_ctx, const int backend_devices_idx);
 char       *status_get_hash_name                      (const hashcat_ctx_t *hashcat_ctx);
 char       *status_get_hash_target                    (const hashcat_ctx_t *hashcat_ctx);
 int         status_get_digests_done                   (const hashcat_ctx_t *hashcat_ctx);
@@ -68,12 +68,12 @@ u64         status_get_progress_skip                  (const hashcat_ctx_t *hash
 u64         status_get_progress_cur_relative_skip     (const hashcat_ctx_t *hashcat_ctx);
 u64         status_get_progress_end_relative_skip     (const hashcat_ctx_t *hashcat_ctx);
 double      status_get_hashes_msec_all                (const hashcat_ctx_t *hashcat_ctx);
-double      status_get_hashes_msec_dev                (const hashcat_ctx_t *hashcat_ctx, const int device_id);
-double      status_get_hashes_msec_dev_benchmark      (const hashcat_ctx_t *hashcat_ctx, const int device_id);
+double      status_get_hashes_msec_dev                (const hashcat_ctx_t *hashcat_ctx, const int backend_devices_idx);
+double      status_get_hashes_msec_dev_benchmark      (const hashcat_ctx_t *hashcat_ctx, const int backend_devices_idx);
 double      status_get_exec_msec_all                  (const hashcat_ctx_t *hashcat_ctx);
-double      status_get_exec_msec_dev                  (const hashcat_ctx_t *hashcat_ctx, const int device_id);
+double      status_get_exec_msec_dev                  (const hashcat_ctx_t *hashcat_ctx, const int backend_devices_idx);
 char       *status_get_speed_sec_all                  (const hashcat_ctx_t *hashcat_ctx);
-char       *status_get_speed_sec_dev                  (const hashcat_ctx_t *hashcat_ctx, const int device_id);
+char       *status_get_speed_sec_dev                  (const hashcat_ctx_t *hashcat_ctx, const int backend_devices_idx);
 int         status_get_cpt_cur_min                    (const hashcat_ctx_t *hashcat_ctx);
 int         status_get_cpt_cur_hour                   (const hashcat_ctx_t *hashcat_ctx);
 int         status_get_cpt_cur_day                    (const hashcat_ctx_t *hashcat_ctx);
@@ -81,30 +81,30 @@ int         status_get_cpt_avg_min                    (const hashcat_ctx_t *hash
 int         status_get_cpt_avg_hour                   (const hashcat_ctx_t *hashcat_ctx);
 int         status_get_cpt_avg_day                    (const hashcat_ctx_t *hashcat_ctx);
 char       *status_get_cpt                            (const hashcat_ctx_t *hashcat_ctx);
-int         status_get_salt_pos_dev                   (const hashcat_ctx_t *hashcat_ctx, const int device_id);
-int         status_get_innerloop_pos_dev              (const hashcat_ctx_t *hashcat_ctx, const int device_id);
-int         status_get_innerloop_left_dev             (const hashcat_ctx_t *hashcat_ctx, const int device_id);
-int         status_get_iteration_pos_dev              (const hashcat_ctx_t *hashcat_ctx, const int device_id);
-int         status_get_iteration_left_dev             (const hashcat_ctx_t *hashcat_ctx, const int device_id);
+int         status_get_salt_pos_dev                   (const hashcat_ctx_t *hashcat_ctx, const int backend_devices_idx);
+int         status_get_innerloop_pos_dev              (const hashcat_ctx_t *hashcat_ctx, const int backend_devices_idx);
+int         status_get_innerloop_left_dev             (const hashcat_ctx_t *hashcat_ctx, const int backend_devices_idx);
+int         status_get_iteration_pos_dev              (const hashcat_ctx_t *hashcat_ctx, const int backend_devices_idx);
+int         status_get_iteration_left_dev             (const hashcat_ctx_t *hashcat_ctx, const int backend_devices_idx);
 #ifdef WITH_BRAIN
 int         status_get_brain_session                  (const hashcat_ctx_t *hashcat_ctx);
 int         status_get_brain_attack                   (const hashcat_ctx_t *hashcat_ctx);
-int         status_get_brain_link_client_id_dev       (const hashcat_ctx_t *hashcat_ctx, const int device_id);
-int         status_get_brain_link_status_dev          (const hashcat_ctx_t *hashcat_ctx, const int device_id);
-char       *status_get_brain_link_recv_bytes_dev      (const hashcat_ctx_t *hashcat_ctx, const int device_id);
-char       *status_get_brain_link_send_bytes_dev      (const hashcat_ctx_t *hashcat_ctx, const int device_id);
-char       *status_get_brain_link_recv_bytes_sec_dev  (const hashcat_ctx_t *hashcat_ctx, const int device_id);
-char       *status_get_brain_link_send_bytes_sec_dev  (const hashcat_ctx_t *hashcat_ctx, const int device_id);
+int         status_get_brain_link_client_id_dev       (const hashcat_ctx_t *hashcat_ctx, const int backend_devices_idx);
+int         status_get_brain_link_status_dev          (const hashcat_ctx_t *hashcat_ctx, const int backend_devices_idx);
+char       *status_get_brain_link_recv_bytes_dev      (const hashcat_ctx_t *hashcat_ctx, const int backend_devices_idx);
+char       *status_get_brain_link_send_bytes_dev      (const hashcat_ctx_t *hashcat_ctx, const int backend_devices_idx);
+char       *status_get_brain_link_recv_bytes_sec_dev  (const hashcat_ctx_t *hashcat_ctx, const int backend_devices_idx);
+char       *status_get_brain_link_send_bytes_sec_dev  (const hashcat_ctx_t *hashcat_ctx, const int backend_devices_idx);
 #endif
-char       *status_get_hwmon_dev                      (const hashcat_ctx_t *hashcat_ctx, const int device_id);
-int         status_get_corespeed_dev                  (const hashcat_ctx_t *hashcat_ctx, const int device_id);
-int         status_get_memoryspeed_dev                (const hashcat_ctx_t *hashcat_ctx, const int device_id);
-u64         status_get_progress_dev                   (const hashcat_ctx_t *hashcat_ctx, const int device_id);
-double      status_get_runtime_msec_dev               (const hashcat_ctx_t *hashcat_ctx, const int device_id);
-int         status_get_kernel_accel_dev               (const hashcat_ctx_t *hashcat_ctx, const int device_id);
-int         status_get_kernel_loops_dev               (const hashcat_ctx_t *hashcat_ctx, const int device_id);
-int         status_get_kernel_threads_dev             (const hashcat_ctx_t *hashcat_ctx, const int device_id);
-int         status_get_vector_width_dev               (const hashcat_ctx_t *hashcat_ctx, const int device_id);
+char       *status_get_hwmon_dev                      (const hashcat_ctx_t *hashcat_ctx, const int backend_devices_idx);
+int         status_get_corespeed_dev                  (const hashcat_ctx_t *hashcat_ctx, const int backend_devices_idx);
+int         status_get_memoryspeed_dev                (const hashcat_ctx_t *hashcat_ctx, const int backend_devices_idx);
+u64         status_get_progress_dev                   (const hashcat_ctx_t *hashcat_ctx, const int backend_devices_idx);
+double      status_get_runtime_msec_dev               (const hashcat_ctx_t *hashcat_ctx, const int backend_devices_idx);
+int         status_get_kernel_accel_dev               (const hashcat_ctx_t *hashcat_ctx, const int backend_devices_idx);
+int         status_get_kernel_loops_dev               (const hashcat_ctx_t *hashcat_ctx, const int backend_devices_idx);
+int         status_get_kernel_threads_dev             (const hashcat_ctx_t *hashcat_ctx, const int backend_devices_idx);
+int         status_get_vector_width_dev               (const hashcat_ctx_t *hashcat_ctx, const int backend_devices_idx);
 
 int         status_progress_init                      (hashcat_ctx_t *hashcat_ctx);
 void        status_progress_destroy                   (hashcat_ctx_t *hashcat_ctx);
diff --git a/include/terminal.h b/include/terminal.h
index 1687b9a9e..e15d4ef9c 100644
--- a/include/terminal.h
+++ b/include/terminal.h
@@ -43,8 +43,8 @@ void compress_terminal_line_length (char *out_buf, const size_t keep_from_beginn
 
 void example_hashes                     (hashcat_ctx_t *hashcat_ctx);
 
-void opencl_info                        (hashcat_ctx_t *hashcat_ctx);
-void opencl_info_compact                (hashcat_ctx_t *hashcat_ctx);
+void backend_info                       (hashcat_ctx_t *hashcat_ctx);
+void backend_info_compact               (hashcat_ctx_t *hashcat_ctx);
 
 void status_progress_machine_readable   (hashcat_ctx_t *hashcat_ctx);
 void status_progress                    (hashcat_ctx_t *hashcat_ctx);
diff --git a/include/thread.h b/include/thread.h
index d9e594a06..8f06d0e29 100644
--- a/include/thread.h
+++ b/include/thread.h
@@ -18,7 +18,7 @@
 #if defined (_WIN)
 
 #define hc_thread_create(t,f,a)     t = CreateThread (NULL, 0, (LPTHREAD_START_ROUTINE) &f, a, 0, NULL)
-#define hc_thread_wait(n,a)         for (u32 i = 0; i < n; i++) WaitForSingleObject ((a)[i], INFINITE)
+#define hc_thread_wait(n,a)         for (int i = 0; i < n; i++) WaitForSingleObject ((a)[i], INFINITE)
 #define hc_thread_exit(t)           ExitThread (t)
 #define hc_thread_detach(t)         CloseHandle (t)
 
@@ -42,7 +42,7 @@
 #else
 
 #define hc_thread_create(t,f,a)     pthread_create (&t, NULL, f, a)
-#define hc_thread_wait(n,a)         for (u32 i = 0; i < n; i++) pthread_join ((a)[i], NULL)
+#define hc_thread_wait(n,a)         for (int i = 0; i < n; i++) pthread_join ((a)[i], NULL)
 #define hc_thread_exit(t)           pthread_exit (&t)
 #define hc_thread_detach(t)         pthread_detach (t)
 
diff --git a/include/types.h b/include/types.h
index 7c1960147..d4bcd581d 100644
--- a/include/types.h
+++ b/include/types.h
@@ -44,13 +44,6 @@ typedef int16_t i16;
 typedef int32_t i32;
 typedef int64_t i64;
 
-// import types from opencl
-
-//typedef uint8_t  uchar;
-//typedef uint16_t ushort;
-//typedef uint32_t uint;
-//typedef uint64_t ulong;
-
 #include "inc_types.h"
 
 // there's no such thing in plain C, therefore all vector operation cannot work in this emu
@@ -76,13 +69,13 @@ typedef struct timespec   hc_timer_t;
 #endif
 
 #if defined (_WIN)
-typedef HANDLE              hc_thread_t;
-typedef HANDLE              hc_thread_mutex_t;
-typedef HANDLE              hc_thread_semaphore_t;
+typedef HANDLE          hc_thread_t;
+typedef HANDLE          hc_thread_mutex_t;
+typedef HANDLE          hc_thread_semaphore_t;
 #else
-typedef pthread_t           hc_thread_t;
-typedef pthread_mutex_t     hc_thread_mutex_t;
-typedef sem_t               hc_thread_semaphore_t;
+typedef pthread_t       hc_thread_t;
+typedef pthread_mutex_t hc_thread_mutex_t;
+typedef sem_t           hc_thread_semaphore_t;
 #endif
 
 // enums
@@ -133,10 +126,11 @@ typedef enum event_identifier
   EVENT_MONITOR_PERFORMANCE_HINT  = 0x00000086,
   EVENT_MONITOR_NOINPUT_HINT      = 0x00000087,
   EVENT_MONITOR_NOINPUT_ABORT     = 0x00000088,
-  EVENT_OPENCL_SESSION_POST       = 0x00000090,
-  EVENT_OPENCL_SESSION_PRE        = 0x00000091,
-  EVENT_OPENCL_DEVICE_INIT_POST   = 0x00000092,
-  EVENT_OPENCL_DEVICE_INIT_PRE    = 0x00000093,
+  EVENT_BACKEND_SESSION_POST      = 0x00000090,
+  EVENT_BACKEND_SESSION_PRE       = 0x00000091,
+  EVENT_BACKEND_SESSION_HOSTMEM   = 0x00000092,
+  EVENT_BACKEND_DEVICE_INIT_POST  = 0x00000093,
+  EVENT_BACKEND_DEVICE_INIT_PRE   = 0x00000094,
   EVENT_OUTERLOOP_FINISHED        = 0x000000a0,
   EVENT_OUTERLOOP_MAINSCREEN      = 0x000000a1,
   EVENT_OUTERLOOP_STARTING        = 0x000000a2,
@@ -592,8 +586,8 @@ typedef enum user_options_defaults
   MARKOV_DISABLE           = false,
   MARKOV_THRESHOLD         = 0,
   NONCE_ERROR_CORRECTIONS  = 8,
-  OPENCL_INFO              = false,
-  OPENCL_VECTOR_WIDTH      = 0,
+  BACKEND_INFO             = false,
+  BACKEND_VECTOR_WIDTH     = 0,
   OPTIMIZED_KERNEL_ENABLE  = false,
   OUTFILE_AUTOHEX          = true,
   OUTFILE_CHECK_TIMER      = 5,
@@ -637,6 +631,9 @@ typedef enum user_options_map
 {
   IDX_ADVICE_DISABLE            = 0xff00,
   IDX_ATTACK_MODE               = 'a',
+  IDX_BACKEND_DEVICES           = 'd',
+  IDX_BACKEND_INFO              = 'I',
+  IDX_BACKEND_VECTOR_WIDTH      = 0xff27,
   IDX_BENCHMARK_ALL             = 0xff01,
   IDX_BENCHMARK                 = 'b',
   IDX_BITMAP_MAX                = 0xff02,
@@ -690,11 +687,7 @@ typedef enum user_options_map
   IDX_MARKOV_HCSTAT2            = 0xff24,
   IDX_MARKOV_THRESHOLD          = 't',
   IDX_NONCE_ERROR_CORRECTIONS   = 0xff25,
-  IDX_OPENCL_DEVICES            = 'd',
   IDX_OPENCL_DEVICE_TYPES       = 'D',
-  IDX_OPENCL_INFO               = 'I',
-  IDX_OPENCL_PLATFORMS          = 0xff26,
-  IDX_OPENCL_VECTOR_WIDTH       = 0xff27,
   IDX_OPTIMIZED_KERNEL_ENABLE   = 'O',
   IDX_OUTFILE_AUTOHEX_DISABLE   = 0xff28,
   IDX_OUTFILE_CHECK_DIR         = 0xff29,
@@ -989,29 +982,25 @@ typedef struct link_speed
 
 } link_speed_t;
 
+#include "ext_nvrtc.h"
+#include "ext_cuda.h"
 #include "ext_OpenCL.h"
 
 typedef struct hc_device_param
 {
-  cl_device_id    device;
-  cl_device_type  device_type;
+  int     device_id;
 
-  u32     device_id;
-  u32     platform_devices_id;  // for mapping with hms devices
-
-  bool    skipped;              // permanent
-  bool    skipped_warning;      // iteration
-
-  st_status_t st_status;
-
-  u32     sm_major;
-  u32     sm_minor;
-  u32     kernel_exec_timeout;
+  // this occurs if the same device (pci address) is used by multiple backend API
+  int     device_id_alias_cnt;
+  int     device_id_alias_buf[DEVICES_MAX];
 
   u8      pcie_bus;
   u8      pcie_device;
   u8      pcie_function;
 
+  bool    skipped;              // permanent
+  bool    skipped_warning;      // iteration
+
   u32     device_processors;
   u64     device_maxmem_alloc;
   u64     device_global_mem;
@@ -1019,9 +1008,16 @@ typedef struct hc_device_param
   u32     device_maxclock_frequency;
   size_t  device_maxworkgroup_size;
   u64     device_local_mem_size;
-  cl_device_local_mem_type device_local_mem_type;
+  int     device_local_mem_type;
+  char   *device_name;
 
-  u32     vector_width;
+  int     sm_major;
+  int     sm_minor;
+  u32     kernel_exec_timeout;
+
+  st_status_t st_status;
+
+  int     vector_width;
 
   u32     kernel_wgs1;
   u32     kernel_wgs12;
@@ -1198,14 +1194,6 @@ typedef struct hc_device_param
 
   hc_timer_t timer_speed;
 
-  // device specific attributes starting
-
-  char   *device_name;
-  char   *device_vendor;
-  char   *device_version;
-  char   *driver_version;
-  char   *device_opencl_version;
-
   // AMD
   bool    has_vadd3;
   bool    has_vbfe;
@@ -1219,79 +1207,6 @@ typedef struct hc_device_param
 
   double  spin_damp;
 
-  cl_platform_id platform;
-
-  cl_uint  device_vendor_id;
-  cl_uint  platform_vendor_id;
-
-  cl_kernel  kernel1;
-  cl_kernel  kernel12;
-  cl_kernel  kernel2;
-  cl_kernel  kernel23;
-  cl_kernel  kernel3;
-  cl_kernel  kernel4;
-  cl_kernel  kernel_init2;
-  cl_kernel  kernel_loop2;
-  cl_kernel  kernel_mp;
-  cl_kernel  kernel_mp_l;
-  cl_kernel  kernel_mp_r;
-  cl_kernel  kernel_amp;
-  cl_kernel  kernel_tm;
-  cl_kernel  kernel_memset;
-  cl_kernel  kernel_atinit;
-  cl_kernel  kernel_decompress;
-  cl_kernel  kernel_aux1;
-  cl_kernel  kernel_aux2;
-  cl_kernel  kernel_aux3;
-  cl_kernel  kernel_aux4;
-
-  cl_context context;
-
-  cl_program program;
-  cl_program program_mp;
-  cl_program program_amp;
-
-  cl_command_queue command_queue;
-
-  cl_mem  d_pws_buf;
-  cl_mem  d_pws_amp_buf;
-  cl_mem  d_pws_comp_buf;
-  cl_mem  d_pws_idx;
-  cl_mem  d_words_buf_l;
-  cl_mem  d_words_buf_r;
-  cl_mem  d_rules;
-  cl_mem  d_rules_c;
-  cl_mem  d_combs;
-  cl_mem  d_combs_c;
-  cl_mem  d_bfs;
-  cl_mem  d_bfs_c;
-  cl_mem  d_tm_c;
-  cl_mem  d_bitmap_s1_a;
-  cl_mem  d_bitmap_s1_b;
-  cl_mem  d_bitmap_s1_c;
-  cl_mem  d_bitmap_s1_d;
-  cl_mem  d_bitmap_s2_a;
-  cl_mem  d_bitmap_s2_b;
-  cl_mem  d_bitmap_s2_c;
-  cl_mem  d_bitmap_s2_d;
-  cl_mem  d_plain_bufs;
-  cl_mem  d_digests_buf;
-  cl_mem  d_digests_shown;
-  cl_mem  d_salt_bufs;
-  cl_mem  d_esalt_bufs;
-  cl_mem  d_tmps;
-  cl_mem  d_hooks;
-  cl_mem  d_result;
-  cl_mem  d_extra0_buf;
-  cl_mem  d_extra1_buf;
-  cl_mem  d_extra2_buf;
-  cl_mem  d_extra3_buf;
-  cl_mem  d_root_css_buf;
-  cl_mem  d_markov_css_buf;
-  cl_mem  d_st_digests_buf;
-  cl_mem  d_st_salts_buf;
-  cl_mem  d_st_esalts_buf;
-
   void   *kernel_params[PARAMCNT];
   void   *kernel_params_mp[PARAMCNT];
   void   *kernel_params_mp_r[PARAMCNT];
@@ -1326,26 +1241,186 @@ typedef struct hc_device_param
   u32     kernel_params_decompress_buf32[PARAMCNT];
   u64     kernel_params_decompress_buf64[PARAMCNT];
 
+  // API: cuda
+
+  bool              is_cuda;
+
+  int               cuda_warp_size;
+
+  CUdevice          cuda_device;
+  CUcontext         cuda_context;
+  CUstream          cuda_stream;
+
+  CUevent           cuda_event1;
+  CUevent           cuda_event2;
+
+  CUmodule          cuda_module;
+  CUmodule          cuda_module_mp;
+  CUmodule          cuda_module_amp;
+
+  CUfunction        cuda_function1;
+  CUfunction        cuda_function12;
+  CUfunction        cuda_function2;
+  CUfunction        cuda_function23;
+  CUfunction        cuda_function3;
+  CUfunction        cuda_function4;
+  CUfunction        cuda_function_init2;
+  CUfunction        cuda_function_loop2;
+  CUfunction        cuda_function_mp;
+  CUfunction        cuda_function_mp_l;
+  CUfunction        cuda_function_mp_r;
+  CUfunction        cuda_function_amp;
+  CUfunction        cuda_function_tm;
+  CUfunction        cuda_function_memset;
+  CUfunction        cuda_function_atinit;
+  CUfunction        cuda_function_decompress;
+  CUfunction        cuda_function_aux1;
+  CUfunction        cuda_function_aux2;
+  CUfunction        cuda_function_aux3;
+  CUfunction        cuda_function_aux4;
+
+  CUdeviceptr       cuda_d_pws_buf;
+  CUdeviceptr       cuda_d_pws_amp_buf;
+  CUdeviceptr       cuda_d_pws_comp_buf;
+  CUdeviceptr       cuda_d_pws_idx;
+  CUdeviceptr       cuda_d_words_buf_l;
+  CUdeviceptr       cuda_d_words_buf_r;
+  CUdeviceptr       cuda_d_rules;
+  CUdeviceptr       cuda_d_rules_c;
+  CUdeviceptr       cuda_d_combs;
+  CUdeviceptr       cuda_d_combs_c;
+  CUdeviceptr       cuda_d_bfs;
+  CUdeviceptr       cuda_d_bfs_c;
+  CUdeviceptr       cuda_d_tm_c;
+  CUdeviceptr       cuda_d_bitmap_s1_a;
+  CUdeviceptr       cuda_d_bitmap_s1_b;
+  CUdeviceptr       cuda_d_bitmap_s1_c;
+  CUdeviceptr       cuda_d_bitmap_s1_d;
+  CUdeviceptr       cuda_d_bitmap_s2_a;
+  CUdeviceptr       cuda_d_bitmap_s2_b;
+  CUdeviceptr       cuda_d_bitmap_s2_c;
+  CUdeviceptr       cuda_d_bitmap_s2_d;
+  CUdeviceptr       cuda_d_plain_bufs;
+  CUdeviceptr       cuda_d_digests_buf;
+  CUdeviceptr       cuda_d_digests_shown;
+  CUdeviceptr       cuda_d_salt_bufs;
+  CUdeviceptr       cuda_d_esalt_bufs;
+  CUdeviceptr       cuda_d_tmps;
+  CUdeviceptr       cuda_d_hooks;
+  CUdeviceptr       cuda_d_result;
+  CUdeviceptr       cuda_d_extra0_buf;
+  CUdeviceptr       cuda_d_extra1_buf;
+  CUdeviceptr       cuda_d_extra2_buf;
+  CUdeviceptr       cuda_d_extra3_buf;
+  CUdeviceptr       cuda_d_root_css_buf;
+  CUdeviceptr       cuda_d_markov_css_buf;
+  CUdeviceptr       cuda_d_st_digests_buf;
+  CUdeviceptr       cuda_d_st_salts_buf;
+  CUdeviceptr       cuda_d_st_esalts_buf;
+
+  // API: opencl
+
+  bool              is_opencl;
+
+  char             *opencl_driver_version;
+  char             *opencl_device_vendor;
+  char             *opencl_device_version;
+  char             *opencl_device_c_version;
+
+  cl_device_type    opencl_device_type;
+  cl_uint           opencl_device_vendor_id;
+  cl_uint           opencl_platform_vendor_id;
+
+  cl_device_id      opencl_device;
+  cl_context        opencl_context;
+  cl_command_queue  opencl_command_queue;
+
+  cl_program        opencl_program;
+  cl_program        opencl_program_mp;
+  cl_program        opencl_program_amp;
+
+  cl_kernel         opencl_kernel1;
+  cl_kernel         opencl_kernel12;
+  cl_kernel         opencl_kernel2;
+  cl_kernel         opencl_kernel23;
+  cl_kernel         opencl_kernel3;
+  cl_kernel         opencl_kernel4;
+  cl_kernel         opencl_kernel_init2;
+  cl_kernel         opencl_kernel_loop2;
+  cl_kernel         opencl_kernel_mp;
+  cl_kernel         opencl_kernel_mp_l;
+  cl_kernel         opencl_kernel_mp_r;
+  cl_kernel         opencl_kernel_amp;
+  cl_kernel         opencl_kernel_tm;
+  cl_kernel         opencl_kernel_memset;
+  cl_kernel         opencl_kernel_atinit;
+  cl_kernel         opencl_kernel_decompress;
+  cl_kernel         opencl_kernel_aux1;
+  cl_kernel         opencl_kernel_aux2;
+  cl_kernel         opencl_kernel_aux3;
+  cl_kernel         opencl_kernel_aux4;
+
+  cl_mem            opencl_d_pws_buf;
+  cl_mem            opencl_d_pws_amp_buf;
+  cl_mem            opencl_d_pws_comp_buf;
+  cl_mem            opencl_d_pws_idx;
+  cl_mem            opencl_d_words_buf_l;
+  cl_mem            opencl_d_words_buf_r;
+  cl_mem            opencl_d_rules;
+  cl_mem            opencl_d_rules_c;
+  cl_mem            opencl_d_combs;
+  cl_mem            opencl_d_combs_c;
+  cl_mem            opencl_d_bfs;
+  cl_mem            opencl_d_bfs_c;
+  cl_mem            opencl_d_tm_c;
+  cl_mem            opencl_d_bitmap_s1_a;
+  cl_mem            opencl_d_bitmap_s1_b;
+  cl_mem            opencl_d_bitmap_s1_c;
+  cl_mem            opencl_d_bitmap_s1_d;
+  cl_mem            opencl_d_bitmap_s2_a;
+  cl_mem            opencl_d_bitmap_s2_b;
+  cl_mem            opencl_d_bitmap_s2_c;
+  cl_mem            opencl_d_bitmap_s2_d;
+  cl_mem            opencl_d_plain_bufs;
+  cl_mem            opencl_d_digests_buf;
+  cl_mem            opencl_d_digests_shown;
+  cl_mem            opencl_d_salt_bufs;
+  cl_mem            opencl_d_esalt_bufs;
+  cl_mem            opencl_d_tmps;
+  cl_mem            opencl_d_hooks;
+  cl_mem            opencl_d_result;
+  cl_mem            opencl_d_extra0_buf;
+  cl_mem            opencl_d_extra1_buf;
+  cl_mem            opencl_d_extra2_buf;
+  cl_mem            opencl_d_extra3_buf;
+  cl_mem            opencl_d_root_css_buf;
+  cl_mem            opencl_d_markov_css_buf;
+  cl_mem            opencl_d_st_digests_buf;
+  cl_mem            opencl_d_st_salts_buf;
+  cl_mem            opencl_d_st_esalts_buf;
+
 } hc_device_param_t;
 
-typedef struct opencl_ctx
+typedef struct backend_ctx
 {
   bool                enabled;
 
   void               *ocl;
+  void               *cuda;
+  void               *nvrtc;
 
-  cl_uint             platforms_cnt;
-  cl_platform_id     *platforms;
-  char              **platforms_vendor;
-  char              **platforms_name;
-  char              **platforms_version;
-  bool               *platforms_skipped;
+  int                 backend_device_from_cuda[DEVICES_MAX];                              // from cuda device index to backend device index
+  int                 backend_device_from_opencl[DEVICES_MAX];                            // from opencl device index to backend device index
+  int                 backend_device_from_opencl_platform[CL_PLATFORMS_MAX][DEVICES_MAX]; // from opencl device index to backend device index (by platform)
 
-  cl_uint             platform_devices_cnt;
-  cl_device_id       *platform_devices;
+  int                 backend_devices_cnt;
+  int                 backend_devices_active;
+  int                 cuda_devices_cnt;
+  int                 cuda_devices_active;
+  int                 opencl_devices_cnt;
+  int                 opencl_devices_active;
 
-  u32                 devices_cnt;
-  u32                 devices_active;
+  u64                 backend_devices_filter;
 
   hc_device_param_t  *devices_param;
 
@@ -1354,10 +1429,6 @@ typedef struct opencl_ctx
   u64                 kernel_power_all;
   u64                 kernel_power_final; // we save that so that all divisions are done from the same base
 
-  u64                 opencl_platforms_filter;
-  u64                 devices_filter;
-  cl_device_type      device_types_filter;
-
   double              target_msec;
 
   bool                need_adl;
@@ -1369,7 +1440,24 @@ typedef struct opencl_ctx
 
   int                 force_jit_compilation;
 
-} opencl_ctx_t;
+  // cuda
+
+  int                 cuda_driver_version;
+
+  // opencl
+
+  cl_platform_id     *opencl_platforms;
+  cl_uint             opencl_platforms_cnt;
+  cl_device_id      **opencl_platforms_devices;
+  cl_uint            *opencl_platforms_devices_cnt;
+  char              **opencl_platforms_name;
+  char              **opencl_platforms_vendor;
+  cl_uint            *opencl_platforms_vendor_id;
+  char              **opencl_platforms_version;
+
+  cl_device_type      opencl_device_types_filter;
+
+} backend_ctx_t;
 
 typedef enum kernel_workload
 {
@@ -1690,7 +1778,7 @@ typedef struct user_options
   bool         kernel_threads_chgd;
   bool         nonce_error_corrections_chgd;
   bool         spin_damp_chgd;
-  bool         opencl_vector_width_chgd;
+  bool         backend_vector_width_chgd;
   bool         outfile_format_chgd;
   bool         remove_timer_chgd;
   bool         rp_gen_seed_chgd;
@@ -1722,7 +1810,7 @@ typedef struct user_options
   bool         machine_readable;
   bool         markov_classic;
   bool         markov_disable;
-  bool         opencl_info;
+  bool         backend_info;
   bool         optimized_kernel_enable;
   bool         outfile_autohex;
   bool         potfile_disable;
@@ -1754,9 +1842,8 @@ typedef struct user_options
   char        *induction_dir;
   char        *keyboard_layout_mapping;
   char        *markov_hcstat2;
-  char        *opencl_devices;
+  char        *backend_devices;
   char        *opencl_device_types;
-  char        *opencl_platforms;
   char        *outfile;
   char        *outfile_check_dir;
   char        *potfile_path;
@@ -1794,7 +1881,7 @@ typedef struct user_options
   u32          markov_threshold;
   u32          nonce_error_corrections;
   u32          spin_damp;
-  u32          opencl_vector_width;
+  u32          backend_vector_width;
   u32          outfile_check_timer;
   u32          outfile_format;
   u32          remove_timer;
@@ -2299,7 +2386,7 @@ typedef struct hashcat_ctx
   loopback_ctx_t        *loopback_ctx;
   mask_ctx_t            *mask_ctx;
   module_ctx_t          *module_ctx;
-  opencl_ctx_t          *opencl_ctx;
+  backend_ctx_t         *backend_ctx;
   outcheck_ctx_t        *outcheck_ctx;
   outfile_ctx_t         *outfile_ctx;
   pidfile_ctx_t         *pidfile_ctx;
diff --git a/src/Makefile b/src/Makefile
index 96f75d639..a2610f8de 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -292,13 +292,13 @@ endif # MSYS2
 ## Objects
 ##
 
-EMU_OBJS_ALL            := emu_general emu_inc_common emu_inc_scalar emu_inc_simd
+EMU_OBJS_ALL            := emu_general emu_inc_common emu_inc_platform emu_inc_scalar emu_inc_simd
 EMU_OBJS_ALL            += emu_inc_rp emu_inc_rp_optimized
 EMU_OBJS_ALL            += emu_inc_truecrypt_crc32 emu_inc_truecrypt_keyfile emu_inc_truecrypt_xts emu_inc_veracrypt_xts
 EMU_OBJS_ALL            += emu_inc_hash_md4 emu_inc_hash_md5 emu_inc_hash_ripemd160 emu_inc_hash_sha1 emu_inc_hash_sha256 emu_inc_hash_sha384 emu_inc_hash_sha512 emu_inc_hash_streebog256 emu_inc_hash_streebog512
 EMU_OBJS_ALL            += emu_inc_cipher_aes emu_inc_cipher_camellia emu_inc_cipher_des emu_inc_cipher_kuznyechik emu_inc_cipher_serpent emu_inc_cipher_twofish
 
-OBJS_ALL                := affinity autotune benchmark bitmap bitops combinator common convert cpt cpu_crc32 debugfile dictstat dispatch dynloader event ext_ADL ext_nvapi ext_nvml ext_OpenCL ext_sysfs ext_lzma filehandling folder hashcat hashes hlfmt hwmon induct interface keyboard_layout locking logfile loopback memory monitor mpsp opencl outfile_check outfile pidfile potfile restore rp rp_cpu selftest slow_candidates shared status stdout straight terminal thread timer tuningdb usage user_options wordlist $(EMU_OBJS_ALL)
+OBJS_ALL                := affinity autotune backend benchmark bitmap bitops combinator common convert cpt cpu_crc32 debugfile dictstat dispatch dynloader event ext_ADL ext_cuda ext_nvapi ext_nvml ext_nvrtc ext_OpenCL ext_sysfs ext_lzma filehandling folder hashcat hashes hlfmt hwmon induct interface keyboard_layout locking logfile loopback memory monitor mpsp outfile_check outfile pidfile potfile restore rp rp_cpu selftest slow_candidates shared status stdout straight terminal thread timer tuningdb usage user_options wordlist $(EMU_OBJS_ALL)
 
 ifeq ($(ENABLE_BRAIN),1)
 OBJS_ALL                += brain
diff --git a/src/autotune.c b/src/autotune.c
index de54fd063..42f2b13de 100644
--- a/src/autotune.c
+++ b/src/autotune.c
@@ -6,7 +6,7 @@
 #include "common.h"
 #include "types.h"
 #include "event.h"
-#include "opencl.h"
+#include "backend.h"
 #include "status.h"
 #include "autotune.h"
 
@@ -47,14 +47,63 @@ static double try_run (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_par
   return exec_msec_prev;
 }
 
+/*
+static double try_run_preferred (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u32 kernel_accel, const u32 kernel_loops)
+{
+  hashconfig_t *hashconfig = hashcat_ctx->hashconfig;
+
+  device_param->kernel_params_buf32[28] = 0;
+  device_param->kernel_params_buf32[29] = kernel_loops; // not a bug, both need to be set
+  device_param->kernel_params_buf32[30] = kernel_loops; // because there's two variables for inner iters for slow and fast hashes
+
+  const u32 kernel_power_try = device_param->hardware_power * kernel_accel;
+
+  const u32 kernel_threads_sav = device_param->kernel_threads;
+
+  const double spin_damp_sav = device_param->spin_damp;
+
+  device_param->spin_damp = 0;
+
+  if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
+  {
+    if (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL)
+    {
+      device_param->kernel_threads = device_param->kernel_preferred_wgs_multiple1;
+
+      run_kernel (hashcat_ctx, device_param, KERN_RUN_1, kernel_power_try, true, 0);
+    }
+    else
+    {
+      device_param->kernel_threads = device_param->kernel_preferred_wgs_multiple4;
+
+      run_kernel (hashcat_ctx, device_param, KERN_RUN_4, kernel_power_try, true, 0);
+    }
+  }
+  else
+  {
+    device_param->kernel_threads = device_param->kernel_preferred_wgs_multiple2;
+
+    run_kernel (hashcat_ctx, device_param, KERN_RUN_2, kernel_power_try, true, 0);
+  }
+
+  device_param->kernel_threads = kernel_threads_sav;
+
+  device_param->spin_damp = spin_damp_sav;
+
+  const double exec_msec_prev = get_avg_exec_time (device_param, 1);
+
+  return exec_msec_prev;
+}
+*/
+
 static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param)
 {
   const hashconfig_t    *hashconfig   = hashcat_ctx->hashconfig;
-  const opencl_ctx_t    *opencl_ctx   = hashcat_ctx->opencl_ctx;
+  const backend_ctx_t   *backend_ctx  = hashcat_ctx->backend_ctx;
   const straight_ctx_t  *straight_ctx = hashcat_ctx->straight_ctx;
   const user_options_t  *user_options = hashcat_ctx->user_options;
 
-  const double target_msec = opencl_ctx->target_msec;
+  const double target_msec = backend_ctx->target_msec;
 
   const u32 kernel_accel_min = device_param->kernel_accel_min;
   const u32 kernel_accel_max = device_param->kernel_accel_max;
@@ -104,10 +153,21 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
   const u32 kernel_power_max = device_param->hardware_power * kernel_accel_max;
 
   int CL_rc;
+  int CU_rc;
 
-  CL_rc = run_kernel_atinit (hashcat_ctx, device_param, device_param->d_pws_buf, kernel_power_max);
+  if (device_param->is_cuda == true)
+  {
+    CU_rc = run_cuda_kernel_atinit (hashcat_ctx, device_param, device_param->cuda_d_pws_buf, kernel_power_max);
 
-  if (CL_rc == -1) return -1;
+    if (CU_rc == -1) return -1;
+  }
+
+  if (device_param->is_opencl == true)
+  {
+    CL_rc = run_opencl_kernel_atinit (hashcat_ctx, device_param, device_param->opencl_d_pws_buf, kernel_power_max);
+
+    if (CL_rc == -1) return -1;
+  }
 
   if (user_options->slow_candidates == true)
   {
@@ -118,9 +178,19 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
     {
       if (straight_ctx->kernel_rules_cnt > 1)
       {
-        CL_rc = hc_clEnqueueCopyBuffer (hashcat_ctx, device_param->command_queue, device_param->d_rules, device_param->d_rules_c, 0, 0, MIN (kernel_loops_max, KERNEL_RULES) * sizeof (kernel_rule_t), 0, NULL, NULL);
+        if (device_param->is_cuda == true)
+        {
+          CU_rc = hc_cuMemcpyDtoD (hashcat_ctx, device_param->cuda_d_rules_c, device_param->cuda_d_rules, MIN (kernel_loops_max, KERNEL_RULES) * sizeof (kernel_rule_t));
 
-        if (CL_rc == -1) return -1;
+          if (CU_rc == -1) return -1;
+        }
+
+        if (device_param->is_opencl == true)
+        {
+          CL_rc = hc_clEnqueueCopyBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_rules, device_param->opencl_d_rules_c, 0, 0, MIN (kernel_loops_max, KERNEL_RULES) * sizeof (kernel_rule_t), 0, NULL, NULL);
+
+          if (CL_rc == -1) return -1;
+        }
       }
     }
   }
@@ -131,15 +201,17 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
 
   if (1)
   {
-    const double exec_msec = try_run (hashcat_ctx, device_param, kernel_accel_min, kernel_loops_min);
+    double exec_msec = try_run (hashcat_ctx, device_param, kernel_accel_min, kernel_loops_min);
 
     if (exec_msec > 2000)
     {
-      event_log_error (hashcat_ctx, "OpenCL kernel minimum runtime larger than default TDR");
+      event_log_error (hashcat_ctx, "Kernel minimum runtime larger than default TDR");
 
       return -1;
     }
 
+    exec_msec = try_run (hashcat_ctx, device_param, kernel_accel_min, kernel_loops_min);
+
     const u32 mm = kernel_loops_max / kernel_loops_min;
 
     if ((exec_msec * mm) > target_msec)
@@ -193,6 +265,8 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
     const u32 kernel_accel_orig = kernel_accel;
     const u32 kernel_loops_orig = kernel_loops;
 
+    double exec_msec_prev = try_run (hashcat_ctx, device_param, kernel_accel, kernel_loops);
+
     for (int i = 1; i < STEPS_CNT; i++)
     {
       const u32 kernel_accel_try = kernel_accel_orig * (1u << i);
@@ -204,6 +278,16 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
       if (kernel_loops_try > kernel_loops_max) continue;
       if (kernel_loops_try < kernel_loops_min) break;
 
+      // do a real test
+
+      const double exec_msec = try_run (hashcat_ctx, device_param, kernel_accel_try, kernel_loops_try);
+
+      if (exec_msec_prev < exec_msec) break;
+
+      exec_msec_prev = exec_msec;
+
+      // so far, so good! save
+
       kernel_accel = kernel_accel_try;
       kernel_loops = kernel_loops_try;
 
@@ -228,25 +312,92 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
     kernel_accel *= exec_accel_min;
   }
 
-  // reset them fake words
+  // start finding best thread count is easier.
+  // it's either the preferred or the maximum thread count
 
-  CL_rc = run_kernel_memset (hashcat_ctx, device_param, device_param->d_pws_buf, 0, device_param->size_pws);
+  /*
+  const u32 kernel_threads_min = device_param->kernel_threads_min;
+  const u32 kernel_threads_max = device_param->kernel_threads_max;
 
-  if (CL_rc == -1) return -1;
+  if (kernel_threads_min < kernel_threads_max)
+  {
+    const double exec_msec_max = try_run (hashcat_ctx, device_param, kernel_accel, kernel_loops);
 
-  // reset other buffers in case autotune cracked something
+    u32 preferred_threads = 0;
 
-  CL_rc = run_kernel_memset (hashcat_ctx, device_param, device_param->d_plain_bufs, 0, device_param->size_plains);
+    if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
+    {
+      if (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL)
+      {
+        preferred_threads = device_param->kernel_preferred_wgs_multiple1;
+      }
+      else
+      {
+        preferred_threads = device_param->kernel_preferred_wgs_multiple4;
+      }
+    }
+    else
+    {
+      preferred_threads = device_param->kernel_preferred_wgs_multiple2;
+    }
 
-  if (CL_rc == -1) return -1;
+    if ((preferred_threads >= kernel_threads_min) && (preferred_threads <= kernel_threads_max))
+    {
+      const double exec_msec_preferred = try_run_preferred (hashcat_ctx, device_param, kernel_accel, kernel_loops);
 
-  CL_rc = run_kernel_memset (hashcat_ctx, device_param, device_param->d_digests_shown, 0, device_param->size_shown);
+      if (exec_msec_preferred < exec_msec_max)
+      {
+        device_param->kernel_threads = preferred_threads;
+      }
+    }
+  }
+  */
 
-  if (CL_rc == -1) return -1;
+  if (device_param->is_cuda == true)
+  {
+    // reset them fake words
 
-  CL_rc = run_kernel_memset (hashcat_ctx, device_param, device_param->d_result, 0, device_param->size_results);
+    CU_rc = run_cuda_kernel_memset (hashcat_ctx, device_param, device_param->cuda_d_pws_buf, 0, device_param->size_pws);
 
-  if (CL_rc == -1) return -1;
+    if (CU_rc == -1) return -1;
+
+    // reset other buffers in case autotune cracked something
+
+    CU_rc = run_cuda_kernel_memset (hashcat_ctx, device_param, device_param->cuda_d_plain_bufs, 0, device_param->size_plains);
+
+    if (CU_rc == -1) return -1;
+
+    CU_rc = run_cuda_kernel_memset (hashcat_ctx, device_param, device_param->cuda_d_digests_shown, 0, device_param->size_shown);
+
+    if (CU_rc == -1) return -1;
+
+    CU_rc = run_cuda_kernel_memset (hashcat_ctx, device_param, device_param->cuda_d_result, 0, device_param->size_results);
+
+    if (CU_rc == -1) return -1;
+  }
+
+  if (device_param->is_opencl == true)
+  {
+    // reset them fake words
+
+    CL_rc = run_opencl_kernel_memset (hashcat_ctx, device_param, device_param->opencl_d_pws_buf, 0, device_param->size_pws);
+
+    if (CL_rc == -1) return -1;
+
+    // reset other buffers in case autotune cracked something
+
+    CL_rc = run_opencl_kernel_memset (hashcat_ctx, device_param, device_param->opencl_d_plain_bufs, 0, device_param->size_plains);
+
+    if (CL_rc == -1) return -1;
+
+    CL_rc = run_opencl_kernel_memset (hashcat_ctx, device_param, device_param->opencl_d_digests_shown, 0, device_param->size_shown);
+
+    if (CL_rc == -1) return -1;
+
+    CL_rc = run_opencl_kernel_memset (hashcat_ctx, device_param, device_param->opencl_d_result, 0, device_param->size_results);
+
+    if (CL_rc == -1) return -1;
+  }
 
   // reset timer
 
@@ -283,16 +434,23 @@ HC_API_CALL void *thread_autotune (void *p)
 
   hashcat_ctx_t *hashcat_ctx = thread_param->hashcat_ctx;
 
-  opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
 
-  if (opencl_ctx->enabled == false) return NULL;
+  if (backend_ctx->enabled == false) return NULL;
 
-  hc_device_param_t *device_param = opencl_ctx->devices_param + thread_param->tid;
+  hc_device_param_t *device_param = backend_ctx->devices_param + thread_param->tid;
 
   if (device_param->skipped == true) return NULL;
 
   if (device_param->skipped_warning == true) return NULL;
 
+  if (device_param->is_cuda == true)
+  {
+    const int rc_cuCtxSetCurrent = hc_cuCtxSetCurrent (hashcat_ctx, device_param->cuda_context);
+
+    if (rc_cuCtxSetCurrent == -1) return NULL;
+  }
+
   const int rc_autotune = autotune (hashcat_ctx, device_param);
 
   if (rc_autotune == -1)
diff --git a/src/backend.c b/src/backend.c
new file mode 100644
index 000000000..5ad504460
--- /dev/null
+++ b/src/backend.c
@@ -0,0 +1,11153 @@
+/**
+ * Author......: See docs/credits.txt
+ * License.....: MIT
+ */
+
+#include "common.h"
+#include "types.h"
+#include "memory.h"
+#include "locking.h"
+#include "thread.h"
+#include "timer.h"
+#include "tuningdb.h"
+#include "rp.h"
+#include "rp_cpu.h"
+#include "mpsp.h"
+#include "convert.h"
+#include "stdout.h"
+#include "filehandling.h"
+#include "wordlist.h"
+#include "shared.h"
+#include "hashes.h"
+#include "emu_inc_hash_md5.h"
+#include "event.h"
+#include "dynloader.h"
+#include "backend.h"
+
+#if defined (__linux__)
+static const char *dri_card0_path = "/dev/dri/card0";
+
+static const char *drm_card0_vendor_path = "/sys/class/drm/card0/device/vendor";
+static const char *drm_card0_driver_path = "/sys/class/drm/card0/device/driver";
+#endif
+
+static const u32 full01 = 0x01010101;
+static const u32 full06 = 0x06060606;
+static const u32 full80 = 0x80808080;
+
+static double TARGET_MSEC_PROFILE[4] = { 2, 12, 96, 480 };
+
+static bool is_same_device (const hc_device_param_t *src, const hc_device_param_t *dst)
+{
+  if (src->pcie_bus      != dst->pcie_bus)      return false;
+  if (src->pcie_device   != dst->pcie_device)   return false;
+  if (src->pcie_function != dst->pcie_function) return false;
+
+  return true;
+}
+
+static int backend_ctx_find_alias_devices (hashcat_ctx_t *hashcat_ctx)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  for (int backend_devices_cnt_src = 0; backend_devices_cnt_src < backend_ctx->backend_devices_cnt; backend_devices_cnt_src++)
+  {
+    hc_device_param_t *device_param_src = &backend_ctx->devices_param[backend_devices_cnt_src];
+
+    if (device_param_src->skipped == true) continue;
+
+    if (device_param_src->skipped_warning == true) continue;
+
+    for (int backend_devices_cnt_dst = backend_devices_cnt_src + 1; backend_devices_cnt_dst < backend_ctx->backend_devices_cnt; backend_devices_cnt_dst++)
+    {
+      hc_device_param_t *device_param_dst = &backend_ctx->devices_param[backend_devices_cnt_dst];
+
+      if (device_param_dst->skipped == true) continue;
+
+      if (device_param_dst->skipped_warning == true) continue;
+
+      if (is_same_device (device_param_src, device_param_dst) == false) continue;
+
+      device_param_src->device_id_alias_buf[device_param_src->device_id_alias_cnt] = device_param_dst->device_id;
+      device_param_src->device_id_alias_cnt++;
+
+      device_param_dst->device_id_alias_buf[device_param_dst->device_id_alias_cnt] = device_param_src->device_id;
+      device_param_dst->device_id_alias_cnt++;
+
+      if (device_param_dst->is_opencl == true)
+      {
+        if (device_param_dst->skipped == false)
+        {
+          device_param_dst->skipped = true;
+
+          backend_ctx->opencl_devices_active--;
+
+          backend_ctx->backend_devices_active--;
+        }
+      }
+    }
+  }
+
+  return -1;
+}
+
+static bool is_same_device_type (const hc_device_param_t *src, const hc_device_param_t *dst)
+{
+  if (strcmp (src->device_name, dst->device_name) != 0) return false;
+
+  if (src->is_cuda   != dst->is_cuda)   return false;
+  if (src->is_opencl != dst->is_opencl) return false;
+
+  if (src->is_cuda == true)
+  {
+    if (strcmp (src->opencl_device_vendor,  dst->opencl_device_vendor)  != 0) return false;
+    if (strcmp (src->opencl_device_version, dst->opencl_device_version) != 0) return false;
+    if (strcmp (src->opencl_driver_version, dst->opencl_driver_version) != 0) return false;
+  }
+
+  if (src->device_processors         != dst->device_processors)         return false;
+  if (src->device_maxclock_frequency != dst->device_maxclock_frequency) return false;
+  if (src->device_maxworkgroup_size  != dst->device_maxworkgroup_size)  return false;
+
+  // memory size can be different, depending on which gpu has a monitor connected
+  // if (src->device_maxmem_alloc       != dst->device_maxmem_alloc)       return false;
+  // if (src->device_global_mem         != dst->device_global_mem)         return false;
+
+  if (src->sm_major != dst->sm_major) return false;
+  if (src->sm_minor != dst->sm_minor) return false;
+
+  if (src->kernel_exec_timeout != dst->kernel_exec_timeout) return false;
+
+  return true;
+}
+
+static int ocl_check_dri (MAYBE_UNUSED hashcat_ctx_t *hashcat_ctx)
+{
+  #if defined (__linux__)
+
+  // This check makes sense only if we're not root
+
+  const uid_t uid = getuid ();
+
+  if (uid == 0) return 0;
+
+  // No GPU available! That's fine, so we don't need to check if we have access to it.
+
+  if (hc_path_exist (dri_card0_path) == false) return 0;
+
+  // Now we need to check if this an AMD vendor, because this is when the problems start
+
+  FILE *fd_drm = fopen (drm_card0_vendor_path, "rb");
+
+  if (fd_drm == NULL) return 0;
+
+  u32 vendor = 0;
+
+  if (fscanf (fd_drm, "0x%x", &vendor) != 1)
+  {
+    fclose (fd_drm);
+
+    return 0;
+  }
+
+  fclose (fd_drm);
+
+  if (vendor != 4098) return 0;
+
+  // Now the problem is only with AMDGPU-PRO, not with oldschool AMD driver
+
+  char buf[HCBUFSIZ_TINY];
+
+  const ssize_t len = readlink (drm_card0_driver_path, buf, HCBUFSIZ_TINY - 1);
+
+  if (len == -1) return 0;
+
+  buf[len] = 0;
+
+  if (strstr (buf, "amdgpu") == NULL) return 0;
+
+  // Now do the real check
+
+  FILE *fd_dri = fopen (dri_card0_path, "rb");
+
+  if (fd_dri == NULL)
+  {
+    event_log_error (hashcat_ctx, "Cannot access %s: %m.", dri_card0_path);
+
+    event_log_warning (hashcat_ctx, "This causes some drivers to crash when OpenCL is used!");
+    event_log_warning (hashcat_ctx, "Adding your user to the \"video\" group usually fixes this problem:");
+    event_log_warning (hashcat_ctx, "$ sudo usermod -a -G video $LOGNAME");
+    event_log_warning (hashcat_ctx, NULL);
+
+    return -1;
+  }
+
+  fclose (fd_dri);
+
+  #endif // __linux__
+
+  return 0;
+}
+
+static bool setup_backend_devices_filter (hashcat_ctx_t *hashcat_ctx, const char *backend_devices, u64 *out)
+{
+  u64 backend_devices_filter = 0;
+
+  if (backend_devices)
+  {
+    char *devices = hcstrdup (backend_devices);
+
+    if (devices == NULL) return false;
+
+    char *saveptr = NULL;
+
+    char *next = strtok_r (devices, ",", &saveptr);
+
+    do
+    {
+      const int backend_device_id = (const int) strtol (next, NULL, 10);
+
+      if ((backend_device_id <= 0) || (backend_device_id >= 64))
+      {
+        event_log_error (hashcat_ctx, "Invalid device_id %d specified.", backend_device_id);
+
+        hcfree (devices);
+
+        return false;
+      }
+
+      backend_devices_filter |= 1ULL << (backend_device_id - 1);
+
+    } while ((next = strtok_r ((char *) NULL, ",", &saveptr)) != NULL);
+
+    hcfree (devices);
+  }
+  else
+  {
+    backend_devices_filter = -1ULL;
+  }
+
+  *out = backend_devices_filter;
+
+  return true;
+}
+
+static bool setup_opencl_device_types_filter (hashcat_ctx_t *hashcat_ctx, const char *opencl_device_types, cl_device_type *out)
+{
+  cl_device_type opencl_device_types_filter = 0;
+
+  if (opencl_device_types)
+  {
+    char *device_types = hcstrdup (opencl_device_types);
+
+    if (device_types == NULL) return false;
+
+    char *saveptr = NULL;
+
+    char *next = strtok_r (device_types, ",", &saveptr);
+
+    do
+    {
+      const int device_type = (const int) strtol (next, NULL, 10);
+
+      if (device_type < 1 || device_type > 3)
+      {
+        event_log_error (hashcat_ctx, "Invalid OpenCL device-type %d specified.", device_type);
+
+        hcfree (device_types);
+
+        return false;
+      }
+
+      opencl_device_types_filter |= 1u << device_type;
+
+    } while ((next = strtok_r (NULL, ",", &saveptr)) != NULL);
+
+    hcfree (device_types);
+  }
+  else
+  {
+    // Do not use CPU by default, this often reduces GPU performance because
+    // the CPU is too busy to handle GPU synchronization
+
+    opencl_device_types_filter = CL_DEVICE_TYPE_ALL & ~CL_DEVICE_TYPE_CPU;
+  }
+
+  *out = opencl_device_types_filter;
+
+  return true;
+}
+
+static bool cuda_test_instruction (hashcat_ctx_t *hashcat_ctx, const int sm_major, const int sm_minor, const char *kernel_buf)
+{
+  nvrtcProgram program;
+
+  const int rc_nvrtcCreateProgram = hc_nvrtcCreateProgram (hashcat_ctx, &program, kernel_buf, "test_instruction", 0, NULL, NULL);
+
+  if (rc_nvrtcCreateProgram == -1) return false;
+
+  char *nvrtc_options[3];
+
+  nvrtc_options[0] = "--gpu-architecture";
+
+  hc_asprintf (&nvrtc_options[1], "compute_%d%d", sm_major, sm_minor);
+
+  nvrtc_options[2] = NULL;
+
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  NVRTC_PTR *nvrtc = backend_ctx->nvrtc;
+
+  const nvrtcResult NVRTC_err = nvrtc->nvrtcCompileProgram (program, 2, (const char * const *) nvrtc_options);
+
+  hcfree (nvrtc_options[1]);
+
+  size_t build_log_size = 0;
+
+  hc_nvrtcGetProgramLogSize (hashcat_ctx, program, &build_log_size);
+
+  if (NVRTC_err != NVRTC_SUCCESS)
+  {
+    char *build_log = (char *) hcmalloc (build_log_size + 1);
+
+    const int rc_nvrtcGetProgramLog = hc_nvrtcGetProgramLog (hashcat_ctx, program, build_log);
+
+    if (rc_nvrtcGetProgramLog == -1) return false;
+
+    puts (build_log);
+
+    hcfree (build_log);
+
+    hc_nvrtcDestroyProgram (hashcat_ctx, &program);
+
+    return false;
+  }
+
+  size_t binary_size;
+
+  const int rc_nvrtcGetPTXSize = hc_nvrtcGetPTXSize (hashcat_ctx, program, &binary_size);
+
+  if (rc_nvrtcGetPTXSize == -1) return false;
+
+  char *binary = (char *) hcmalloc (binary_size);
+
+  const int nvrtcGetPTX = hc_nvrtcGetPTX (hashcat_ctx, program, binary);
+
+  if (nvrtcGetPTX == -1)
+  {
+    hcfree (binary);
+
+    return false;
+  }
+
+  CUDA_PTR *cuda = backend_ctx->cuda;
+
+  CUmodule cuda_module;
+
+  const CUresult CU_err = cuda->cuModuleLoadDataEx (&cuda_module, binary, 0, NULL, NULL);
+
+  if (CU_err != CUDA_SUCCESS)
+  {
+    hcfree (binary);
+
+    return false;
+  }
+
+  hcfree (binary);
+
+  const int rc_cuModuleUnload = hc_cuModuleUnload (hashcat_ctx, cuda_module);
+
+  if (rc_cuModuleUnload == -1) return false;
+
+  const int rc_nvrtcDestroyProgram = hc_nvrtcDestroyProgram (hashcat_ctx, &program);
+
+  if (rc_nvrtcDestroyProgram == -1) return false;
+
+  return true;
+}
+
+static bool opencl_test_instruction (hashcat_ctx_t *hashcat_ctx, cl_context context, cl_device_id device, const char *kernel_buf)
+{
+  int CL_rc;
+
+  cl_program program;
+
+  CL_rc = hc_clCreateProgramWithSource (hashcat_ctx, context, 1, &kernel_buf, NULL, &program);
+
+  if (CL_rc == -1) return false;
+
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  OCL_PTR *ocl = backend_ctx->ocl;
+
+  // LLVM seems to write an error message (if there's an error) directly to stderr
+  // and not (as supposted to) into buffer for later request using clGetProgramBuildInfo()
+
+  #ifndef DEBUG
+  #ifndef _WIN
+  fflush (stderr);
+  int bak = dup (2);
+  int tmp = open ("/dev/null", O_WRONLY);
+  dup2 (tmp, 2);
+  close (tmp);
+  #endif
+  #endif
+
+  CL_rc = ocl->clBuildProgram (program, 1, &device, "-Werror", NULL, NULL); // do not use the wrapper to avoid the error message
+
+  #ifndef DEBUG
+  #ifndef _WIN
+  fflush (stderr);
+  dup2 (bak, 2);
+  close (bak);
+  #endif
+  #endif
+
+  if (CL_rc != CL_SUCCESS)
+  {
+    #if defined (DEBUG)
+
+    event_log_error (hashcat_ctx, "clBuildProgram(): %s", val2cstr_cl (CL_rc));
+
+    size_t build_log_size = 0;
+
+    hc_clGetProgramBuildInfo (hashcat_ctx, program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &build_log_size);
+
+    char *build_log = (char *) hcmalloc (build_log_size + 1);
+
+    hc_clGetProgramBuildInfo (hashcat_ctx, program, device, CL_PROGRAM_BUILD_LOG, build_log_size, build_log, NULL);
+
+    build_log[build_log_size] = 0;
+
+    puts (build_log);
+
+    hcfree (build_log);
+
+    #endif
+
+    hc_clReleaseProgram (hashcat_ctx, program);
+
+    return false;
+  }
+
+  CL_rc = hc_clReleaseProgram (hashcat_ctx, program);
+
+  if (CL_rc == -1) return false;
+
+  return true;
+}
+
+static bool read_kernel_binary (hashcat_ctx_t *hashcat_ctx, const char *kernel_file, size_t *kernel_lengths, char **kernel_sources, const bool force_recompile)
+{
+  FILE *fp = fopen (kernel_file, "rb");
+
+  if (fp != NULL)
+  {
+    struct stat st;
+
+    if (stat (kernel_file, &st))
+    {
+      fclose (fp);
+
+      return false;
+    }
+
+    #define EXTRASZ 100
+
+    char *buf = (char *) hcmalloc (st.st_size + 1 + EXTRASZ);
+
+    size_t num_read = hc_fread (buf, sizeof (char), st.st_size, fp);
+
+    fclose (fp);
+
+    if (num_read != (size_t) st.st_size)
+    {
+      event_log_error (hashcat_ctx, "%s: %s", kernel_file, strerror (errno));
+
+      hcfree (buf);
+
+      return false;
+    }
+
+    buf[st.st_size] = 0;
+
+    if (force_recompile == true)
+    {
+      // this adds some hopefully unique data to the backend kernel source
+      // the effect should be that backend kernel compiler caching see this as new "uncached" source
+      // we have to do this since they do not check for the changes only in the #include source
+
+      time_t tlog = time (NULL);
+
+      const int extra_len = snprintf (buf + st.st_size, EXTRASZ, "\n//%u\n", (u32) tlog);
+
+      st.st_size += extra_len;
+    }
+
+    kernel_lengths[0] = (size_t) st.st_size;
+
+    kernel_sources[0] = buf;
+  }
+  else
+  {
+    event_log_error (hashcat_ctx, "%s: %s", kernel_file, strerror (errno));
+
+    return false;
+  }
+
+  return true;
+}
+
+static bool write_kernel_binary (hashcat_ctx_t *hashcat_ctx, char *kernel_file, char *binary, size_t binary_size)
+{
+  if (binary_size > 0)
+  {
+    FILE *fp = fopen (kernel_file, "wb");
+
+    if (fp == NULL)
+    {
+      event_log_error (hashcat_ctx, "%s: %s", kernel_file, strerror (errno));
+
+      return false;
+    }
+
+    if (lock_file (fp) == -1)
+    {
+      fclose (fp);
+
+      event_log_error (hashcat_ctx, "%s: %s", kernel_file, strerror (errno));
+
+      return false;
+    }
+
+    hc_fwrite (binary, sizeof (char), binary_size, fp);
+
+    fflush (fp);
+
+    fclose (fp);
+  }
+
+  return true;
+}
+
+void generate_source_kernel_filename (const bool slow_candidates, const u32 attack_exec, const u32 attack_kern, const u32 kern_type, const u32 opti_type, char *shared_dir, char *source_file)
+{
+  if (opti_type & OPTI_TYPE_OPTIMIZED_KERNEL)
+  {
+    if (attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
+    {
+      if (slow_candidates == true)
+      {
+        snprintf (source_file, 255, "%s/OpenCL/m%05d_a0-optimized.cl", shared_dir, (int) kern_type);
+      }
+      else
+      {
+        if (attack_kern == ATTACK_KERN_STRAIGHT)
+          snprintf (source_file, 255, "%s/OpenCL/m%05d_a0-optimized.cl", shared_dir, (int) kern_type);
+        else if (attack_kern == ATTACK_KERN_COMBI)
+          snprintf (source_file, 255, "%s/OpenCL/m%05d_a1-optimized.cl", shared_dir, (int) kern_type);
+        else if (attack_kern == ATTACK_KERN_BF)
+          snprintf (source_file, 255, "%s/OpenCL/m%05d_a3-optimized.cl", shared_dir, (int) kern_type);
+        else if (attack_kern == ATTACK_KERN_NONE)
+          snprintf (source_file, 255, "%s/OpenCL/m%05d_a0-optimized.cl", shared_dir, (int) kern_type);
+      }
+    }
+    else
+    {
+      snprintf (source_file, 255, "%s/OpenCL/m%05d-optimized.cl", shared_dir, (int) kern_type);
+    }
+  }
+  else
+  {
+    if (attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
+    {
+      if (slow_candidates == true)
+      {
+        snprintf (source_file, 255, "%s/OpenCL/m%05d_a0-pure.cl", shared_dir, (int) kern_type);
+      }
+      else
+      {
+        if (attack_kern == ATTACK_KERN_STRAIGHT)
+          snprintf (source_file, 255, "%s/OpenCL/m%05d_a0-pure.cl", shared_dir, (int) kern_type);
+        else if (attack_kern == ATTACK_KERN_COMBI)
+          snprintf (source_file, 255, "%s/OpenCL/m%05d_a1-pure.cl", shared_dir, (int) kern_type);
+        else if (attack_kern == ATTACK_KERN_BF)
+          snprintf (source_file, 255, "%s/OpenCL/m%05d_a3-pure.cl", shared_dir, (int) kern_type);
+        else if (attack_kern == ATTACK_KERN_NONE)
+          snprintf (source_file, 255, "%s/OpenCL/m%05d_a0-pure.cl", shared_dir, (int) kern_type);
+      }
+    }
+    else
+    {
+      snprintf (source_file, 255, "%s/OpenCL/m%05d-pure.cl", shared_dir, (int) kern_type);
+    }
+  }
+}
+
+void generate_cached_kernel_filename (const bool slow_candidates, const u32 attack_exec, const u32 attack_kern, const u32 kern_type, const u32 opti_type, char *profile_dir, const char *device_name_chksum, char *cached_file)
+{
+  if (opti_type & OPTI_TYPE_OPTIMIZED_KERNEL)
+  {
+    if (attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
+    {
+      if (slow_candidates == true)
+      {
+        snprintf (cached_file, 255, "%s/kernels/m%05d_a0-optimized.%s.kernel", profile_dir, (int) kern_type, device_name_chksum);
+      }
+      else
+      {
+        if (attack_kern == ATTACK_KERN_STRAIGHT)
+          snprintf (cached_file, 255, "%s/kernels/m%05d_a0-optimized.%s.kernel", profile_dir, (int) kern_type, device_name_chksum);
+        else if (attack_kern == ATTACK_KERN_COMBI)
+          snprintf (cached_file, 255, "%s/kernels/m%05d_a1-optimized.%s.kernel", profile_dir, (int) kern_type, device_name_chksum);
+        else if (attack_kern == ATTACK_KERN_BF)
+          snprintf (cached_file, 255, "%s/kernels/m%05d_a3-optimized.%s.kernel", profile_dir, (int) kern_type, device_name_chksum);
+        else if (attack_kern == ATTACK_KERN_NONE)
+          snprintf (cached_file, 255, "%s/kernels/m%05d_a0-optimized.%s.kernel", profile_dir, (int) kern_type, device_name_chksum);
+      }
+    }
+    else
+    {
+      snprintf (cached_file, 255, "%s/kernels/m%05d-optimized.%s.kernel", profile_dir, (int) kern_type, device_name_chksum);
+    }
+  }
+  else
+  {
+    if (attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
+    {
+      if (slow_candidates == true)
+      {
+        snprintf (cached_file, 255, "%s/kernels/m%05d_a0-pure.%s.kernel", profile_dir, (int) kern_type, device_name_chksum);
+      }
+      else
+      {
+        if (attack_kern == ATTACK_KERN_STRAIGHT)
+          snprintf (cached_file, 255, "%s/kernels/m%05d_a0-pure.%s.kernel", profile_dir, (int) kern_type, device_name_chksum);
+        else if (attack_kern == ATTACK_KERN_COMBI)
+          snprintf (cached_file, 255, "%s/kernels/m%05d_a1-pure.%s.kernel", profile_dir, (int) kern_type, device_name_chksum);
+        else if (attack_kern == ATTACK_KERN_BF)
+          snprintf (cached_file, 255, "%s/kernels/m%05d_a3-pure.%s.kernel", profile_dir, (int) kern_type, device_name_chksum);
+        else if (attack_kern == ATTACK_KERN_NONE)
+          snprintf (cached_file, 255, "%s/kernels/m%05d_a0-pure.%s.kernel", profile_dir, (int) kern_type, device_name_chksum);
+      }
+    }
+    else
+    {
+      snprintf (cached_file, 255, "%s/kernels/m%05d-pure.%s.kernel", profile_dir, (int) kern_type, device_name_chksum);
+    }
+  }
+}
+
+void generate_source_kernel_mp_filename (const u32 opti_type, const u64 opts_type, char *shared_dir, char *source_file)
+{
+  if ((opti_type & OPTI_TYPE_BRUTE_FORCE) && (opts_type & OPTS_TYPE_PT_GENERATE_BE))
+  {
+    snprintf (source_file, 255, "%s/OpenCL/markov_be.cl", shared_dir);
+  }
+  else
+  {
+    snprintf (source_file, 255, "%s/OpenCL/markov_le.cl", shared_dir);
+  }
+}
+
+void generate_cached_kernel_mp_filename (const u32 opti_type, const u64 opts_type, char *profile_dir, const char *device_name_chksum_amp_mp, char *cached_file)
+{
+  if ((opti_type & OPTI_TYPE_BRUTE_FORCE) && (opts_type & OPTS_TYPE_PT_GENERATE_BE))
+  {
+    snprintf (cached_file, 255, "%s/kernels/markov_be.%s.kernel", profile_dir, device_name_chksum_amp_mp);
+  }
+  else
+  {
+    snprintf (cached_file, 255, "%s/kernels/markov_le.%s.kernel", profile_dir, device_name_chksum_amp_mp);
+  }
+}
+
+void generate_source_kernel_amp_filename (const u32 attack_kern, char *shared_dir, char *source_file)
+{
+  snprintf (source_file, 255, "%s/OpenCL/amp_a%u.cl", shared_dir, attack_kern);
+}
+
+void generate_cached_kernel_amp_filename (const u32 attack_kern, char *profile_dir, const char *device_name_chksum_amp_mp, char *cached_file)
+{
+  snprintf (cached_file, 255, "%s/kernels/amp_a%u.%s.kernel", profile_dir, attack_kern, device_name_chksum_amp_mp);
+}
+
+// NVRTC
+
+int nvrtc_init (hashcat_ctx_t *hashcat_ctx)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  NVRTC_PTR *nvrtc = backend_ctx->nvrtc;
+
+  memset (nvrtc, 0, sizeof (NVRTC_PTR));
+
+  #if   defined (_WIN)
+  nvrtc->lib = hc_dlopen ("nvrtc.dll");
+
+  if (nvrtc->lib == NULL)
+  {
+    // super annoying: nvidia is using the CUDA version in nvrtc???.dll filename!
+    // however, the cuda version string comes from nvcuda.dll which is from nvidia driver, but
+    // the driver version and the installed CUDA toolkit version can be different, so it cannot be used as a reference.
+    // brute force to the rescue
+
+    char dllname[100];
+
+    for (int major = 20; major >= 3; major--) // older than 3.x do not ship _v2 functions anyway
+    {
+      for (int minor = 20; minor >= 0; minor--)
+      {
+        snprintf (dllname, sizeof (dllname), "nvrtc64_%d%d.dll", major, minor);
+
+        nvrtc->lib = hc_dlopen (dllname);
+
+        if (nvrtc->lib) break;
+
+        snprintf (dllname, sizeof (dllname), "nvrtc64_%d%d_0.dll", major, minor);
+
+        nvrtc->lib = hc_dlopen (dllname);
+
+        if (nvrtc->lib) break;
+      }
+
+      if (nvrtc->lib) break;
+    }
+  }
+  #elif defined (__APPLE__)
+  nvrtc->lib = hc_dlopen ("nvrtc.dylib");
+  #elif defined (__CYGWIN__)
+  nvrtc->lib = hc_dlopen ("nvrtc.dll");
+  #else
+  nvrtc->lib = hc_dlopen ("libnvrtc.so");
+
+  if (nvrtc->lib == NULL) nvrtc->lib = hc_dlopen ("libnvrtc.so.1");
+  #endif
+
+  if (nvrtc->lib == NULL) return -1;
+
+  HC_LOAD_FUNC (nvrtc, nvrtcAddNameExpression,  NVRTC_NVRTCADDNAMEEXPRESSION, NVRTC, 1);
+  HC_LOAD_FUNC (nvrtc, nvrtcCompileProgram,     NVRTC_NVRTCCOMPILEPROGRAM,    NVRTC, 1);
+  HC_LOAD_FUNC (nvrtc, nvrtcCreateProgram,      NVRTC_NVRTCCREATEPROGRAM,     NVRTC, 1);
+  HC_LOAD_FUNC (nvrtc, nvrtcDestroyProgram,     NVRTC_NVRTCDESTROYPROGRAM,    NVRTC, 1);
+  HC_LOAD_FUNC (nvrtc, nvrtcGetLoweredName,     NVRTC_NVRTCGETLOWEREDNAME,    NVRTC, 1);
+  HC_LOAD_FUNC (nvrtc, nvrtcGetPTX,             NVRTC_NVRTCGETPTX,            NVRTC, 1);
+  HC_LOAD_FUNC (nvrtc, nvrtcGetPTXSize,         NVRTC_NVRTCGETPTXSIZE,        NVRTC, 1);
+  HC_LOAD_FUNC (nvrtc, nvrtcGetProgramLog,      NVRTC_NVRTCGETPROGRAMLOG,     NVRTC, 1);
+  HC_LOAD_FUNC (nvrtc, nvrtcGetProgramLogSize,  NVRTC_NVRTCGETPROGRAMLOGSIZE, NVRTC, 1);
+  HC_LOAD_FUNC (nvrtc, nvrtcGetErrorString,     NVRTC_NVRTCGETERRORSTRING,    NVRTC, 1);
+
+  return 0;
+}
+
+void nvrtc_close (hashcat_ctx_t *hashcat_ctx)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  NVRTC_PTR *nvrtc = backend_ctx->nvrtc;
+
+  if (nvrtc)
+  {
+    if (nvrtc->lib)
+    {
+      hc_dlclose (nvrtc->lib);
+    }
+
+    hcfree (backend_ctx->nvrtc);
+
+    backend_ctx->nvrtc = NULL;
+  }
+}
+
+int hc_nvrtcCreateProgram (hashcat_ctx_t *hashcat_ctx, nvrtcProgram *prog, const char *src, const char *name, int numHeaders, const char * const *headers, const char * const *includeNames)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  NVRTC_PTR *nvrtc = backend_ctx->nvrtc;
+
+  const nvrtcResult NVRTC_err = nvrtc->nvrtcCreateProgram (prog, src, name, numHeaders, headers, includeNames);
+
+  if (NVRTC_err != NVRTC_SUCCESS)
+  {
+    event_log_error (hashcat_ctx, "nvrtcCreateProgram(): %s", nvrtc->nvrtcGetErrorString (NVRTC_err));
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_nvrtcDestroyProgram (hashcat_ctx_t *hashcat_ctx, nvrtcProgram *prog)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  NVRTC_PTR *nvrtc = backend_ctx->nvrtc;
+
+  const nvrtcResult NVRTC_err = nvrtc->nvrtcDestroyProgram (prog);
+
+  if (NVRTC_err != NVRTC_SUCCESS)
+  {
+    event_log_error (hashcat_ctx, "nvrtcDestroyProgram(): %s", nvrtc->nvrtcGetErrorString (NVRTC_err));
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_nvrtcCompileProgram (hashcat_ctx_t *hashcat_ctx, nvrtcProgram prog, int numOptions, const char * const *options)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  NVRTC_PTR *nvrtc = backend_ctx->nvrtc;
+
+  const nvrtcResult NVRTC_err = nvrtc->nvrtcCompileProgram (prog, numOptions, options);
+
+  if (NVRTC_err != NVRTC_SUCCESS)
+  {
+    event_log_error (hashcat_ctx, "nvrtcCompileProgram(): %s", nvrtc->nvrtcGetErrorString (NVRTC_err));
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_nvrtcGetProgramLogSize (hashcat_ctx_t *hashcat_ctx, nvrtcProgram prog, size_t *logSizeRet)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  NVRTC_PTR *nvrtc = backend_ctx->nvrtc;
+
+  const nvrtcResult NVRTC_err = nvrtc->nvrtcGetProgramLogSize (prog, logSizeRet);
+
+  if (NVRTC_err != NVRTC_SUCCESS)
+  {
+    event_log_error (hashcat_ctx, "nvrtcGetProgramLogSize(): %s", nvrtc->nvrtcGetErrorString (NVRTC_err));
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_nvrtcGetProgramLog (hashcat_ctx_t *hashcat_ctx, nvrtcProgram prog, char *log)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  NVRTC_PTR *nvrtc = backend_ctx->nvrtc;
+
+  const nvrtcResult NVRTC_err = nvrtc->nvrtcGetProgramLog (prog, log);
+
+  if (NVRTC_err != NVRTC_SUCCESS)
+  {
+    event_log_error (hashcat_ctx, "nvrtcGetProgramLog(): %s", nvrtc->nvrtcGetErrorString (NVRTC_err));
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_nvrtcGetPTXSize (hashcat_ctx_t *hashcat_ctx, nvrtcProgram prog, size_t *ptxSizeRet)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  NVRTC_PTR *nvrtc = backend_ctx->nvrtc;
+
+  const nvrtcResult NVRTC_err = nvrtc->nvrtcGetPTXSize (prog, ptxSizeRet);
+
+  if (NVRTC_err != NVRTC_SUCCESS)
+  {
+    event_log_error (hashcat_ctx, "nvrtcGetPTXSize(): %s", nvrtc->nvrtcGetErrorString (NVRTC_err));
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_nvrtcGetPTX (hashcat_ctx_t *hashcat_ctx, nvrtcProgram prog, char *ptx)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  NVRTC_PTR *nvrtc = backend_ctx->nvrtc;
+
+  const nvrtcResult NVRTC_err = nvrtc->nvrtcGetPTX (prog, ptx);
+
+  if (NVRTC_err != NVRTC_SUCCESS)
+  {
+    event_log_error (hashcat_ctx, "nvrtcGetPTX(): %s", nvrtc->nvrtcGetErrorString (NVRTC_err));
+
+    return -1;
+  }
+
+  return 0;
+}
+
+// CUDA
+
+int cuda_init (hashcat_ctx_t *hashcat_ctx)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  CUDA_PTR *cuda = backend_ctx->cuda;
+
+  memset (cuda, 0, sizeof (CUDA_PTR));
+
+  #if   defined (_WIN)
+  cuda->lib = hc_dlopen ("nvcuda.dll");
+  #elif defined (__APPLE__)
+  cuda->lib = hc_dlopen ("nvcuda.dylib");
+  #elif defined (__CYGWIN__)
+  cuda->lib = hc_dlopen ("nvcuda.dll");
+  #else
+  cuda->lib = hc_dlopen ("libcuda.so");
+
+  if (cuda->lib == NULL) cuda->lib = hc_dlopen ("libcuda.so.1");
+  #endif
+
+  if (cuda->lib == NULL) return -1;
+
+  #define HC_LOAD_FUNC_CUDA(ptr,name,cudaname,type,libname,noerr) \
+    ptr->name = (type) hc_dlsym (ptr->lib, #cudaname); \
+    if (noerr != -1) { \
+      if (!ptr->name) { \
+        if (noerr == 1) { \
+          event_log_error (hashcat_ctx, "%s is missing from %s shared library.", #name, #libname); \
+          return -1; \
+        } \
+        if (noerr != 1) { \
+          event_log_warning (hashcat_ctx, "%s is missing from %s shared library.", #name, #libname); \
+          return 0; \
+        } \
+      } \
+    }
+
+  // finding the right symbol is a PITA, because of the _v2 suffix
+  // a good reference is cuda.h itself
+  // this needs to be verified for each new cuda release
+
+  HC_LOAD_FUNC_CUDA (cuda, cuCtxCreate,              cuCtxCreate_v2,            CUDA_CUCTXCREATE,               CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuCtxDestroy,             cuCtxDestroy_v2,           CUDA_CUCTXDESTROY,              CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuCtxGetCacheConfig,      cuCtxGetCacheConfig,       CUDA_CUCTXGETCACHECONFIG,       CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuCtxGetCurrent,          cuCtxGetCurrent,           CUDA_CUCTXGETCURRENT,           CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuCtxGetSharedMemConfig,  cuCtxGetSharedMemConfig,   CUDA_CUCTXGETSHAREDMEMCONFIG,   CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuCtxPopCurrent,          cuCtxPopCurrent_v2,        CUDA_CUCTXPOPCURRENT,           CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuCtxPushCurrent,         cuCtxPushCurrent_v2,       CUDA_CUCTXPUSHCURRENT,          CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuCtxSetCacheConfig,      cuCtxSetCacheConfig,       CUDA_CUCTXSETCACHECONFIG,       CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuCtxSetCurrent,          cuCtxSetCurrent,           CUDA_CUCTXSETCURRENT,           CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuCtxSetSharedMemConfig,  cuCtxSetSharedMemConfig,   CUDA_CUCTXSETSHAREDMEMCONFIG,   CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuCtxSynchronize,         cuCtxSynchronize,          CUDA_CUCTXSYNCHRONIZE,          CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuDeviceGetAttribute,     cuDeviceGetAttribute,      CUDA_CUDEVICEGETATTRIBUTE,      CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuDeviceGetCount,         cuDeviceGetCount,          CUDA_CUDEVICEGETCOUNT,          CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuDeviceGet,              cuDeviceGet,               CUDA_CUDEVICEGET,               CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuDeviceGetName,          cuDeviceGetName,           CUDA_CUDEVICEGETNAME,           CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuDeviceTotalMem,         cuDeviceTotalMem_v2,       CUDA_CUDEVICETOTALMEM,          CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuDriverGetVersion,       cuDriverGetVersion,        CUDA_CUDRIVERGETVERSION,        CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuEventCreate,            cuEventCreate,             CUDA_CUEVENTCREATE,             CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuEventDestroy,           cuEventDestroy_v2,         CUDA_CUEVENTDESTROY,            CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuEventElapsedTime,       cuEventElapsedTime,        CUDA_CUEVENTELAPSEDTIME,        CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuEventQuery,             cuEventQuery,              CUDA_CUEVENTQUERY,              CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuEventRecord,            cuEventRecord,             CUDA_CUEVENTRECORD,             CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuEventSynchronize,       cuEventSynchronize,        CUDA_CUEVENTSYNCHRONIZE,        CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuFuncGetAttribute,       cuFuncGetAttribute,        CUDA_CUFUNCGETATTRIBUTE,        CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuFuncSetAttribute,       cuFuncSetAttribute,        CUDA_CUFUNCSETATTRIBUTE,        CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuFuncSetCacheConfig,     cuFuncSetCacheConfig,      CUDA_CUFUNCSETCACHECONFIG,      CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuFuncSetSharedMemConfig, cuFuncSetSharedMemConfig,  CUDA_CUFUNCSETSHAREDMEMCONFIG,  CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuGetErrorName,           cuGetErrorName,            CUDA_CUGETERRORNAME,            CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuGetErrorString,         cuGetErrorString,          CUDA_CUGETERRORSTRING,          CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuInit,                   cuInit,                    CUDA_CUINIT,                    CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuLaunchKernel,           cuLaunchKernel,            CUDA_CULAUNCHKERNEL,            CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuMemAlloc,               cuMemAlloc_v2,             CUDA_CUMEMALLOC,                CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuMemAllocHost,           cuMemAllocHost_v2,         CUDA_CUMEMALLOCHOST,            CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuMemcpyDtoD,             cuMemcpyDtoD_v2,           CUDA_CUMEMCPYDTOD,              CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuMemcpyDtoH,             cuMemcpyDtoH_v2,           CUDA_CUMEMCPYDTOH,              CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuMemcpyHtoD,             cuMemcpyHtoD_v2,           CUDA_CUMEMCPYHTOD,              CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuMemFree,                cuMemFree_v2,              CUDA_CUMEMFREE,                 CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuMemFreeHost,            cuMemFreeHost,             CUDA_CUMEMFREEHOST,             CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuMemGetInfo,             cuMemGetInfo_v2,           CUDA_CUMEMGETINFO,              CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuMemsetD32,              cuMemsetD32_v2,            CUDA_CUMEMSETD32,               CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuMemsetD8,               cuMemsetD8_v2,             CUDA_CUMEMSETD8,                CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuModuleGetFunction,      cuModuleGetFunction,       CUDA_CUMODULEGETFUNCTION,       CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuModuleGetGlobal,        cuModuleGetGlobal_v2,      CUDA_CUMODULEGETGLOBAL,         CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuModuleLoad,             cuModuleLoad,              CUDA_CUMODULELOAD,              CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuModuleLoadData,         cuModuleLoadData,          CUDA_CUMODULELOADDATA,          CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuModuleLoadDataEx,       cuModuleLoadDataEx,        CUDA_CUMODULELOADDATAEX,        CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuModuleUnload,           cuModuleUnload,            CUDA_CUMODULEUNLOAD,            CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuProfilerStart,          cuProfilerStart,           CUDA_CUPROFILERSTART,           CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuProfilerStop,           cuProfilerStop,            CUDA_CUPROFILERSTOP,            CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuStreamCreate,           cuStreamCreate,            CUDA_CUSTREAMCREATE,            CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuStreamDestroy,          cuStreamDestroy_v2,        CUDA_CUSTREAMDESTROY,           CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuStreamSynchronize,      cuStreamSynchronize,       CUDA_CUSTREAMSYNCHRONIZE,       CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuStreamWaitEvent,        cuStreamWaitEvent,         CUDA_CUSTREAMWAITEVENT,         CUDA, 1);
+
+  return 0;
+}
+
+void cuda_close (hashcat_ctx_t *hashcat_ctx)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  CUDA_PTR *cuda = backend_ctx->cuda;
+
+  if (cuda)
+  {
+    if (cuda->lib)
+    {
+      hc_dlclose (cuda->lib);
+    }
+
+    hcfree (backend_ctx->cuda);
+
+    backend_ctx->cuda = NULL;
+  }
+}
+
+int hc_cuInit (hashcat_ctx_t *hashcat_ctx, unsigned int Flags)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  CUDA_PTR *cuda = backend_ctx->cuda;
+
+  const CUresult CU_err = cuda->cuInit (Flags);
+
+  if (CU_err != CUDA_SUCCESS)
+  {
+    const char *pStr = NULL;
+
+    if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS)
+    {
+      event_log_error (hashcat_ctx, "cuInit(): %s", pStr);
+    }
+    else
+    {
+      event_log_error (hashcat_ctx, "cuInit(): %d", CU_err);
+    }
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_cuDeviceGetAttribute (hashcat_ctx_t *hashcat_ctx, int *pi, CUdevice_attribute attrib, CUdevice dev)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  CUDA_PTR *cuda = backend_ctx->cuda;
+
+  const CUresult CU_err = cuda->cuDeviceGetAttribute (pi, attrib, dev);
+
+  if (CU_err != CUDA_SUCCESS)
+  {
+    const char *pStr = NULL;
+
+    if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS)
+    {
+      event_log_error (hashcat_ctx, "cuDeviceGetAttribute(): %s", pStr);
+    }
+    else
+    {
+      event_log_error (hashcat_ctx, "cuDeviceGetAttribute(): %d", CU_err);
+    }
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_cuDeviceGetCount (hashcat_ctx_t *hashcat_ctx, int *count)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  CUDA_PTR *cuda = backend_ctx->cuda;
+
+  const CUresult CU_err = cuda->cuDeviceGetCount (count);
+
+  if (CU_err != CUDA_SUCCESS)
+  {
+    const char *pStr = NULL;
+
+    if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS)
+    {
+      event_log_error (hashcat_ctx, "cuDeviceGetCount(): %s", pStr);
+    }
+    else
+    {
+      event_log_error (hashcat_ctx, "cuDeviceGetCount(): %d", CU_err);
+    }
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_cuDeviceGet (hashcat_ctx_t *hashcat_ctx, CUdevice* device, int ordinal)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  CUDA_PTR *cuda = backend_ctx->cuda;
+
+  const CUresult CU_err = cuda->cuDeviceGet (device, ordinal);
+
+  if (CU_err != CUDA_SUCCESS)
+  {
+    const char *pStr = NULL;
+
+    if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS)
+    {
+      event_log_error (hashcat_ctx, "cuDeviceGet(): %s", pStr);
+    }
+    else
+    {
+      event_log_error (hashcat_ctx, "cuDeviceGet(): %d", CU_err);
+    }
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_cuDeviceGetName (hashcat_ctx_t *hashcat_ctx, char *name, int len, CUdevice dev)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  CUDA_PTR *cuda = backend_ctx->cuda;
+
+  const CUresult CU_err = cuda->cuDeviceGetName (name, len, dev);
+
+  if (CU_err != CUDA_SUCCESS)
+  {
+    const char *pStr = NULL;
+
+    if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS)
+    {
+      event_log_error (hashcat_ctx, "cuDeviceGetName(): %s", pStr);
+    }
+    else
+    {
+      event_log_error (hashcat_ctx, "cuDeviceGetName(): %d", CU_err);
+    }
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_cuDeviceTotalMem (hashcat_ctx_t *hashcat_ctx, size_t *bytes, CUdevice dev)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  CUDA_PTR *cuda = backend_ctx->cuda;
+
+  const CUresult CU_err = cuda->cuDeviceTotalMem (bytes, dev);
+
+  if (CU_err != CUDA_SUCCESS)
+  {
+    const char *pStr = NULL;
+
+    if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS)
+    {
+      event_log_error (hashcat_ctx, "cuDeviceTotalMem(): %s", pStr);
+    }
+    else
+    {
+      event_log_error (hashcat_ctx, "cuDeviceTotalMem(): %d", CU_err);
+    }
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_cuDriverGetVersion (hashcat_ctx_t *hashcat_ctx, int *driverVersion)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  CUDA_PTR *cuda = backend_ctx->cuda;
+
+  const CUresult CU_err = cuda->cuDriverGetVersion (driverVersion);
+
+  if (CU_err != CUDA_SUCCESS)
+  {
+    const char *pStr = NULL;
+
+    if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS)
+    {
+      event_log_error (hashcat_ctx, "cuDriverGetVersion(): %s", pStr);
+    }
+    else
+    {
+      event_log_error (hashcat_ctx, "cuDriverGetVersion(): %d", CU_err);
+    }
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_cuCtxCreate (hashcat_ctx_t *hashcat_ctx, CUcontext *pctx, unsigned int flags, CUdevice dev)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  CUDA_PTR *cuda = backend_ctx->cuda;
+
+  const CUresult CU_err = cuda->cuCtxCreate (pctx, flags, dev);
+
+  if (CU_err != CUDA_SUCCESS)
+  {
+    const char *pStr = NULL;
+
+    if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS)
+    {
+      event_log_error (hashcat_ctx, "cuCtxCreate(): %s", pStr);
+    }
+    else
+    {
+      event_log_error (hashcat_ctx, "cuCtxCreate(): %d", CU_err);
+    }
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_cuCtxDestroy (hashcat_ctx_t *hashcat_ctx, CUcontext ctx)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  CUDA_PTR *cuda = backend_ctx->cuda;
+
+  const CUresult CU_err = cuda->cuCtxDestroy (ctx);
+
+  if (CU_err != CUDA_SUCCESS)
+  {
+    const char *pStr = NULL;
+
+    if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS)
+    {
+      event_log_error (hashcat_ctx, "cuCtxDestroy(): %s", pStr);
+    }
+    else
+    {
+      event_log_error (hashcat_ctx, "cuCtxDestroy(): %d", CU_err);
+    }
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_cuModuleLoadDataEx (hashcat_ctx_t *hashcat_ctx, CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  CUDA_PTR *cuda = backend_ctx->cuda;
+
+  const CUresult CU_err = cuda->cuModuleLoadDataEx (module, image, numOptions, options, optionValues);
+
+  if (CU_err != CUDA_SUCCESS)
+  {
+    const char *pStr = NULL;
+
+    if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS)
+    {
+      event_log_error (hashcat_ctx, "cuModuleLoadDataEx(): %s", pStr);
+    }
+    else
+    {
+      event_log_error (hashcat_ctx, "cuModuleLoadDataEx(): %d", CU_err);
+    }
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_cuModuleUnload (hashcat_ctx_t *hashcat_ctx, CUmodule hmod)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  CUDA_PTR *cuda = backend_ctx->cuda;
+
+  const CUresult CU_err = cuda->cuModuleUnload (hmod);
+
+  if (CU_err != CUDA_SUCCESS)
+  {
+    const char *pStr = NULL;
+
+    if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS)
+    {
+      event_log_error (hashcat_ctx, "cuModuleUnload(): %s", pStr);
+    }
+    else
+    {
+      event_log_error (hashcat_ctx, "cuModuleUnload(): %d", CU_err);
+    }
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_cuCtxSetCurrent (hashcat_ctx_t *hashcat_ctx, CUcontext ctx)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  CUDA_PTR *cuda = backend_ctx->cuda;
+
+  const CUresult CU_err = cuda->cuCtxSetCurrent (ctx);
+
+  if (CU_err != CUDA_SUCCESS)
+  {
+    const char *pStr = NULL;
+
+    if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS)
+    {
+      event_log_error (hashcat_ctx, "cuCtxSetCurrent(): %s", pStr);
+    }
+    else
+    {
+      event_log_error (hashcat_ctx, "cuCtxSetCurrent(): %d", CU_err);
+    }
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_cuMemAlloc (hashcat_ctx_t *hashcat_ctx, CUdeviceptr *dptr, size_t bytesize)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  CUDA_PTR *cuda = backend_ctx->cuda;
+
+  const CUresult CU_err = cuda->cuMemAlloc (dptr, bytesize);
+
+  if (CU_err != CUDA_SUCCESS)
+  {
+    const char *pStr = NULL;
+
+    if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS)
+    {
+      event_log_error (hashcat_ctx, "cuMemAlloc(): %s", pStr);
+    }
+    else
+    {
+      event_log_error (hashcat_ctx, "cuMemAlloc(): %d", CU_err);
+    }
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_cuMemFree (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dptr)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  CUDA_PTR *cuda = backend_ctx->cuda;
+
+  const CUresult CU_err = cuda->cuMemFree (dptr);
+
+  if (CU_err != CUDA_SUCCESS)
+  {
+    const char *pStr = NULL;
+
+    if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS)
+    {
+      event_log_error (hashcat_ctx, "cuMemFree(): %s", pStr);
+    }
+    else
+    {
+      event_log_error (hashcat_ctx, "cuMemFree(): %d", CU_err);
+    }
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_cuMemcpyDtoH (hashcat_ctx_t *hashcat_ctx, void *dstHost, CUdeviceptr srcDevice, size_t ByteCount)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  CUDA_PTR *cuda = backend_ctx->cuda;
+
+  const CUresult CU_err = cuda->cuMemcpyDtoH (dstHost, srcDevice, ByteCount);
+
+  if (CU_err != CUDA_SUCCESS)
+  {
+    const char *pStr = NULL;
+
+    if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS)
+    {
+      event_log_error (hashcat_ctx, "cuMemcpyDtoH(): %s", pStr);
+    }
+    else
+    {
+      event_log_error (hashcat_ctx, "cuMemcpyDtoH(): %d", CU_err);
+    }
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_cuMemcpyDtoD (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  CUDA_PTR *cuda = backend_ctx->cuda;
+
+  const CUresult CU_err = cuda->cuMemcpyDtoD (dstDevice, srcDevice, ByteCount);
+
+  if (CU_err != CUDA_SUCCESS)
+  {
+    const char *pStr = NULL;
+
+    if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS)
+    {
+      event_log_error (hashcat_ctx, "cuMemcpyDtoD(): %s", pStr);
+    }
+    else
+    {
+      event_log_error (hashcat_ctx, "cuMemcpyDtoD(): %d", CU_err);
+    }
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_cuMemcpyHtoD (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  CUDA_PTR *cuda = backend_ctx->cuda;
+
+  const CUresult CU_err = cuda->cuMemcpyHtoD (dstDevice, srcHost, ByteCount);
+
+  if (CU_err != CUDA_SUCCESS)
+  {
+    const char *pStr = NULL;
+
+    if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS)
+    {
+      event_log_error (hashcat_ctx, "cuMemcpyHtoD(): %s", pStr);
+    }
+    else
+    {
+      event_log_error (hashcat_ctx, "cuMemcpyHtoD(): %d", CU_err);
+    }
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_cuModuleGetFunction (hashcat_ctx_t *hashcat_ctx, CUfunction *hfunc, CUmodule hmod, const char *name)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  CUDA_PTR *cuda = backend_ctx->cuda;
+
+  const CUresult CU_err = cuda->cuModuleGetFunction (hfunc, hmod, name);
+
+  if (CU_err != CUDA_SUCCESS)
+  {
+    const char *pStr = NULL;
+
+    if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS)
+    {
+      event_log_error (hashcat_ctx, "cuModuleGetFunction(): %s", pStr);
+    }
+    else
+    {
+      event_log_error (hashcat_ctx, "cuModuleGetFunction(): %d", CU_err);
+    }
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_cuModuleGetGlobal (hashcat_ctx_t *hashcat_ctx, CUdeviceptr *dptr, size_t *bytes, CUmodule hmod, const char *name)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  CUDA_PTR *cuda = backend_ctx->cuda;
+
+  const CUresult CU_err = cuda->cuModuleGetGlobal (dptr, bytes, hmod, name);
+
+  if (CU_err != CUDA_SUCCESS)
+  {
+    const char *pStr = NULL;
+
+    if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS)
+    {
+      event_log_error (hashcat_ctx, "cuModuleGetGlobal(): %s", pStr);
+    }
+    else
+    {
+      event_log_error (hashcat_ctx, "cuModuleGetGlobal(): %d", CU_err);
+    }
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_cuMemGetInfo (hashcat_ctx_t *hashcat_ctx, size_t *free, size_t *total)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  CUDA_PTR *cuda = backend_ctx->cuda;
+
+  const CUresult CU_err = cuda->cuMemGetInfo (free, total);
+
+  if (CU_err != CUDA_SUCCESS)
+  {
+    const char *pStr = NULL;
+
+    if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS)
+    {
+      event_log_error (hashcat_ctx, "cuMemGetInfo(): %s", pStr);
+    }
+    else
+    {
+      event_log_error (hashcat_ctx, "cuMemGetInfo(): %d", CU_err);
+    }
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_cuFuncGetAttribute (hashcat_ctx_t *hashcat_ctx, int *pi, CUfunction_attribute attrib, CUfunction hfunc)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  CUDA_PTR *cuda = backend_ctx->cuda;
+
+  const CUresult CU_err = cuda->cuFuncGetAttribute (pi, attrib, hfunc);
+
+  if (CU_err != CUDA_SUCCESS)
+  {
+    const char *pStr = NULL;
+
+    if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS)
+    {
+      event_log_error (hashcat_ctx, "cuFuncGetAttribute(): %s", pStr);
+    }
+    else
+    {
+      event_log_error (hashcat_ctx, "cuFuncGetAttribute(): %d", CU_err);
+    }
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_cuFuncSetAttribute (hashcat_ctx_t *hashcat_ctx, CUfunction hfunc, CUfunction_attribute attrib, int value)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  CUDA_PTR *cuda = backend_ctx->cuda;
+
+  const CUresult CU_err = cuda->cuFuncSetAttribute (hfunc, attrib, value);
+
+  if (CU_err != CUDA_SUCCESS)
+  {
+    const char *pStr = NULL;
+
+    if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS)
+    {
+      event_log_error (hashcat_ctx, "cuFuncSetAttribute(): %s", pStr);
+    }
+    else
+    {
+      event_log_error (hashcat_ctx, "cuFuncSetAttribute(): %d", CU_err);
+    }
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_cuStreamCreate (hashcat_ctx_t *hashcat_ctx, CUstream *phStream, unsigned int Flags)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  CUDA_PTR *cuda = backend_ctx->cuda;
+
+  const CUresult CU_err = cuda->cuStreamCreate (phStream, Flags);
+
+  if (CU_err != CUDA_SUCCESS)
+  {
+    const char *pStr = NULL;
+
+    if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS)
+    {
+      event_log_error (hashcat_ctx, "cuStreamCreate(): %s", pStr);
+    }
+    else
+    {
+      event_log_error (hashcat_ctx, "cuStreamCreate(): %d", CU_err);
+    }
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_cuStreamDestroy (hashcat_ctx_t *hashcat_ctx, CUstream hStream)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  CUDA_PTR *cuda = backend_ctx->cuda;
+
+  const CUresult CU_err = cuda->cuStreamDestroy (hStream);
+
+  if (CU_err != CUDA_SUCCESS)
+  {
+    const char *pStr = NULL;
+
+    if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS)
+    {
+      event_log_error (hashcat_ctx, "cuStreamDestroy(): %s", pStr);
+    }
+    else
+    {
+      event_log_error (hashcat_ctx, "cuStreamDestroy(): %d", CU_err);
+    }
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_cuStreamSynchronize (hashcat_ctx_t *hashcat_ctx, CUstream hStream)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  CUDA_PTR *cuda = backend_ctx->cuda;
+
+  const CUresult CU_err = cuda->cuStreamSynchronize (hStream);
+
+  if (CU_err != CUDA_SUCCESS)
+  {
+    const char *pStr = NULL;
+
+    if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS)
+    {
+      event_log_error (hashcat_ctx, "cuStreamSynchronize(): %s", pStr);
+    }
+    else
+    {
+      event_log_error (hashcat_ctx, "cuStreamSynchronize(): %d", CU_err);
+    }
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_cuLaunchKernel (hashcat_ctx_t *hashcat_ctx, CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  CUDA_PTR *cuda = backend_ctx->cuda;
+
+  const CUresult CU_err = cuda->cuLaunchKernel (f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams, extra);
+
+  if (CU_err != CUDA_SUCCESS)
+  {
+    const char *pStr = NULL;
+
+    if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS)
+    {
+      event_log_error (hashcat_ctx, "cuLaunchKernel(): %s", pStr);
+    }
+    else
+    {
+      event_log_error (hashcat_ctx, "cuLaunchKernel(): %d", CU_err);
+    }
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_cuCtxSynchronize (hashcat_ctx_t *hashcat_ctx)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  CUDA_PTR *cuda = backend_ctx->cuda;
+
+  const CUresult CU_err = cuda->cuCtxSynchronize ();
+
+  if (CU_err != CUDA_SUCCESS)
+  {
+    const char *pStr = NULL;
+
+    if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS)
+    {
+      event_log_error (hashcat_ctx, "cuCtxSynchronize(): %s", pStr);
+    }
+    else
+    {
+      event_log_error (hashcat_ctx, "cuCtxSynchronize(): %d", CU_err);
+    }
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_cuEventCreate (hashcat_ctx_t *hashcat_ctx, CUevent *phEvent, unsigned int Flags)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  CUDA_PTR *cuda = backend_ctx->cuda;
+
+  const CUresult CU_err = cuda->cuEventCreate (phEvent, Flags);
+
+  if (CU_err != CUDA_SUCCESS)
+  {
+    const char *pStr = NULL;
+
+    if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS)
+    {
+      event_log_error (hashcat_ctx, "cuEventCreate(): %s", pStr);
+    }
+    else
+    {
+      event_log_error (hashcat_ctx, "cuEventCreate(): %d", CU_err);
+    }
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_cuEventDestroy (hashcat_ctx_t *hashcat_ctx, CUevent hEvent)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  CUDA_PTR *cuda = backend_ctx->cuda;
+
+  const CUresult CU_err = cuda->cuEventDestroy (hEvent);
+
+  if (CU_err != CUDA_SUCCESS)
+  {
+    const char *pStr = NULL;
+
+    if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS)
+    {
+      event_log_error (hashcat_ctx, "cuEventDestroy(): %s", pStr);
+    }
+    else
+    {
+      event_log_error (hashcat_ctx, "cuEventDestroy(): %d", CU_err);
+    }
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_cuEventElapsedTime (hashcat_ctx_t *hashcat_ctx, float *pMilliseconds, CUevent hStart, CUevent hEnd)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  CUDA_PTR *cuda = backend_ctx->cuda;
+
+  const CUresult CU_err = cuda->cuEventElapsedTime (pMilliseconds, hStart, hEnd);
+
+  if (CU_err != CUDA_SUCCESS)
+  {
+    const char *pStr = NULL;
+
+    if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS)
+    {
+      event_log_error (hashcat_ctx, "cuEventElapsedTime(): %s", pStr);
+    }
+    else
+    {
+      event_log_error (hashcat_ctx, "cuEventElapsedTime(): %d", CU_err);
+    }
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_cuEventQuery (hashcat_ctx_t *hashcat_ctx, CUevent hEvent)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  CUDA_PTR *cuda = backend_ctx->cuda;
+
+  const CUresult CU_err = cuda->cuEventQuery (hEvent);
+
+  if (CU_err != CUDA_SUCCESS)
+  {
+    const char *pStr = NULL;
+
+    if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS)
+    {
+      event_log_error (hashcat_ctx, "cuEventQuery(): %s", pStr);
+    }
+    else
+    {
+      event_log_error (hashcat_ctx, "cuEventQuery(): %d", CU_err);
+    }
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_cuEventRecord (hashcat_ctx_t *hashcat_ctx, CUevent hEvent, CUstream hStream)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  CUDA_PTR *cuda = backend_ctx->cuda;
+
+  const CUresult CU_err = cuda->cuEventRecord (hEvent, hStream);
+
+  if (CU_err != CUDA_SUCCESS)
+  {
+    const char *pStr = NULL;
+
+    if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS)
+    {
+      event_log_error (hashcat_ctx, "cuEventRecord(): %s", pStr);
+    }
+    else
+    {
+      event_log_error (hashcat_ctx, "cuEventRecord(): %d", CU_err);
+    }
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_cuEventSynchronize (hashcat_ctx_t *hashcat_ctx, CUevent hEvent)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  CUDA_PTR *cuda = backend_ctx->cuda;
+
+  const CUresult CU_err = cuda->cuEventSynchronize (hEvent);
+
+  if (CU_err != CUDA_SUCCESS)
+  {
+    const char *pStr = NULL;
+
+    if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS)
+    {
+      event_log_error (hashcat_ctx, "cuEventSynchronize(): %s", pStr);
+    }
+    else
+    {
+      event_log_error (hashcat_ctx, "cuEventSynchronize(): %d", CU_err);
+    }
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_cuCtxSetCacheConfig (hashcat_ctx_t *hashcat_ctx, CUfunc_cache config)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  CUDA_PTR *cuda = backend_ctx->cuda;
+
+  const CUresult CU_err = cuda->cuCtxSetCacheConfig (config);
+
+  if (CU_err != CUDA_SUCCESS)
+  {
+    const char *pStr = NULL;
+
+    if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS)
+    {
+      event_log_error (hashcat_ctx, "cuCtxSetCacheConfig(): %s", pStr);
+    }
+    else
+    {
+      event_log_error (hashcat_ctx, "cuCtxSetCacheConfig(): %d", CU_err);
+    }
+
+    return -1;
+  }
+
+  return 0;
+}
+
+
+
+// OpenCL
+
+int ocl_init (hashcat_ctx_t *hashcat_ctx)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  OCL_PTR *ocl = backend_ctx->ocl;
+
+  memset (ocl, 0, sizeof (OCL_PTR));
+
+  #if   defined (_WIN)
+  ocl->lib = hc_dlopen ("OpenCL");
+  #elif defined (__APPLE__)
+  ocl->lib = hc_dlopen ("/System/Library/Frameworks/OpenCL.framework/OpenCL");
+  #elif defined (__CYGWIN__)
+  ocl->lib = hc_dlopen ("opencl.dll");
+
+  if (ocl->lib == NULL) ocl->lib = hc_dlopen ("cygOpenCL-1.dll");
+  #else
+  ocl->lib = hc_dlopen ("libOpenCL.so");
+
+  if (ocl->lib == NULL) ocl->lib = hc_dlopen ("libOpenCL.so.1");
+  #endif
+
+  if (ocl->lib == NULL) return -1;
+
+  HC_LOAD_FUNC (ocl, clBuildProgram,            OCL_CLBUILDPROGRAM,             OpenCL, 1);
+  HC_LOAD_FUNC (ocl, clCreateBuffer,            OCL_CLCREATEBUFFER,             OpenCL, 1);
+  HC_LOAD_FUNC (ocl, clCreateCommandQueue,      OCL_CLCREATECOMMANDQUEUE,       OpenCL, 1);
+  HC_LOAD_FUNC (ocl, clCreateContext,           OCL_CLCREATECONTEXT,            OpenCL, 1);
+  HC_LOAD_FUNC (ocl, clCreateKernel,            OCL_CLCREATEKERNEL,             OpenCL, 1);
+  HC_LOAD_FUNC (ocl, clCreateProgramWithBinary, OCL_CLCREATEPROGRAMWITHBINARY,  OpenCL, 1);
+  HC_LOAD_FUNC (ocl, clCreateProgramWithSource, OCL_CLCREATEPROGRAMWITHSOURCE,  OpenCL, 1);
+  HC_LOAD_FUNC (ocl, clEnqueueCopyBuffer,       OCL_CLENQUEUECOPYBUFFER,        OpenCL, 1);
+  HC_LOAD_FUNC (ocl, clEnqueueMapBuffer,        OCL_CLENQUEUEMAPBUFFER,         OpenCL, 1);
+  HC_LOAD_FUNC (ocl, clEnqueueNDRangeKernel,    OCL_CLENQUEUENDRANGEKERNEL,     OpenCL, 1);
+  HC_LOAD_FUNC (ocl, clEnqueueReadBuffer,       OCL_CLENQUEUEREADBUFFER,        OpenCL, 1);
+  HC_LOAD_FUNC (ocl, clEnqueueUnmapMemObject,   OCL_CLENQUEUEUNMAPMEMOBJECT,    OpenCL, 1);
+  HC_LOAD_FUNC (ocl, clEnqueueWriteBuffer,      OCL_CLENQUEUEWRITEBUFFER,       OpenCL, 1);
+  HC_LOAD_FUNC (ocl, clFinish,                  OCL_CLFINISH,                   OpenCL, 1);
+  HC_LOAD_FUNC (ocl, clFlush,                   OCL_CLFLUSH,                    OpenCL, 1);
+  HC_LOAD_FUNC (ocl, clGetDeviceIDs,            OCL_CLGETDEVICEIDS,             OpenCL, 1);
+  HC_LOAD_FUNC (ocl, clGetDeviceInfo,           OCL_CLGETDEVICEINFO,            OpenCL, 1);
+  HC_LOAD_FUNC (ocl, clGetEventInfo,            OCL_CLGETEVENTINFO,             OpenCL, 1);
+  HC_LOAD_FUNC (ocl, clGetKernelWorkGroupInfo,  OCL_CLGETKERNELWORKGROUPINFO,   OpenCL, 1);
+  HC_LOAD_FUNC (ocl, clGetPlatformIDs,          OCL_CLGETPLATFORMIDS,           OpenCL, 1);
+  HC_LOAD_FUNC (ocl, clGetPlatformInfo,         OCL_CLGETPLATFORMINFO,          OpenCL, 1);
+  HC_LOAD_FUNC (ocl, clGetProgramBuildInfo,     OCL_CLGETPROGRAMBUILDINFO,      OpenCL, 1);
+  HC_LOAD_FUNC (ocl, clGetProgramInfo,          OCL_CLGETPROGRAMINFO,           OpenCL, 1);
+  HC_LOAD_FUNC (ocl, clReleaseCommandQueue,     OCL_CLRELEASECOMMANDQUEUE,      OpenCL, 1);
+  HC_LOAD_FUNC (ocl, clReleaseContext,          OCL_CLRELEASECONTEXT,           OpenCL, 1);
+  HC_LOAD_FUNC (ocl, clReleaseKernel,           OCL_CLRELEASEKERNEL,            OpenCL, 1);
+  HC_LOAD_FUNC (ocl, clReleaseMemObject,        OCL_CLRELEASEMEMOBJECT,         OpenCL, 1);
+  HC_LOAD_FUNC (ocl, clReleaseProgram,          OCL_CLRELEASEPROGRAM,           OpenCL, 1);
+  HC_LOAD_FUNC (ocl, clSetKernelArg,            OCL_CLSETKERNELARG,             OpenCL, 1);
+  HC_LOAD_FUNC (ocl, clWaitForEvents,           OCL_CLWAITFOREVENTS,            OpenCL, 1);
+  HC_LOAD_FUNC (ocl, clGetEventProfilingInfo,   OCL_CLGETEVENTPROFILINGINFO,    OpenCL, 1);
+  HC_LOAD_FUNC (ocl, clReleaseEvent,            OCL_CLRELEASEEVENT,             OpenCL, 1);
+
+  return 0;
+}
+
+void ocl_close (hashcat_ctx_t *hashcat_ctx)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  OCL_PTR *ocl = backend_ctx->ocl;
+
+  if (ocl)
+  {
+    if (ocl->lib)
+    {
+      hc_dlclose (ocl->lib);
+    }
+
+    hcfree (backend_ctx->ocl);
+
+    backend_ctx->ocl = NULL;
+  }
+}
+
+int hc_clEnqueueNDRangeKernel (hashcat_ctx_t *hashcat_ctx, cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *global_work_offset, const size_t *global_work_size, const size_t *local_work_size, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  OCL_PTR *ocl = backend_ctx->ocl;
+
+  const cl_int CL_err = ocl->clEnqueueNDRangeKernel (command_queue, kernel, work_dim, global_work_offset, global_work_size, local_work_size, num_events_in_wait_list, event_wait_list, event);
+
+  if (CL_err != CL_SUCCESS)
+  {
+    event_log_error (hashcat_ctx, "clEnqueueNDRangeKernel(): %s", val2cstr_cl (CL_err));
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_clGetEventInfo (hashcat_ctx_t *hashcat_ctx, cl_event event, cl_event_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  OCL_PTR *ocl = backend_ctx->ocl;
+
+  const cl_int CL_err = ocl->clGetEventInfo (event, param_name, param_value_size, param_value, param_value_size_ret);
+
+  if (CL_err != CL_SUCCESS)
+  {
+    event_log_error (hashcat_ctx, "clGetEventInfo(): %s", val2cstr_cl (CL_err));
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_clFlush (hashcat_ctx_t *hashcat_ctx, cl_command_queue command_queue)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  OCL_PTR *ocl = backend_ctx->ocl;
+
+  const cl_int CL_err = ocl->clFlush (command_queue);
+
+  if (CL_err != CL_SUCCESS)
+  {
+    event_log_error (hashcat_ctx, "clFlush(): %s", val2cstr_cl (CL_err));
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_clFinish (hashcat_ctx_t *hashcat_ctx, cl_command_queue command_queue)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  OCL_PTR *ocl = backend_ctx->ocl;
+
+  const cl_int CL_err = ocl->clFinish (command_queue);
+
+  if (CL_err != CL_SUCCESS)
+  {
+    event_log_error (hashcat_ctx, "clFinish(): %s", val2cstr_cl (CL_err));
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_clSetKernelArg (hashcat_ctx_t *hashcat_ctx, cl_kernel kernel, cl_uint arg_index, size_t arg_size, const void *arg_value)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  OCL_PTR *ocl = backend_ctx->ocl;
+
+  const cl_int CL_err = ocl->clSetKernelArg (kernel, arg_index, arg_size, arg_value);
+
+  if (CL_err != CL_SUCCESS)
+  {
+    event_log_error (hashcat_ctx, "clSetKernelArg(): %s", val2cstr_cl (CL_err));
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_clEnqueueWriteBuffer (hashcat_ctx_t *hashcat_ctx, cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_write, size_t offset, size_t size, const void *ptr, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  OCL_PTR *ocl = backend_ctx->ocl;
+
+  const cl_int CL_err = ocl->clEnqueueWriteBuffer (command_queue, buffer, blocking_write, offset, size, ptr, num_events_in_wait_list, event_wait_list, event);
+
+  if (CL_err != CL_SUCCESS)
+  {
+    event_log_error (hashcat_ctx, "clEnqueueWriteBuffer(): %s", val2cstr_cl (CL_err));
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_clEnqueueCopyBuffer (hashcat_ctx_t *hashcat_ctx, cl_command_queue command_queue, cl_mem src_buffer, cl_mem dst_buffer, size_t src_offset, size_t dst_offset, size_t size, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  OCL_PTR *ocl = backend_ctx->ocl;
+
+  const cl_int CL_err = ocl->clEnqueueCopyBuffer (command_queue, src_buffer, dst_buffer, src_offset, dst_offset, size, num_events_in_wait_list, event_wait_list, event);
+
+  if (CL_err != CL_SUCCESS)
+  {
+    event_log_error (hashcat_ctx, "clEnqueueCopyBuffer(): %s", val2cstr_cl (CL_err));
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_clEnqueueReadBuffer (hashcat_ctx_t *hashcat_ctx, cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_read, size_t offset, size_t size, void *ptr, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  OCL_PTR *ocl = backend_ctx->ocl;
+
+  const cl_int CL_err = ocl->clEnqueueReadBuffer (command_queue, buffer, blocking_read, offset, size, ptr, num_events_in_wait_list, event_wait_list, event);
+
+  if (CL_err != CL_SUCCESS)
+  {
+    event_log_error (hashcat_ctx, "clEnqueueReadBuffer(): %s", val2cstr_cl (CL_err));
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_clGetPlatformIDs (hashcat_ctx_t *hashcat_ctx, cl_uint num_entries, cl_platform_id *platforms, cl_uint *num_platforms)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  OCL_PTR *ocl = backend_ctx->ocl;
+
+  const cl_int CL_err = ocl->clGetPlatformIDs (num_entries, platforms, num_platforms);
+
+  if (CL_err != CL_SUCCESS)
+  {
+    event_log_error (hashcat_ctx, "clGetPlatformIDs(): %s", val2cstr_cl (CL_err));
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_clGetPlatformInfo (hashcat_ctx_t *hashcat_ctx, cl_platform_id platform, cl_platform_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  OCL_PTR *ocl = backend_ctx->ocl;
+
+  const cl_int CL_err = ocl->clGetPlatformInfo (platform, param_name, param_value_size, param_value, param_value_size_ret);
+
+  if (CL_err != CL_SUCCESS)
+  {
+    event_log_error (hashcat_ctx, "clGetPlatformInfo(): %s", val2cstr_cl (CL_err));
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_clGetDeviceIDs (hashcat_ctx_t *hashcat_ctx, cl_platform_id platform, cl_device_type device_type, cl_uint num_entries, cl_device_id *devices, cl_uint *num_devices)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  OCL_PTR *ocl = backend_ctx->ocl;
+
+  const cl_int CL_err = ocl->clGetDeviceIDs (platform, device_type, num_entries, devices, num_devices);
+
+  if (CL_err != CL_SUCCESS)
+  {
+    event_log_error (hashcat_ctx, "clGetDeviceIDs(): %s", val2cstr_cl (CL_err));
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_clGetDeviceInfo (hashcat_ctx_t *hashcat_ctx, cl_device_id device, cl_device_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  OCL_PTR *ocl = backend_ctx->ocl;
+
+  const cl_int CL_err = ocl->clGetDeviceInfo (device, param_name, param_value_size, param_value, param_value_size_ret);
+
+  if (CL_err != CL_SUCCESS)
+  {
+    event_log_error (hashcat_ctx, "clGetDeviceInfo(): %s", val2cstr_cl (CL_err));
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_clCreateContext (hashcat_ctx_t *hashcat_ctx, const cl_context_properties *properties, cl_uint num_devices, const cl_device_id *devices, void (CL_CALLBACK *pfn_notify) (const char *errinfo, const void *private_info, size_t cb, void *user_data), void *user_data, cl_context *context)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  OCL_PTR *ocl = backend_ctx->ocl;
+
+  cl_int CL_err;
+
+  *context = ocl->clCreateContext (properties, num_devices, devices, pfn_notify, user_data, &CL_err);
+
+  if (CL_err != CL_SUCCESS)
+  {
+    event_log_error (hashcat_ctx, "clCreateContext(): %s", val2cstr_cl (CL_err));
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_clCreateCommandQueue (hashcat_ctx_t *hashcat_ctx, cl_context context, cl_device_id device, cl_command_queue_properties properties, cl_command_queue *command_queue)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  OCL_PTR *ocl = backend_ctx->ocl;
+
+  cl_int CL_err;
+
+  *command_queue = ocl->clCreateCommandQueue (context, device, properties, &CL_err);
+
+  if (CL_err != CL_SUCCESS)
+  {
+    event_log_error (hashcat_ctx, "clCreateCommandQueue(): %s", val2cstr_cl (CL_err));
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_clCreateBuffer (hashcat_ctx_t *hashcat_ctx, cl_context context, cl_mem_flags flags, size_t size, void *host_ptr, cl_mem *mem)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  OCL_PTR *ocl = backend_ctx->ocl;
+
+  cl_int CL_err;
+
+  *mem = ocl->clCreateBuffer (context, flags, size, host_ptr, &CL_err);
+
+  if (CL_err != CL_SUCCESS)
+  {
+    event_log_error (hashcat_ctx, "clCreateBuffer(): %s", val2cstr_cl (CL_err));
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_clCreateProgramWithSource (hashcat_ctx_t *hashcat_ctx, cl_context context, cl_uint count, const char **strings, const size_t *lengths, cl_program *program)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  OCL_PTR *ocl = backend_ctx->ocl;
+
+  cl_int CL_err;
+
+  *program = ocl->clCreateProgramWithSource (context, count, strings, lengths, &CL_err);
+
+  if (CL_err != CL_SUCCESS)
+  {
+    event_log_error (hashcat_ctx, "clCreateProgramWithSource(): %s", val2cstr_cl (CL_err));
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_clCreateProgramWithBinary (hashcat_ctx_t *hashcat_ctx, cl_context context, cl_uint num_devices, const cl_device_id *device_list, const size_t *lengths, const unsigned char **binaries, cl_int *binary_status, cl_program *program)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  OCL_PTR *ocl = backend_ctx->ocl;
+
+  cl_int CL_err;
+
+  *program = ocl->clCreateProgramWithBinary (context, num_devices, device_list, lengths, (const unsigned char **) binaries, binary_status, &CL_err);
+
+  if (CL_err != CL_SUCCESS)
+  {
+    event_log_error (hashcat_ctx, "clCreateProgramWithBinary(): %s", val2cstr_cl (CL_err));
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_clBuildProgram (hashcat_ctx_t *hashcat_ctx, cl_program program, cl_uint num_devices, const cl_device_id *device_list, const char *options, void (CL_CALLBACK *pfn_notify) (cl_program program, void *user_data), void *user_data)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  OCL_PTR *ocl = backend_ctx->ocl;
+
+  const cl_int CL_err = ocl->clBuildProgram (program, num_devices, device_list, options, pfn_notify, user_data);
+
+  if (CL_err != CL_SUCCESS)
+  {
+    event_log_error (hashcat_ctx, "clBuildProgram(): %s", val2cstr_cl (CL_err));
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_clCreateKernel (hashcat_ctx_t *hashcat_ctx, cl_program program, const char *kernel_name, cl_kernel *kernel)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  OCL_PTR *ocl = backend_ctx->ocl;
+
+  cl_int CL_err;
+
+  *kernel = ocl->clCreateKernel (program, kernel_name, &CL_err);
+
+  if (CL_err != CL_SUCCESS)
+  {
+    event_log_error (hashcat_ctx, "clCreateKernel(): %s", val2cstr_cl (CL_err));
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_clReleaseMemObject (hashcat_ctx_t *hashcat_ctx, cl_mem mem)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  OCL_PTR *ocl = backend_ctx->ocl;
+
+  const cl_int CL_err = ocl->clReleaseMemObject (mem);
+
+  if (CL_err != CL_SUCCESS)
+  {
+    event_log_error (hashcat_ctx, "clReleaseMemObject(): %s", val2cstr_cl (CL_err));
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_clReleaseKernel (hashcat_ctx_t *hashcat_ctx, cl_kernel kernel)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  OCL_PTR *ocl = backend_ctx->ocl;
+
+  const cl_int CL_err = ocl->clReleaseKernel (kernel);
+
+  if (CL_err != CL_SUCCESS)
+  {
+    event_log_error (hashcat_ctx, "clReleaseKernel(): %s", val2cstr_cl (CL_err));
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_clReleaseProgram (hashcat_ctx_t *hashcat_ctx, cl_program program)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  OCL_PTR *ocl = backend_ctx->ocl;
+
+  const cl_int CL_err = ocl->clReleaseProgram (program);
+
+  if (CL_err != CL_SUCCESS)
+  {
+    event_log_error (hashcat_ctx, "clReleaseProgram(): %s", val2cstr_cl (CL_err));
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_clReleaseCommandQueue (hashcat_ctx_t *hashcat_ctx, cl_command_queue command_queue)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  OCL_PTR *ocl = backend_ctx->ocl;
+
+  const cl_int CL_err = ocl->clReleaseCommandQueue (command_queue);
+
+  if (CL_err != CL_SUCCESS)
+  {
+    event_log_error (hashcat_ctx, "clReleaseCommandQueue(): %s", val2cstr_cl (CL_err));
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_clReleaseContext (hashcat_ctx_t *hashcat_ctx, cl_context context)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  OCL_PTR *ocl = backend_ctx->ocl;
+
+  const cl_int CL_err = ocl->clReleaseContext (context);
+
+  if (CL_err != CL_SUCCESS)
+  {
+    event_log_error (hashcat_ctx, "clReleaseContext(): %s", val2cstr_cl (CL_err));
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_clEnqueueMapBuffer (hashcat_ctx_t *hashcat_ctx, cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_map, cl_map_flags map_flags, size_t offset, size_t size, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event, void **buf)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  OCL_PTR *ocl = backend_ctx->ocl;
+
+  cl_int CL_err;
+
+  *buf = ocl->clEnqueueMapBuffer (command_queue, buffer, blocking_map, map_flags, offset, size, num_events_in_wait_list, event_wait_list, event, &CL_err);
+
+  if (CL_err != CL_SUCCESS)
+  {
+    event_log_error (hashcat_ctx, "clEnqueueMapBuffer(): %s", val2cstr_cl (CL_err));
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_clEnqueueUnmapMemObject (hashcat_ctx_t *hashcat_ctx, cl_command_queue command_queue, cl_mem memobj, void *mapped_ptr, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  OCL_PTR *ocl = backend_ctx->ocl;
+
+  const cl_int CL_err = ocl->clEnqueueUnmapMemObject (command_queue, memobj, mapped_ptr, num_events_in_wait_list, event_wait_list, event);
+
+  if (CL_err != CL_SUCCESS)
+  {
+    event_log_error (hashcat_ctx, "clEnqueueUnmapMemObject(): %s", val2cstr_cl (CL_err));
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_clGetKernelWorkGroupInfo (hashcat_ctx_t *hashcat_ctx, cl_kernel kernel, cl_device_id device, cl_kernel_work_group_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  OCL_PTR *ocl = backend_ctx->ocl;
+
+  const cl_int CL_err = ocl->clGetKernelWorkGroupInfo (kernel, device, param_name, param_value_size, param_value, param_value_size_ret);
+
+  if (CL_err != CL_SUCCESS)
+  {
+    event_log_error (hashcat_ctx, "clGetKernelWorkGroupInfo(): %s", val2cstr_cl (CL_err));
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_clGetProgramBuildInfo (hashcat_ctx_t *hashcat_ctx, cl_program program, cl_device_id device, cl_program_build_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  OCL_PTR *ocl = backend_ctx->ocl;
+
+  const cl_int CL_err = ocl->clGetProgramBuildInfo (program, device, param_name, param_value_size, param_value, param_value_size_ret);
+
+  if (CL_err != CL_SUCCESS)
+  {
+    event_log_error (hashcat_ctx, "clGetProgramBuildInfo(): %s", val2cstr_cl (CL_err));
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_clGetProgramInfo (hashcat_ctx_t *hashcat_ctx, cl_program program, cl_program_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  OCL_PTR *ocl = backend_ctx->ocl;
+
+  const cl_int CL_err = ocl->clGetProgramInfo (program, param_name, param_value_size, param_value, param_value_size_ret);
+
+  if (CL_err != CL_SUCCESS)
+  {
+    event_log_error (hashcat_ctx, "clGetProgramInfo(): %s", val2cstr_cl (CL_err));
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_clWaitForEvents (hashcat_ctx_t *hashcat_ctx, cl_uint num_events, const cl_event *event_list)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  OCL_PTR *ocl = backend_ctx->ocl;
+
+  const cl_int CL_err = ocl->clWaitForEvents (num_events, event_list);
+
+  if (CL_err != CL_SUCCESS)
+  {
+    event_log_error (hashcat_ctx, "clWaitForEvents(): %s", val2cstr_cl (CL_err));
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_clGetEventProfilingInfo (hashcat_ctx_t *hashcat_ctx, cl_event event, cl_profiling_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  OCL_PTR *ocl = backend_ctx->ocl;
+
+  const cl_int CL_err = ocl->clGetEventProfilingInfo (event, param_name, param_value_size, param_value, param_value_size_ret);
+
+  if (CL_err != CL_SUCCESS)
+  {
+    event_log_error (hashcat_ctx, "clGetEventProfilingInfo(): %s", val2cstr_cl (CL_err));
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_clReleaseEvent (hashcat_ctx_t *hashcat_ctx, cl_event event)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  OCL_PTR *ocl = backend_ctx->ocl;
+
+  const cl_int CL_err = ocl->clReleaseEvent (event);
+
+  if (CL_err != CL_SUCCESS)
+  {
+    event_log_error (hashcat_ctx, "clReleaseEvent(): %s", val2cstr_cl (CL_err));
+
+    return -1;
+  }
+
+  return 0;
+}
+
+// Backend
+
+int gidd_to_pw_t (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u64 gidd, pw_t *pw)
+{
+  pw_idx_t pw_idx;
+
+  pw_idx.off = 0;
+  pw_idx.cnt = 0;
+  pw_idx.len = 0;
+
+  if (device_param->is_cuda == true)
+  {
+    const int CU_rc = hc_cuMemcpyDtoH (hashcat_ctx, &pw_idx, device_param->cuda_d_pws_idx + (gidd * sizeof (pw_idx_t)), sizeof (pw_idx_t));
+
+    if (CU_rc == -1) return -1;
+  }
+
+  if (device_param->is_opencl == true)
+  {
+    const int CL_rc = hc_clEnqueueReadBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_idx, CL_TRUE, gidd * sizeof (pw_idx_t), sizeof (pw_idx_t), &pw_idx, 0, NULL, NULL);
+
+    if (CL_rc == -1) return -1;
+  }
+
+  const u32 off = pw_idx.off;
+  const u32 cnt = pw_idx.cnt;
+  const u32 len = pw_idx.len;
+
+  if (device_param->is_cuda == true)
+  {
+    if (cnt > 0)
+    {
+      const int CU_rc = hc_cuMemcpyDtoH (hashcat_ctx,pw->i, device_param->cuda_d_pws_comp_buf + (off * sizeof (u32)), cnt * sizeof (u32));
+
+      if (CU_rc == -1) return -1;
+    }
+  }
+
+  if (device_param->is_opencl == true)
+  {
+    if (cnt > 0)
+    {
+      const int CL_rc = hc_clEnqueueReadBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_comp_buf, CL_TRUE, off * sizeof (u32), cnt * sizeof (u32), pw->i, 0, NULL, NULL);
+
+      if (CL_rc == -1) return -1;
+    }
+  }
+
+  for (u32 i = cnt; i < 64; i++)
+  {
+    pw->i[i] = 0;
+  }
+
+  pw->pw_len = len;
+
+  return 0;
+}
+
+int choose_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u32 highest_pw_len, const u64 pws_cnt, const u32 fast_iteration, const u32 salt_pos)
+{
+  hashconfig_t   *hashconfig   = hashcat_ctx->hashconfig;
+  hashes_t       *hashes       = hashcat_ctx->hashes;
+  module_ctx_t   *module_ctx   = hashcat_ctx->module_ctx;
+  status_ctx_t   *status_ctx   = hashcat_ctx->status_ctx;
+  user_options_t *user_options = hashcat_ctx->user_options;
+
+  if (user_options->stdout_flag == true)
+  {
+    return process_stdout (hashcat_ctx, device_param, pws_cnt);
+  }
+
+  int CU_rc;
+  int CL_rc;
+
+  if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
+  {
+    if (user_options->attack_mode == ATTACK_MODE_BF)
+    {
+      if (user_options->slow_candidates == true)
+      {
+      }
+      else
+      {
+        if (hashconfig->opts_type & OPTS_TYPE_PT_BITSLICE)
+        {
+          const u32 size_tm = 32 * sizeof (bs_word_t);
+
+          if (device_param->is_cuda == true)
+          {
+            CU_rc = run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_tm_c, size_tm);
+
+            if (CU_rc == -1) return -1;
+          }
+
+          if (device_param->is_opencl == true)
+          {
+            CL_rc = run_opencl_kernel_bzero (hashcat_ctx, device_param, device_param->opencl_d_tm_c, size_tm);
+
+            if (CL_rc == -1) return -1;
+          }
+
+          const int rc_kernel_tm = run_kernel_tm (hashcat_ctx, device_param);
+
+          if (rc_kernel_tm == -1) return -1;
+
+          if (device_param->is_cuda == true)
+          {
+            const int CU_rc = hc_cuMemcpyDtoD (hashcat_ctx, device_param->cuda_d_bfs_c, device_param->cuda_d_tm_c, size_tm);
+
+            if (CU_rc == -1) return -1;
+          }
+
+          if (device_param->is_opencl == true)
+          {
+            CL_rc = hc_clEnqueueCopyBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_tm_c, device_param->opencl_d_bfs_c, 0, 0, size_tm, 0, NULL, NULL);
+
+            if (CL_rc == -1) return -1;
+          }
+        }
+      }
+    }
+
+    if (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL)
+    {
+      if (highest_pw_len < 16)
+      {
+        CL_rc = run_kernel (hashcat_ctx, device_param, KERN_RUN_1, pws_cnt, true, fast_iteration);
+
+        if (CL_rc == -1) return -1;
+      }
+      else if (highest_pw_len < 32)
+      {
+        CL_rc = run_kernel (hashcat_ctx, device_param, KERN_RUN_2, pws_cnt, true, fast_iteration);
+
+        if (CL_rc == -1) return -1;
+      }
+      else
+      {
+        CL_rc = run_kernel (hashcat_ctx, device_param, KERN_RUN_3, pws_cnt, true, fast_iteration);
+
+        if (CL_rc == -1) return -1;
+      }
+    }
+    else
+    {
+      CL_rc = run_kernel (hashcat_ctx, device_param, KERN_RUN_4, pws_cnt, true, fast_iteration);
+
+      if (CL_rc == -1) return -1;
+    }
+  }
+  else
+  {
+    bool run_init = true;
+    bool run_loop = true;
+    bool run_comp = true;
+
+    if (run_init == true)
+    {
+      if (device_param->is_cuda == true)
+      {
+        CU_rc = hc_cuMemcpyDtoD (hashcat_ctx, device_param->cuda_d_pws_buf, device_param->cuda_d_pws_amp_buf, pws_cnt * sizeof (pw_t));
+
+        if (CU_rc == -1) return -1;
+      }
+
+      if (device_param->is_opencl == true)
+      {
+        CL_rc = hc_clEnqueueCopyBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_amp_buf, device_param->opencl_d_pws_buf, 0, 0, pws_cnt * sizeof (pw_t), 0, NULL, NULL);
+
+        if (CL_rc == -1) return -1;
+      }
+
+      if (user_options->slow_candidates == true)
+      {
+      }
+      else
+      {
+        CL_rc = run_kernel_amp (hashcat_ctx, device_param, pws_cnt);
+
+        if (CL_rc == -1) return -1;
+      }
+
+      const int rc_kernel = run_kernel (hashcat_ctx, device_param, KERN_RUN_1, pws_cnt, false, 0);
+
+      if (rc_kernel == -1) return -1;
+
+      if (hashconfig->opts_type & OPTS_TYPE_HOOK12)
+      {
+        const int rc_kernel = run_kernel (hashcat_ctx, device_param, KERN_RUN_12, pws_cnt, false, 0);
+
+        if (rc_kernel == -1) return -1;
+
+        if (device_param->is_cuda == true)
+        {
+          CU_rc = hc_cuMemcpyDtoH (hashcat_ctx, device_param->hooks_buf, device_param->cuda_d_hooks, device_param->size_hooks);
+
+          if (CU_rc == -1) return -1;
+        }
+
+        if (device_param->is_opencl == true)
+        {
+          CL_rc = hc_clEnqueueReadBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_hooks, CL_TRUE, 0, device_param->size_hooks, device_param->hooks_buf, 0, NULL, NULL);
+
+          if (CL_rc == -1) return -1;
+        }
+
+        module_ctx->module_hook12 (device_param, hashes->hook_salts_buf, salt_pos, pws_cnt);
+
+        if (device_param->is_cuda == true)
+        {
+          CU_rc = hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_hooks, device_param->hooks_buf, device_param->size_hooks);
+
+          if (CU_rc == -1) return -1;
+        }
+
+        if (device_param->is_opencl == true)
+        {
+          CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_hooks, CL_TRUE, 0, device_param->size_hooks, device_param->hooks_buf, 0, NULL, NULL);
+
+          if (CL_rc == -1) return -1;
+        }
+      }
+    }
+
+    if (run_loop == true)
+    {
+      u32 iter = hashes->salts_buf[salt_pos].salt_iter;
+
+      u32 loop_step = device_param->kernel_loops;
+
+      for (u32 loop_pos = 0, slow_iteration = 0; loop_pos < iter; loop_pos += loop_step, slow_iteration++)
+      {
+        u32 loop_left = iter - loop_pos;
+
+        loop_left = MIN (loop_left, loop_step);
+
+        device_param->kernel_params_buf32[28] = loop_pos;
+        device_param->kernel_params_buf32[29] = loop_left;
+
+        const int rc_kernel = run_kernel (hashcat_ctx, device_param, KERN_RUN_2, pws_cnt, true, slow_iteration);
+
+        if (rc_kernel == -1) return -1;
+
+        //bug?
+        //while (status_ctx->run_thread_level2 == false) break;
+        if (status_ctx->run_thread_level2 == false) break;
+
+        /**
+         * speed
+         */
+
+        const float iter_part = (float) (loop_pos + loop_left) / iter;
+
+        const u64 perf_sum_all = (u64) (pws_cnt * iter_part);
+
+        double speed_msec = hc_timer_get (device_param->timer_speed);
+
+        const u32 speed_pos = device_param->speed_pos;
+
+        device_param->speed_cnt[speed_pos] = perf_sum_all;
+
+        device_param->speed_msec[speed_pos] = speed_msec;
+
+        if (user_options->speed_only == true)
+        {
+          if (speed_msec > 4000)
+          {
+            device_param->outerloop_multi *= (double) iter / (double) (loop_pos + loop_left);
+
+            device_param->speed_pos = 1;
+
+            device_param->speed_only_finish = true;
+
+            return 0;
+          }
+        }
+      }
+
+      if (hashconfig->opts_type & OPTS_TYPE_HOOK23)
+      {
+        const int rc_kernel = run_kernel (hashcat_ctx, device_param, KERN_RUN_23, pws_cnt, false, 0);
+
+        if (rc_kernel == -1) return -1;
+
+        if (device_param->is_cuda == true)
+        {
+          CU_rc = hc_cuMemcpyDtoH (hashcat_ctx, device_param->hooks_buf, device_param->cuda_d_hooks, device_param->size_hooks);
+
+          if (CU_rc == -1) return -1;
+        }
+
+        if (device_param->is_opencl == true)
+        {
+          CL_rc = hc_clEnqueueReadBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_hooks, CL_TRUE, 0, device_param->size_hooks, device_param->hooks_buf, 0, NULL, NULL);
+
+          if (CL_rc == -1) return -1;
+        }
+
+        module_ctx->module_hook23 (device_param, hashes->hook_salts_buf, salt_pos, pws_cnt);
+
+        if (device_param->is_cuda == true)
+        {
+          CU_rc = hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_hooks, device_param->hooks_buf, device_param->size_hooks);
+
+          if (CU_rc == -1) return -1;
+        }
+
+        if (device_param->is_opencl == true)
+        {
+          CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_hooks, CL_TRUE, 0, device_param->size_hooks, device_param->hooks_buf, 0, NULL, NULL);
+
+          if (CL_rc == -1) return -1;
+        }
+      }
+    }
+
+    // init2 and loop2 are kind of special, we use run_loop for them, too
+
+    if (run_loop == true)
+    {
+      // note: they also do not influence the performance screen
+      // in case you want to use this, this cane make sense only if your input data comes out of tmps[]
+
+      if (hashconfig->opts_type & OPTS_TYPE_INIT2)
+      {
+        CL_rc = run_kernel (hashcat_ctx, device_param, KERN_RUN_INIT2, pws_cnt, false, 0);
+
+        if (CL_rc == -1) return -1;
+      }
+
+      if (hashconfig->opts_type & OPTS_TYPE_LOOP2)
+      {
+        u32 iter = hashes->salts_buf[salt_pos].salt_iter2;
+
+        u32 loop_step = device_param->kernel_loops;
+
+        for (u32 loop_pos = 0, slow_iteration = 0; loop_pos < iter; loop_pos += loop_step, slow_iteration++)
+        {
+          u32 loop_left = iter - loop_pos;
+
+          loop_left = MIN (loop_left, loop_step);
+
+          device_param->kernel_params_buf32[28] = loop_pos;
+          device_param->kernel_params_buf32[29] = loop_left;
+
+          CL_rc = run_kernel (hashcat_ctx, device_param, KERN_RUN_LOOP2, pws_cnt, true, slow_iteration);
+
+          if (CL_rc == -1) return -1;
+
+          //bug?
+          //while (status_ctx->run_thread_level2 == false) break;
+          if (status_ctx->run_thread_level2 == false) break;
+        }
+      }
+    }
+
+    if (run_comp == true)
+    {
+      if (hashconfig->opts_type & OPTS_TYPE_DEEP_COMP_KERNEL)
+      {
+        const u32 loops_cnt = hashes->salts_buf[salt_pos].digests_cnt;
+
+        for (u32 loops_pos = 0; loops_pos < loops_cnt; loops_pos++)
+        {
+          device_param->kernel_params_buf32[28] = loops_pos;
+          device_param->kernel_params_buf32[29] = loops_cnt;
+
+          const u32 deep_comp_kernel = module_ctx->module_deep_comp_kernel (hashes, salt_pos, loops_pos);
+
+          CL_rc = run_kernel (hashcat_ctx, device_param, deep_comp_kernel, pws_cnt, false, 0);
+
+          if (CL_rc == -1) return -1;
+
+          if (status_ctx->run_thread_level2 == false) break;
+        }
+      }
+      else
+      {
+        CL_rc = run_kernel (hashcat_ctx, device_param, KERN_RUN_3, pws_cnt, false, 0);
+
+        if (CL_rc == -1) return -1;
+      }
+    }
+  }
+
+  return 0;
+}
+
+void rebuild_pws_compressed_append (hc_device_param_t *device_param, const u64 pws_cnt, const u8 chr)
+{
+  // this function is used if we have to modify the compressed pws buffer in order to
+  // append some data to each password candidate
+
+  u32      *tmp_pws_comp = (u32 *)      hcmalloc (device_param->size_pws_comp);
+  pw_idx_t *tmp_pws_idx  = (pw_idx_t *) hcmalloc (device_param->size_pws_idx);
+
+  for (u32 i = 0; i < pws_cnt; i++)
+  {
+    pw_idx_t *pw_idx_src = device_param->pws_idx + i;
+    pw_idx_t *pw_idx_dst = tmp_pws_idx + i;
+
+    const u32 src_off = pw_idx_src->off;
+    const u32 src_len = pw_idx_src->len;
+
+    u8 buf[256];
+
+    memcpy (buf, device_param->pws_comp + src_off, src_len);
+
+    buf[src_len] = chr;
+
+    const u32 dst_len = src_len + 1;
+
+    const u32 dst_pw_len4 = (dst_len + 3) & ~3; // round up to multiple of 4
+
+    const u32 dst_pw_len4_cnt = dst_pw_len4 / 4;
+
+    pw_idx_dst->cnt = dst_pw_len4_cnt;
+    pw_idx_dst->len = src_len; // this is intenionally! src_len can not be dst_len, we dont want the kernel to think 0x80 is part of the password
+
+    u8 *dst = (u8 *) (tmp_pws_comp + pw_idx_dst->off);
+
+    memcpy (dst, buf, dst_len);
+
+    memset (dst + dst_len, 0, dst_pw_len4 - dst_len);
+
+    // prepare next element
+
+    pw_idx_t *pw_idx_dst_next = pw_idx_dst + 1;
+
+    pw_idx_dst_next->off = pw_idx_dst->off + pw_idx_dst->cnt;
+  }
+
+  memcpy (device_param->pws_comp, tmp_pws_comp, device_param->size_pws_comp);
+  memcpy (device_param->pws_idx,  tmp_pws_idx,  device_param->size_pws_idx);
+
+  hcfree (tmp_pws_comp);
+  hcfree (tmp_pws_idx);
+}
+
+int run_cuda_kernel_atinit (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, CUdeviceptr buf, const u64 num)
+{
+  u64 num_elements = num;
+
+  device_param->kernel_params_atinit[0]       = (void *) &buf;
+  device_param->kernel_params_atinit_buf64[1] = num_elements;
+
+  const u64 kernel_threads = device_param->kernel_wgs_atinit;
+
+  num_elements = CEILDIV (num_elements, kernel_threads);
+
+  CUfunction function = device_param->cuda_function_atinit;
+
+  const int rc_cuLaunchKernel = hc_cuLaunchKernel (hashcat_ctx, function, num_elements, 1, 1, kernel_threads, 1, 1, 0, device_param->cuda_stream, device_param->kernel_params_atinit, NULL);
+
+  if (rc_cuLaunchKernel == -1) return -1;
+
+  const int rc_cuStreamSynchronize = hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream);
+
+  if (rc_cuStreamSynchronize == -1) return -1;
+
+  return 0;
+}
+
+int run_cuda_kernel_memset (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, CUdeviceptr buf, const u32 value, const u64 size)
+{
+  const u64 num16d = size / 16;
+  const u64 num16m = size % 16;
+
+  if (num16d)
+  {
+    device_param->kernel_params_memset[0]       = (void *) &buf;
+    device_param->kernel_params_memset_buf32[1] = value;
+    device_param->kernel_params_memset_buf64[2] = num16d;
+
+    const u64 kernel_threads = device_param->kernel_wgs_memset;
+
+    u64 num_elements = num16d;
+
+    num_elements = CEILDIV (num_elements, kernel_threads);
+
+    CUfunction function = device_param->cuda_function_memset;
+
+    //CU_rc = hc_clSetKernelArg (hashcat_ctx, kernel, 0, sizeof (cl_mem),   (void *) &buf);                         if (CU_rc == -1) return -1;
+    //CU_rc = hc_clSetKernelArg (hashcat_ctx, kernel, 1, sizeof (cl_uint),  device_param->kernel_params_memset[1]); if (CU_rc == -1) return -1;
+    //CU_rc = hc_clSetKernelArg (hashcat_ctx, kernel, 2, sizeof (cl_ulong), device_param->kernel_params_memset[2]); if (CU_rc == -1) return -1;
+
+    //const size_t global_work_size[3] = { num_elements,   1, 1 };
+    //const size_t local_work_size[3]  = { kernel_threads, 1, 1 };
+
+    const int rc_cuLaunchKernel = hc_cuLaunchKernel (hashcat_ctx, function, num_elements, 1, 1, kernel_threads, 1, 1, 0, device_param->cuda_stream, device_param->kernel_params_memset, NULL);
+
+    if (rc_cuLaunchKernel == -1) return -1;
+
+    const int rc_cuStreamSynchronize = hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream);
+
+    if (rc_cuStreamSynchronize == -1) return -1;
+  }
+
+  if (num16m)
+  {
+    u32 tmp[4];
+
+    tmp[0] = value;
+    tmp[1] = value;
+    tmp[2] = value;
+    tmp[3] = value;
+
+    // Apparently are allowed to do this: https://devtalk.nvidia.com/default/topic/761515/how-to-copy-to-device-memory-with-offset-/
+
+    const int rc_cuMemcpyHtoD = hc_cuMemcpyHtoD (hashcat_ctx, buf + (num16d * 16), tmp, num16m);
+
+    if (rc_cuMemcpyHtoD == -1) return -1;
+  }
+
+  return 0;
+}
+
+int run_cuda_kernel_bzero (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, CUdeviceptr buf, const u64 size)
+{
+  return run_cuda_kernel_memset (hashcat_ctx, device_param, buf, 0, size);
+}
+
+int run_opencl_kernel_atinit (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, cl_mem buf, const u64 num)
+{
+  u64 num_elements = num;
+
+  device_param->kernel_params_atinit_buf64[1] = num_elements;
+
+  const u64 kernel_threads = device_param->kernel_wgs_atinit;
+
+  num_elements = round_up_multiple_64 (num_elements, kernel_threads);
+
+  cl_kernel kernel = device_param->opencl_kernel_atinit;
+
+  const size_t global_work_size[3] = { num_elements,    1, 1 };
+  const size_t local_work_size[3]  = { kernel_threads,  1, 1 };
+
+  int CL_rc;
+
+  CL_rc = hc_clSetKernelArg (hashcat_ctx, kernel, 0, sizeof (cl_mem), (void *) &buf);
+
+  if (CL_rc == -1) return -1;
+
+  CL_rc = hc_clSetKernelArg (hashcat_ctx, kernel, 1, sizeof (cl_ulong), device_param->kernel_params_atinit[1]);
+
+  if (CL_rc == -1) return -1;
+
+  CL_rc = hc_clEnqueueNDRangeKernel (hashcat_ctx, device_param->opencl_command_queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL);
+
+  if (CL_rc == -1) return -1;
+
+  CL_rc = hc_clFlush (hashcat_ctx, device_param->opencl_command_queue);
+
+  if (CL_rc == -1) return -1;
+
+  CL_rc = hc_clFinish (hashcat_ctx, device_param->opencl_command_queue);
+
+  if (CL_rc == -1) return -1;
+
+  return 0;
+}
+
+int run_opencl_kernel_memset (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, cl_mem buf, const u32 value, const u64 size)
+{
+  const u64 num16d = size / 16;
+  const u64 num16m = size % 16;
+
+  if (num16d)
+  {
+    device_param->kernel_params_memset_buf32[1] = value;
+    device_param->kernel_params_memset_buf64[2] = num16d;
+
+    const u64 kernel_threads = device_param->kernel_wgs_memset;
+
+    u64 num_elements = num16d;
+
+    num_elements = round_up_multiple_64 (num_elements, kernel_threads);
+
+    cl_kernel kernel = device_param->opencl_kernel_memset;
+
+    int CL_rc;
+
+    CL_rc = hc_clSetKernelArg (hashcat_ctx, kernel, 0, sizeof (cl_mem),   (void *) &buf);                         if (CL_rc == -1) return -1;
+    CL_rc = hc_clSetKernelArg (hashcat_ctx, kernel, 1, sizeof (cl_uint),  device_param->kernel_params_memset[1]); if (CL_rc == -1) return -1;
+    CL_rc = hc_clSetKernelArg (hashcat_ctx, kernel, 2, sizeof (cl_ulong), device_param->kernel_params_memset[2]); if (CL_rc == -1) return -1;
+
+    const size_t global_work_size[3] = { num_elements,   1, 1 };
+    const size_t local_work_size[3]  = { kernel_threads, 1, 1 };
+
+    CL_rc = hc_clEnqueueNDRangeKernel (hashcat_ctx, device_param->opencl_command_queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL);
+
+    if (CL_rc == -1) return -1;
+
+    CL_rc = hc_clFlush (hashcat_ctx, device_param->opencl_command_queue);
+
+    if (CL_rc == -1) return -1;
+
+    CL_rc = hc_clFinish (hashcat_ctx, device_param->opencl_command_queue);
+
+    if (CL_rc == -1) return -1;
+  }
+
+  if (num16m)
+  {
+    u32 tmp[4];
+
+    tmp[0] = value;
+    tmp[1] = value;
+    tmp[2] = value;
+    tmp[3] = value;
+
+    int CL_rc;
+
+    CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, buf, CL_TRUE, num16d * 16, num16m, tmp, 0, NULL, NULL);
+
+    if (CL_rc == -1) return -1;
+  }
+
+  return 0;
+}
+
+int run_opencl_kernel_bzero (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, cl_mem buf, const u64 size)
+{
+  return run_opencl_kernel_memset (hashcat_ctx, device_param, buf, 0, size);
+}
+
+int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u32 kern_run, const u64 num, const u32 event_update, const u32 iteration)
+{
+  const hashconfig_t   *hashconfig   = hashcat_ctx->hashconfig;
+  const status_ctx_t   *status_ctx   = hashcat_ctx->status_ctx;
+  const user_options_t *user_options = hashcat_ctx->user_options;
+
+  u64 kernel_threads = 0;
+
+  switch (kern_run)
+  {
+    case KERN_RUN_1:      kernel_threads  = device_param->kernel_wgs1;      break;
+    case KERN_RUN_12:     kernel_threads  = device_param->kernel_wgs12;     break;
+    case KERN_RUN_2:      kernel_threads  = device_param->kernel_wgs2;      break;
+    case KERN_RUN_23:     kernel_threads  = device_param->kernel_wgs23;     break;
+    case KERN_RUN_3:      kernel_threads  = device_param->kernel_wgs3;      break;
+    case KERN_RUN_4:      kernel_threads  = device_param->kernel_wgs4;      break;
+    case KERN_RUN_INIT2:  kernel_threads  = device_param->kernel_wgs_init2; break;
+    case KERN_RUN_LOOP2:  kernel_threads  = device_param->kernel_wgs_loop2; break;
+    case KERN_RUN_AUX1:   kernel_threads  = device_param->kernel_wgs_aux1;  break;
+    case KERN_RUN_AUX2:   kernel_threads  = device_param->kernel_wgs_aux2;  break;
+    case KERN_RUN_AUX3:   kernel_threads  = device_param->kernel_wgs_aux3;  break;
+    case KERN_RUN_AUX4:   kernel_threads  = device_param->kernel_wgs_aux4;  break;
+  }
+
+  kernel_threads = MIN (kernel_threads, device_param->kernel_threads);
+
+  device_param->kernel_params_buf64[34] = num;
+
+  u64 num_elements = num;
+
+  if (device_param->is_cuda == true)
+  {
+    CUfunction cuda_function = NULL;
+
+    if (device_param->is_cuda == true)
+    {
+      switch (kern_run)
+      {
+        case KERN_RUN_1:      cuda_function = device_param->cuda_function1;      break;
+        case KERN_RUN_12:     cuda_function = device_param->cuda_function12;     break;
+        case KERN_RUN_2:      cuda_function = device_param->cuda_function2;      break;
+        case KERN_RUN_23:     cuda_function = device_param->cuda_function23;     break;
+        case KERN_RUN_3:      cuda_function = device_param->cuda_function3;      break;
+        case KERN_RUN_4:      cuda_function = device_param->cuda_function4;      break;
+        case KERN_RUN_INIT2:  cuda_function = device_param->cuda_function_init2; break;
+        case KERN_RUN_LOOP2:  cuda_function = device_param->cuda_function_loop2; break;
+        case KERN_RUN_AUX1:   cuda_function = device_param->cuda_function_aux1;  break;
+        case KERN_RUN_AUX2:   cuda_function = device_param->cuda_function_aux2;  break;
+        case KERN_RUN_AUX3:   cuda_function = device_param->cuda_function_aux3;  break;
+        case KERN_RUN_AUX4:   cuda_function = device_param->cuda_function_aux4;  break;
+      }
+    }
+
+    num_elements = CEILDIV (num_elements, kernel_threads);
+
+    if ((hashconfig->opts_type & OPTS_TYPE_PT_BITSLICE) && (user_options->attack_mode == ATTACK_MODE_BF))
+    {
+      const int rc_cuEventRecord1 = hc_cuEventRecord (hashcat_ctx, device_param->cuda_event1, device_param->cuda_stream);
+
+      if (rc_cuEventRecord1 == -1) return -1;
+
+      const int rc_cuLaunchKernel = hc_cuLaunchKernel (hashcat_ctx, cuda_function, num_elements, 32, 1, kernel_threads, 1, 1, 0, device_param->cuda_stream, device_param->kernel_params, NULL);
+
+      if (rc_cuLaunchKernel == -1) return -1;
+
+      const int rc_cuEventRecord2 = hc_cuEventRecord (hashcat_ctx, device_param->cuda_event2, device_param->cuda_stream);
+
+      if (rc_cuEventRecord2 == -1) return -1;
+    }
+    else
+    {
+      if (kern_run == KERN_RUN_1)
+      {
+        if (hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_SIMD_INIT)
+        {
+          num_elements = CEILDIV (num_elements, device_param->vector_width);
+        }
+      }
+      else if (kern_run == KERN_RUN_2)
+      {
+        if (hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_SIMD_LOOP)
+        {
+          num_elements = CEILDIV (num_elements, device_param->vector_width);
+        }
+      }
+      else if (kern_run == KERN_RUN_3)
+      {
+        if (hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_SIMD_COMP)
+        {
+          num_elements = CEILDIV (num_elements, device_param->vector_width);
+        }
+      }
+
+      const int rc_cuEventRecord1 = hc_cuEventRecord (hashcat_ctx, device_param->cuda_event1, device_param->cuda_stream);
+
+      if (rc_cuEventRecord1 == -1) return -1;
+
+      const int rc_cuLaunchKernel = hc_cuLaunchKernel (hashcat_ctx, cuda_function, num_elements, 1, 1, kernel_threads, 1, 1, 0, device_param->cuda_stream, device_param->kernel_params, NULL);
+
+      if (rc_cuLaunchKernel == -1) return -1;
+
+      const int rc_cuEventRecord2 = hc_cuEventRecord (hashcat_ctx, device_param->cuda_event2, device_param->cuda_stream);
+
+      if (rc_cuEventRecord2 == -1) return -1;
+    }
+
+    const int rc_cuStreamSynchronize = hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream);
+
+    if (rc_cuStreamSynchronize == -1) return -1;
+
+    const int rc_cuEventSynchronize = hc_cuEventSynchronize (hashcat_ctx, device_param->cuda_event2);
+
+    if (rc_cuEventSynchronize == -1) return -1;
+
+    float exec_ms;
+
+    const int rc_cuEventElapsedTime = hc_cuEventElapsedTime (hashcat_ctx, &exec_ms, device_param->cuda_event1, device_param->cuda_event2);
+
+    if (rc_cuEventElapsedTime == -1) return -1;
+
+    if (event_update)
+    {
+      u32 exec_pos = device_param->exec_pos;
+
+      device_param->exec_msec[exec_pos] = exec_ms;
+
+      exec_pos++;
+
+      if (exec_pos == EXEC_CACHE)
+      {
+        exec_pos = 0;
+      }
+
+      device_param->exec_pos = exec_pos;
+    }
+  }
+
+  if (device_param->is_opencl == true)
+  {
+    int CL_rc;
+
+    cl_kernel opencl_kernel = NULL;
+
+    if (device_param->is_opencl == true)
+    {
+      switch (kern_run)
+      {
+        case KERN_RUN_1:      opencl_kernel = device_param->opencl_kernel1;      break;
+        case KERN_RUN_12:     opencl_kernel = device_param->opencl_kernel12;     break;
+        case KERN_RUN_2:      opencl_kernel = device_param->opencl_kernel2;      break;
+        case KERN_RUN_23:     opencl_kernel = device_param->opencl_kernel23;     break;
+        case KERN_RUN_3:      opencl_kernel = device_param->opencl_kernel3;      break;
+        case KERN_RUN_4:      opencl_kernel = device_param->opencl_kernel4;      break;
+        case KERN_RUN_INIT2:  opencl_kernel = device_param->opencl_kernel_init2; break;
+        case KERN_RUN_LOOP2:  opencl_kernel = device_param->opencl_kernel_loop2; break;
+        case KERN_RUN_AUX1:   opencl_kernel = device_param->opencl_kernel_aux1;  break;
+        case KERN_RUN_AUX2:   opencl_kernel = device_param->opencl_kernel_aux2;  break;
+        case KERN_RUN_AUX3:   opencl_kernel = device_param->opencl_kernel_aux3;  break;
+        case KERN_RUN_AUX4:   opencl_kernel = device_param->opencl_kernel_aux4;  break;
+      }
+    }
+
+    for (u32 i = 0; i <= 23; i++)
+    {
+      CL_rc = hc_clSetKernelArg (hashcat_ctx, opencl_kernel, i, sizeof (cl_mem), device_param->kernel_params[i]);
+
+      if (CL_rc == -1) return -1;
+    }
+
+    for (u32 i = 24; i <= 33; i++)
+    {
+      CL_rc = hc_clSetKernelArg (hashcat_ctx, opencl_kernel, i, sizeof (cl_uint), device_param->kernel_params[i]);
+
+      if (CL_rc == -1) return -1;
+    }
+
+    for (u32 i = 34; i <= 34; i++)
+    {
+      CL_rc = hc_clSetKernelArg (hashcat_ctx, opencl_kernel, i, sizeof (cl_ulong), device_param->kernel_params[i]);
+
+      if (CL_rc == -1) return -1;
+    }
+
+    num_elements = round_up_multiple_64 (num_elements, kernel_threads);
+
+    cl_event opencl_event;
+
+    if ((hashconfig->opts_type & OPTS_TYPE_PT_BITSLICE) && (user_options->attack_mode == ATTACK_MODE_BF))
+    {
+      const size_t global_work_size[3] = { num_elements,  32, 1 };
+      const size_t local_work_size[3]  = { kernel_threads, 1, 1 };
+
+      CL_rc = hc_clEnqueueNDRangeKernel (hashcat_ctx, device_param->opencl_command_queue, opencl_kernel, 2, NULL, global_work_size, local_work_size, 0, NULL, &opencl_event);
+
+      if (CL_rc == -1) return -1;
+    }
+    else
+    {
+      if (kern_run == KERN_RUN_1)
+      {
+        if (hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_SIMD_INIT)
+        {
+          num_elements = CEILDIV (num_elements, device_param->vector_width);
+        }
+      }
+      else if (kern_run == KERN_RUN_2)
+      {
+        if (hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_SIMD_LOOP)
+        {
+          num_elements = CEILDIV (num_elements, device_param->vector_width);
+        }
+      }
+      else if (kern_run == KERN_RUN_3)
+      {
+        if (hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_SIMD_COMP)
+        {
+          num_elements = CEILDIV (num_elements, device_param->vector_width);
+        }
+      }
+
+      num_elements = round_up_multiple_64 (num_elements, kernel_threads);
+
+      const size_t global_work_size[3] = { num_elements,   1, 1 };
+      const size_t local_work_size[3]  = { kernel_threads, 1, 1 };
+
+      CL_rc = hc_clEnqueueNDRangeKernel (hashcat_ctx, device_param->opencl_command_queue, opencl_kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, &opencl_event);
+
+      if (CL_rc == -1) return -1;
+    }
+
+    CL_rc = hc_clFlush (hashcat_ctx, device_param->opencl_command_queue);
+
+    if (CL_rc == -1) return -1;
+
+    // spin damper section
+
+    const u32 iterationm = iteration % EXPECTED_ITERATIONS;
+
+    cl_int opencl_event_status;
+
+    size_t param_value_size_ret;
+
+    CL_rc = hc_clGetEventInfo (hashcat_ctx, opencl_event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof (opencl_event_status), &opencl_event_status, &param_value_size_ret);
+
+    if (CL_rc == -1) return -1;
+
+    if (device_param->spin_damp > 0)
+    {
+      double spin_total = device_param->spin_damp;
+
+      while (opencl_event_status != CL_COMPLETE)
+      {
+        if (status_ctx->devices_status == STATUS_RUNNING)
+        {
+          switch (kern_run)
+          {
+            case KERN_RUN_1:      if (device_param->exec_us_prev1[iterationm]      > 0) usleep ((useconds_t) (device_param->exec_us_prev1[iterationm]      * device_param->spin_damp)); break;
+            case KERN_RUN_2:      if (device_param->exec_us_prev2[iterationm]      > 0) usleep ((useconds_t) (device_param->exec_us_prev2[iterationm]      * device_param->spin_damp)); break;
+            case KERN_RUN_3:      if (device_param->exec_us_prev3[iterationm]      > 0) usleep ((useconds_t) (device_param->exec_us_prev3[iterationm]      * device_param->spin_damp)); break;
+            case KERN_RUN_4:      if (device_param->exec_us_prev4[iterationm]      > 0) usleep ((useconds_t) (device_param->exec_us_prev4[iterationm]      * device_param->spin_damp)); break;
+            case KERN_RUN_INIT2:  if (device_param->exec_us_prev_init2[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_init2[iterationm] * device_param->spin_damp)); break;
+            case KERN_RUN_LOOP2:  if (device_param->exec_us_prev_loop2[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_loop2[iterationm] * device_param->spin_damp)); break;
+            case KERN_RUN_AUX1:   if (device_param->exec_us_prev_aux1[iterationm]  > 0) usleep ((useconds_t) (device_param->exec_us_prev_aux1[iterationm]  * device_param->spin_damp)); break;
+            case KERN_RUN_AUX2:   if (device_param->exec_us_prev_aux2[iterationm]  > 0) usleep ((useconds_t) (device_param->exec_us_prev_aux2[iterationm]  * device_param->spin_damp)); break;
+            case KERN_RUN_AUX3:   if (device_param->exec_us_prev_aux3[iterationm]  > 0) usleep ((useconds_t) (device_param->exec_us_prev_aux3[iterationm]  * device_param->spin_damp)); break;
+            case KERN_RUN_AUX4:   if (device_param->exec_us_prev_aux4[iterationm]  > 0) usleep ((useconds_t) (device_param->exec_us_prev_aux4[iterationm]  * device_param->spin_damp)); break;
+          }
+        }
+        else
+        {
+          // we were told to be nice
+
+          sleep (0);
+        }
+
+        CL_rc = hc_clGetEventInfo (hashcat_ctx, opencl_event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof (opencl_event_status), &opencl_event_status, &param_value_size_ret);
+
+        if (CL_rc == -1) return -1;
+
+        spin_total += device_param->spin_damp;
+
+        if (spin_total > 1) break;
+      }
+    }
+
+    CL_rc = hc_clWaitForEvents (hashcat_ctx, 1, &opencl_event);
+
+    if (CL_rc == -1) return -1;
+
+    cl_ulong time_start;
+    cl_ulong time_end;
+
+    CL_rc = hc_clGetEventProfilingInfo (hashcat_ctx, opencl_event, CL_PROFILING_COMMAND_START, sizeof (time_start), &time_start, NULL); if (CL_rc == -1) return -1;
+    CL_rc = hc_clGetEventProfilingInfo (hashcat_ctx, opencl_event, CL_PROFILING_COMMAND_END,   sizeof (time_end),   &time_end,   NULL); if (CL_rc == -1) return -1;
+
+    const double exec_us = (double) (time_end - time_start) / 1000;
+
+    if (device_param->spin_damp > 0)
+    {
+      if (status_ctx->devices_status == STATUS_RUNNING)
+      {
+        switch (kern_run)
+        {
+          case KERN_RUN_1:      device_param->exec_us_prev1[iterationm]      = exec_us; break;
+          case KERN_RUN_2:      device_param->exec_us_prev2[iterationm]      = exec_us; break;
+          case KERN_RUN_3:      device_param->exec_us_prev3[iterationm]      = exec_us; break;
+          case KERN_RUN_4:      device_param->exec_us_prev4[iterationm]      = exec_us; break;
+          case KERN_RUN_INIT2:  device_param->exec_us_prev_init2[iterationm] = exec_us; break;
+          case KERN_RUN_LOOP2:  device_param->exec_us_prev_loop2[iterationm] = exec_us; break;
+          case KERN_RUN_AUX1:   device_param->exec_us_prev_aux1[iterationm]  = exec_us; break;
+          case KERN_RUN_AUX2:   device_param->exec_us_prev_aux2[iterationm]  = exec_us; break;
+          case KERN_RUN_AUX3:   device_param->exec_us_prev_aux3[iterationm]  = exec_us; break;
+          case KERN_RUN_AUX4:   device_param->exec_us_prev_aux4[iterationm]  = exec_us; break;
+        }
+      }
+    }
+
+    if (event_update)
+    {
+      u32 exec_pos = device_param->exec_pos;
+
+      device_param->exec_msec[exec_pos] = exec_us / 1000;
+
+      exec_pos++;
+
+      if (exec_pos == EXEC_CACHE)
+      {
+        exec_pos = 0;
+      }
+
+      device_param->exec_pos = exec_pos;
+    }
+
+    CL_rc = hc_clReleaseEvent (hashcat_ctx, opencl_event);
+
+    if (CL_rc == -1) return -1;
+
+    CL_rc = hc_clFinish (hashcat_ctx, device_param->opencl_command_queue);
+
+    if (CL_rc == -1) return -1;
+  }
+
+  return 0;
+}
+
+int run_kernel_mp (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u32 kern_run, const u64 num)
+{
+  u64 kernel_threads = 0;
+
+  switch (kern_run)
+  {
+    case KERN_RUN_MP:   kernel_threads  = device_param->kernel_wgs_mp;    break;
+    case KERN_RUN_MP_R: kernel_threads  = device_param->kernel_wgs_mp_r;  break;
+    case KERN_RUN_MP_L: kernel_threads  = device_param->kernel_wgs_mp_l;  break;
+  }
+
+  u64 num_elements = num;
+
+  switch (kern_run)
+  {
+    case KERN_RUN_MP:   device_param->kernel_params_mp_buf64[8]   = num; break;
+    case KERN_RUN_MP_R: device_param->kernel_params_mp_r_buf64[8] = num; break;
+    case KERN_RUN_MP_L: device_param->kernel_params_mp_l_buf64[9] = num; break;
+  }
+
+  if (device_param->is_cuda == true)
+  {
+    CUfunction cuda_function = NULL;
+
+    void **cuda_args = NULL;
+
+    switch (kern_run)
+    {
+      case KERN_RUN_MP:   cuda_function = device_param->cuda_function_mp;
+                          cuda_args     = device_param->kernel_params_mp;
+                          break;
+      case KERN_RUN_MP_R: cuda_function = device_param->cuda_function_mp_r;
+                          cuda_args     = device_param->kernel_params_mp_r;
+                          break;
+      case KERN_RUN_MP_L: cuda_function = device_param->cuda_function_mp_l;
+                          cuda_args     = device_param->kernel_params_mp_l;
+                          break;
+    }
+
+    num_elements = CEILDIV (num_elements, kernel_threads);
+
+    const int rc_cuLaunchKernel = hc_cuLaunchKernel (hashcat_ctx, cuda_function, num_elements, 1, 1, kernel_threads, 1, 1, 0, device_param->cuda_stream, cuda_args, NULL);
+
+    if (rc_cuLaunchKernel == -1) return -1;
+
+    const int rc_cuStreamSynchronize = hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream);
+
+    if (rc_cuStreamSynchronize == -1) return -1;
+  }
+
+  if (device_param->is_opencl == true)
+  {
+    int CL_rc;
+
+    cl_kernel opencl_kernel = NULL;
+
+    switch (kern_run)
+    {
+      case KERN_RUN_MP:   opencl_kernel = device_param->opencl_kernel_mp;   break;
+      case KERN_RUN_MP_R: opencl_kernel = device_param->opencl_kernel_mp_r; break;
+      case KERN_RUN_MP_L: opencl_kernel = device_param->opencl_kernel_mp_l; break;
+    }
+
+    switch (kern_run)
+    {
+      case KERN_RUN_MP:   CL_rc = hc_clSetKernelArg (hashcat_ctx, opencl_kernel, 3, sizeof (cl_ulong), device_param->kernel_params_mp[3]);   if (CL_rc == -1) return -1;
+                          CL_rc = hc_clSetKernelArg (hashcat_ctx, opencl_kernel, 4, sizeof (cl_uint),  device_param->kernel_params_mp[4]);   if (CL_rc == -1) return -1;
+                          CL_rc = hc_clSetKernelArg (hashcat_ctx, opencl_kernel, 5, sizeof (cl_uint),  device_param->kernel_params_mp[5]);   if (CL_rc == -1) return -1;
+                          CL_rc = hc_clSetKernelArg (hashcat_ctx, opencl_kernel, 6, sizeof (cl_uint),  device_param->kernel_params_mp[6]);   if (CL_rc == -1) return -1;
+                          CL_rc = hc_clSetKernelArg (hashcat_ctx, opencl_kernel, 7, sizeof (cl_uint),  device_param->kernel_params_mp[7]);   if (CL_rc == -1) return -1;
+                          CL_rc = hc_clSetKernelArg (hashcat_ctx, opencl_kernel, 8, sizeof (cl_ulong), device_param->kernel_params_mp[8]);   if (CL_rc == -1) return -1;
+                          break;
+      case KERN_RUN_MP_R: CL_rc = hc_clSetKernelArg (hashcat_ctx, opencl_kernel, 3, sizeof (cl_ulong), device_param->kernel_params_mp_r[3]); if (CL_rc == -1) return -1;
+                          CL_rc = hc_clSetKernelArg (hashcat_ctx, opencl_kernel, 4, sizeof (cl_uint),  device_param->kernel_params_mp_r[4]); if (CL_rc == -1) return -1;
+                          CL_rc = hc_clSetKernelArg (hashcat_ctx, opencl_kernel, 5, sizeof (cl_uint),  device_param->kernel_params_mp_r[5]); if (CL_rc == -1) return -1;
+                          CL_rc = hc_clSetKernelArg (hashcat_ctx, opencl_kernel, 6, sizeof (cl_uint),  device_param->kernel_params_mp_r[6]); if (CL_rc == -1) return -1;
+                          CL_rc = hc_clSetKernelArg (hashcat_ctx, opencl_kernel, 7, sizeof (cl_uint),  device_param->kernel_params_mp_r[7]); if (CL_rc == -1) return -1;
+                          CL_rc = hc_clSetKernelArg (hashcat_ctx, opencl_kernel, 8, sizeof (cl_ulong), device_param->kernel_params_mp_r[8]); if (CL_rc == -1) return -1;
+                          break;
+      case KERN_RUN_MP_L: CL_rc = hc_clSetKernelArg (hashcat_ctx, opencl_kernel, 3, sizeof (cl_ulong), device_param->kernel_params_mp_l[3]); if (CL_rc == -1) return -1;
+                          CL_rc = hc_clSetKernelArg (hashcat_ctx, opencl_kernel, 4, sizeof (cl_uint),  device_param->kernel_params_mp_l[4]); if (CL_rc == -1) return -1;
+                          CL_rc = hc_clSetKernelArg (hashcat_ctx, opencl_kernel, 5, sizeof (cl_uint),  device_param->kernel_params_mp_l[5]); if (CL_rc == -1) return -1;
+                          CL_rc = hc_clSetKernelArg (hashcat_ctx, opencl_kernel, 6, sizeof (cl_uint),  device_param->kernel_params_mp_l[6]); if (CL_rc == -1) return -1;
+                          CL_rc = hc_clSetKernelArg (hashcat_ctx, opencl_kernel, 7, sizeof (cl_uint),  device_param->kernel_params_mp_l[7]); if (CL_rc == -1) return -1;
+                          CL_rc = hc_clSetKernelArg (hashcat_ctx, opencl_kernel, 8, sizeof (cl_uint),  device_param->kernel_params_mp_l[8]); if (CL_rc == -1) return -1;
+                          CL_rc = hc_clSetKernelArg (hashcat_ctx, opencl_kernel, 9, sizeof (cl_ulong), device_param->kernel_params_mp_l[9]); if (CL_rc == -1) return -1;
+                          break;
+    }
+
+    num_elements = round_up_multiple_64 (num_elements, kernel_threads);
+
+    const size_t global_work_size[3] = { num_elements,   1, 1 };
+    const size_t local_work_size[3]  = { kernel_threads, 1, 1 };
+
+    CL_rc = hc_clEnqueueNDRangeKernel (hashcat_ctx, device_param->opencl_command_queue, opencl_kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL);
+
+    if (CL_rc == -1) return -1;
+
+    CL_rc = hc_clFlush (hashcat_ctx, device_param->opencl_command_queue);
+
+    if (CL_rc == -1) return -1;
+
+    CL_rc = hc_clFinish (hashcat_ctx, device_param->opencl_command_queue);
+
+    if (CL_rc == -1) return -1;
+  }
+
+  return 0;
+}
+
+int run_kernel_tm (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param)
+{
+  const u64 num_elements = 1024; // fixed
+
+  const u64 kernel_threads = MIN (num_elements, device_param->kernel_wgs_tm);
+
+  if (device_param->is_cuda == true)
+  {
+    CUfunction cuda_function = device_param->cuda_function_tm;
+
+    const int rc_cuLaunchKernel = hc_cuLaunchKernel (hashcat_ctx, cuda_function, num_elements / kernel_threads, 1, 1, kernel_threads, 1, 1, 0, device_param->cuda_stream, device_param->kernel_params_tm, NULL);
+
+    if (rc_cuLaunchKernel == -1) return -1;
+
+    const int rc_cuStreamSynchronize = hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream);
+
+    if (rc_cuStreamSynchronize == -1) return -1;
+  }
+
+  if (device_param->is_opencl == true)
+  {
+    cl_kernel cuda_kernel = device_param->opencl_kernel_tm;
+
+    const size_t global_work_size[3] = { num_elements,    1, 1 };
+    const size_t local_work_size[3]  = { kernel_threads,  1, 1 };
+
+    int CL_rc;
+
+    CL_rc = hc_clEnqueueNDRangeKernel (hashcat_ctx, device_param->opencl_command_queue, cuda_kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL);
+
+    if (CL_rc == -1) return -1;
+
+    CL_rc = hc_clFlush (hashcat_ctx, device_param->opencl_command_queue);
+
+    if (CL_rc == -1) return -1;
+
+    CL_rc = hc_clFinish (hashcat_ctx, device_param->opencl_command_queue);
+
+    if (CL_rc == -1) return -1;
+  }
+
+  return 0;
+}
+
+int run_kernel_amp (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u64 num)
+{
+  device_param->kernel_params_amp_buf64[6] = num;
+
+  u64 num_elements = num;
+
+  const u64 kernel_threads = device_param->kernel_wgs_amp;
+
+  if (device_param->is_cuda == true)
+  {
+    num_elements = CEILDIV (num_elements, kernel_threads);
+
+    CUfunction cuda_function = device_param->cuda_function_amp;
+
+    const int rc_cuLaunchKernel = hc_cuLaunchKernel (hashcat_ctx, cuda_function, num_elements, 1, 1, kernel_threads, 1, 1, 0, device_param->cuda_stream, device_param->kernel_params_amp, NULL);
+
+    if (rc_cuLaunchKernel == -1) return -1;
+
+    const int rc_cuStreamSynchronize = hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream);
+
+    if (rc_cuStreamSynchronize == -1) return -1;
+  }
+
+  if (device_param->is_opencl == true)
+  {
+    num_elements = round_up_multiple_64 (num_elements, kernel_threads);
+
+    cl_kernel opencl_kernel = device_param->opencl_kernel_amp;
+
+    int CL_rc;
+
+    CL_rc = hc_clSetKernelArg (hashcat_ctx, opencl_kernel, 6, sizeof (cl_ulong), device_param->kernel_params_amp[6]);
+
+    if (CL_rc == -1) return -1;
+
+    const size_t global_work_size[3] = { num_elements,    1, 1 };
+    const size_t local_work_size[3]  = { kernel_threads,  1, 1 };
+
+    CL_rc = hc_clEnqueueNDRangeKernel (hashcat_ctx, device_param->opencl_command_queue, opencl_kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL);
+
+    if (CL_rc == -1) return -1;
+
+    CL_rc = hc_clFlush (hashcat_ctx, device_param->opencl_command_queue);
+
+    if (CL_rc == -1) return -1;
+
+    CL_rc = hc_clFinish (hashcat_ctx, device_param->opencl_command_queue);
+
+    if (CL_rc == -1) return -1;
+  }
+
+  return 0;
+}
+
+int run_kernel_decompress (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u64 num)
+{
+  device_param->kernel_params_decompress_buf64[3] = num;
+
+  u64 num_elements = num;
+
+  const u64 kernel_threads = device_param->kernel_wgs_decompress;
+
+  if (device_param->is_cuda == true)
+  {
+    num_elements = CEILDIV (num_elements, kernel_threads);
+
+    CUfunction cuda_function = device_param->cuda_function_decompress;
+
+    const int rc_cuLaunchKernel = hc_cuLaunchKernel (hashcat_ctx, cuda_function, num_elements, 1, 1, kernel_threads, 1, 1, 0, device_param->cuda_stream, device_param->kernel_params_decompress, NULL);
+
+    if (rc_cuLaunchKernel == -1) return -1;
+
+    const int rc_cuStreamSynchronize = hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream);
+
+    if (rc_cuStreamSynchronize == -1) return -1;
+  }
+
+  if (device_param->is_opencl == true)
+  {
+    num_elements = round_up_multiple_64 (num_elements, kernel_threads);
+
+    cl_kernel opencl_kernel = device_param->opencl_kernel_decompress;
+
+    const size_t global_work_size[3] = { num_elements,    1, 1 };
+    const size_t local_work_size[3]  = { kernel_threads,  1, 1 };
+
+    int CL_rc;
+
+    CL_rc = hc_clSetKernelArg (hashcat_ctx, opencl_kernel, 3, sizeof (cl_ulong), device_param->kernel_params_decompress[3]);
+
+    if (CL_rc == -1) return -1;
+
+    CL_rc = hc_clEnqueueNDRangeKernel (hashcat_ctx, device_param->opencl_command_queue, opencl_kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL);
+
+    if (CL_rc == -1) return -1;
+
+    CL_rc = hc_clFlush (hashcat_ctx, device_param->opencl_command_queue);
+
+    if (CL_rc == -1) return -1;
+
+    CL_rc = hc_clFinish (hashcat_ctx, device_param->opencl_command_queue);
+
+    if (CL_rc == -1) return -1;
+  }
+
+  return 0;
+}
+
+int run_copy (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u64 pws_cnt)
+{
+  combinator_ctx_t     *combinator_ctx      = hashcat_ctx->combinator_ctx;
+  hashconfig_t         *hashconfig          = hashcat_ctx->hashconfig;
+  user_options_t       *user_options        = hashcat_ctx->user_options;
+  user_options_extra_t *user_options_extra  = hashcat_ctx->user_options_extra;
+
+  // init speed timer
+
+  #if defined (_WIN)
+  if (device_param->timer_speed.QuadPart == 0)
+  {
+    hc_timer_set (&device_param->timer_speed);
+  }
+  #else
+  if (device_param->timer_speed.tv_sec == 0)
+  {
+    hc_timer_set (&device_param->timer_speed);
+  }
+  #endif
+
+  if (user_options->slow_candidates == true)
+  {
+    if (device_param->is_cuda == true)
+    {
+      int CU_rc;
+
+      CU_rc = hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_pws_idx, device_param->pws_idx, pws_cnt * sizeof (pw_idx_t));
+
+      if (CU_rc == -1) return -1;
+
+      const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt;
+
+      const u32 off = pw_idx->off;
+
+      if (off)
+      {
+        CU_rc = hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_pws_comp_buf, device_param->pws_comp, off * sizeof (u32));
+
+        if (CU_rc == -1) return -1;
+      }
+    }
+
+    if (device_param->is_opencl == true)
+    {
+      int CL_rc;
+
+      CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_idx, CL_TRUE, 0, pws_cnt * sizeof (pw_idx_t), device_param->pws_idx, 0, NULL, NULL);
+
+      if (CL_rc == -1) return -1;
+
+      const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt;
+
+      const u32 off = pw_idx->off;
+
+      if (off)
+      {
+        CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_comp_buf, CL_TRUE, 0, off * sizeof (u32), device_param->pws_comp, 0, NULL, NULL);
+
+        if (CL_rc == -1) return -1;
+      }
+    }
+
+    const int rc_kernel_decompress = run_kernel_decompress (hashcat_ctx, device_param, pws_cnt);
+
+    if (rc_kernel_decompress == -1) return -1;
+  }
+  else
+  {
+    if (user_options_extra->attack_kern == ATTACK_KERN_STRAIGHT)
+    {
+      if (device_param->is_cuda == true)
+      {
+        int CU_rc;
+
+        CU_rc = hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_pws_idx, device_param->pws_idx, pws_cnt * sizeof (pw_idx_t));
+
+        if (CU_rc == -1) return -1;
+
+        const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt;
+
+        const u32 off = pw_idx->off;
+
+        if (off)
+        {
+          CU_rc = hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_pws_comp_buf, device_param->pws_comp, off * sizeof (u32));
+
+          if (CU_rc == -1) return -1;
+        }
+      }
+
+      if (device_param->is_opencl == true)
+      {
+        int CL_rc;
+
+        CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_idx, CL_TRUE, 0, pws_cnt * sizeof (pw_idx_t), device_param->pws_idx, 0, NULL, NULL);
+
+        if (CL_rc == -1) return -1;
+
+        const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt;
+
+        const u32 off = pw_idx->off;
+
+        if (off)
+        {
+          CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_comp_buf, CL_TRUE, 0, off * sizeof (u32), device_param->pws_comp, 0, NULL, NULL);
+
+          if (CL_rc == -1) return -1;
+        }
+      }
+
+      const int rc_kernel_decompress = run_kernel_decompress (hashcat_ctx, device_param, pws_cnt);
+
+      if (rc_kernel_decompress == -1) return -1;
+    }
+    else if (user_options_extra->attack_kern == ATTACK_KERN_COMBI)
+    {
+      if (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL)
+      {
+        if (user_options->attack_mode == ATTACK_MODE_COMBI)
+        {
+          if (combinator_ctx->combs_mode == COMBINATOR_MODE_BASE_RIGHT)
+          {
+            if (hashconfig->opts_type & OPTS_TYPE_PT_ADD01)
+            {
+              rebuild_pws_compressed_append (device_param, pws_cnt, 0x01);
+            }
+            else if (hashconfig->opts_type & OPTS_TYPE_PT_ADD06)
+            {
+              rebuild_pws_compressed_append (device_param, pws_cnt, 0x06);
+            }
+            else if (hashconfig->opts_type & OPTS_TYPE_PT_ADD80)
+            {
+              rebuild_pws_compressed_append (device_param, pws_cnt, 0x80);
+            }
+          }
+        }
+        else if (user_options->attack_mode == ATTACK_MODE_HYBRID2)
+        {
+          if (hashconfig->opts_type & OPTS_TYPE_PT_ADD01)
+          {
+            rebuild_pws_compressed_append (device_param, pws_cnt, 0x01);
+          }
+          else if (hashconfig->opts_type & OPTS_TYPE_PT_ADD06)
+          {
+            rebuild_pws_compressed_append (device_param, pws_cnt, 0x06);
+          }
+          else if (hashconfig->opts_type & OPTS_TYPE_PT_ADD80)
+          {
+            rebuild_pws_compressed_append (device_param, pws_cnt, 0x80);
+          }
+        }
+
+        if (device_param->is_cuda == true)
+        {
+          int CU_rc;
+
+          CU_rc = hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_pws_idx, device_param->pws_idx, pws_cnt * sizeof (pw_idx_t));
+
+          if (CU_rc == -1) return -1;
+
+          const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt;
+
+          const u32 off = pw_idx->off;
+
+          if (off)
+          {
+            CU_rc = hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_pws_comp_buf, device_param->pws_comp, off * sizeof (u32));
+
+            if (CU_rc == -1) return -1;
+          }
+        }
+
+        if (device_param->is_opencl == true)
+        {
+          int CL_rc;
+
+          CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_idx, CL_TRUE, 0, pws_cnt * sizeof (pw_idx_t), device_param->pws_idx, 0, NULL, NULL);
+
+          if (CL_rc == -1) return -1;
+
+          const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt;
+
+          const u32 off = pw_idx->off;
+
+          if (off)
+          {
+            CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_comp_buf, CL_TRUE, 0, off * sizeof (u32), device_param->pws_comp, 0, NULL, NULL);
+
+            if (CL_rc == -1) return -1;
+          }
+        }
+
+        const int rc_kernel_decompress = run_kernel_decompress (hashcat_ctx, device_param, pws_cnt);
+
+        if (rc_kernel_decompress == -1) return -1;
+      }
+      else
+      {
+        if (user_options->attack_mode == ATTACK_MODE_COMBI)
+        {
+          if (device_param->is_cuda == true)
+          {
+            int CU_rc;
+
+            CU_rc = hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_pws_idx, device_param->pws_idx, pws_cnt * sizeof (pw_idx_t));
+
+            if (CU_rc == -1) return -1;
+
+            const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt;
+
+            const u32 off = pw_idx->off;
+
+            if (off)
+            {
+              CU_rc = hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_pws_comp_buf, device_param->pws_comp, off * sizeof (u32));
+
+              if (CU_rc == -1) return -1;
+            }
+          }
+
+          if (device_param->is_opencl == true)
+          {
+            int CL_rc;
+
+            CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_idx, CL_TRUE, 0, pws_cnt * sizeof (pw_idx_t), device_param->pws_idx, 0, NULL, NULL);
+
+            if (CL_rc == -1) return -1;
+
+            const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt;
+
+            const u32 off = pw_idx->off;
+
+            if (off)
+            {
+              CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_comp_buf, CL_TRUE, 0, off * sizeof (u32), device_param->pws_comp, 0, NULL, NULL);
+
+              if (CL_rc == -1) return -1;
+            }
+          }
+
+          const int rc_kernel_decompress = run_kernel_decompress (hashcat_ctx, device_param, pws_cnt);
+
+          if (rc_kernel_decompress == -1) return -1;
+        }
+        else if (user_options->attack_mode == ATTACK_MODE_HYBRID1)
+        {
+          if (device_param->is_cuda == true)
+          {
+            int CU_rc;
+
+            CU_rc = hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_pws_idx, device_param->pws_idx, pws_cnt * sizeof (pw_idx_t));
+
+            if (CU_rc == -1) return -1;
+
+            const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt;
+
+            const u32 off = pw_idx->off;
+
+            if (off)
+            {
+              CU_rc = hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_pws_comp_buf, device_param->pws_comp, off * sizeof (u32));
+
+              if (CU_rc == -1) return -1;
+            }
+          }
+
+          if (device_param->is_opencl == true)
+          {
+            int CL_rc;
+
+            CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_idx, CL_TRUE, 0, pws_cnt * sizeof (pw_idx_t), device_param->pws_idx, 0, NULL, NULL);
+
+            if (CL_rc == -1) return -1;
+
+            const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt;
+
+            const u32 off = pw_idx->off;
+
+            if (off)
+            {
+              CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_comp_buf, CL_TRUE, 0, off * sizeof (u32), device_param->pws_comp, 0, NULL, NULL);
+
+              if (CL_rc == -1) return -1;
+            }
+          }
+
+          const int rc_kernel_decompress = run_kernel_decompress (hashcat_ctx, device_param, pws_cnt);
+
+          if (rc_kernel_decompress == -1) return -1;
+        }
+        else if (user_options->attack_mode == ATTACK_MODE_HYBRID2)
+        {
+          const u64 off = device_param->words_off;
+
+          device_param->kernel_params_mp_buf64[3] = off;
+
+          const int CL_rc = run_kernel_mp (hashcat_ctx, device_param, KERN_RUN_MP, pws_cnt);
+
+          if (CL_rc == -1) return -1;
+        }
+      }
+    }
+    else if (user_options_extra->attack_kern == ATTACK_KERN_BF)
+    {
+      const u64 off = device_param->words_off;
+
+      device_param->kernel_params_mp_l_buf64[3] = off;
+
+      const int CL_rc = run_kernel_mp (hashcat_ctx, device_param, KERN_RUN_MP_L, pws_cnt);
+
+      if (CL_rc == -1) return -1;
+    }
+  }
+
+  return 0;
+}
+
+int run_cracker (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u64 pws_cnt)
+{
+  combinator_ctx_t      *combinator_ctx     = hashcat_ctx->combinator_ctx;
+  hashconfig_t          *hashconfig         = hashcat_ctx->hashconfig;
+  hashes_t              *hashes             = hashcat_ctx->hashes;
+  mask_ctx_t            *mask_ctx           = hashcat_ctx->mask_ctx;
+  status_ctx_t          *status_ctx         = hashcat_ctx->status_ctx;
+  straight_ctx_t        *straight_ctx       = hashcat_ctx->straight_ctx;
+  user_options_t        *user_options       = hashcat_ctx->user_options;
+  user_options_extra_t  *user_options_extra = hashcat_ctx->user_options_extra;
+
+  // do the on-the-fly combinator mode encoding
+
+  bool iconv_enabled = false;
+
+  iconv_t iconv_ctx = NULL;
+
+  char *iconv_tmp = NULL;
+
+  if (strcmp (user_options->encoding_from, user_options->encoding_to) != 0)
+  {
+    iconv_enabled = true;
+
+    iconv_ctx = iconv_open (user_options->encoding_to, user_options->encoding_from);
+
+    if (iconv_ctx == (iconv_t) -1) return -1;
+
+    iconv_tmp = (char *) hcmalloc (HCBUFSIZ_TINY);
+  }
+
+  // find higest password length, this is for optimization stuff
+
+  u32 highest_pw_len = 0;
+
+  if (user_options->slow_candidates == true)
+  {
+    /*
+    for (u64 pws_idx = 0; pws_idx < pws_cnt; pws_idx++)
+    {
+      pw_idx_t *pw_idx = device_param->pws_idx + pws_idx;
+
+      highest_pw_len = MAX (highest_pw_len, pw_idx->len);
+    }
+    */
+  }
+  else
+  {
+    if (user_options_extra->attack_kern == ATTACK_KERN_STRAIGHT)
+    {
+    }
+    else if (user_options_extra->attack_kern == ATTACK_KERN_COMBI)
+    {
+    }
+    else if (user_options_extra->attack_kern == ATTACK_KERN_BF)
+    {
+      highest_pw_len = device_param->kernel_params_mp_l_buf32[4]
+                     + device_param->kernel_params_mp_l_buf32[5];
+    }
+  }
+
+  // we make use of this in status view
+
+  device_param->outerloop_multi = 1;
+  device_param->outerloop_msec  = 0;
+  device_param->outerloop_pos   = 0;
+  device_param->outerloop_left  = pws_cnt;
+
+  // we ignore the time to copy data over pci bus in this case
+
+  if (user_options->speed_only == true)
+  {
+    hc_timer_set (&device_param->timer_speed);
+  }
+
+  // loop start: most outer loop = salt iteration, then innerloops (if multi)
+
+  for (u32 salt_pos = 0; salt_pos < hashes->salts_cnt; salt_pos++)
+  {
+    while (status_ctx->devices_status == STATUS_PAUSED) sleep (1);
+
+    salt_t *salt_buf = &hashes->salts_buf[salt_pos];
+
+    device_param->kernel_params_buf32[27] = salt_pos;
+    device_param->kernel_params_buf32[31] = salt_buf->digests_cnt;
+    device_param->kernel_params_buf32[32] = salt_buf->digests_offset;
+
+    FILE *combs_fp = device_param->combs_fp;
+
+    if (user_options->slow_candidates == true)
+    {
+    }
+    else
+    {
+      if ((user_options->attack_mode == ATTACK_MODE_COMBI) || (((hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL) == 0) && (user_options->attack_mode == ATTACK_MODE_HYBRID2)))
+      {
+        rewind (combs_fp);
+      }
+    }
+
+    // iteration type
+
+    u32 innerloop_step = 0;
+    u32 innerloop_cnt  = 0;
+
+    if (user_options->slow_candidates == true)
+    {
+      innerloop_step = 1;
+      innerloop_cnt  = 1;
+    }
+    else
+    {
+      if   (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL) innerloop_step = device_param->kernel_loops;
+      else                                                        innerloop_step = 1;
+
+      if      (user_options_extra->attack_kern == ATTACK_KERN_STRAIGHT)  innerloop_cnt = (u32) straight_ctx->kernel_rules_cnt;
+      else if (user_options_extra->attack_kern == ATTACK_KERN_COMBI)     innerloop_cnt = (u32) combinator_ctx->combs_cnt;
+      else if (user_options_extra->attack_kern == ATTACK_KERN_BF)        innerloop_cnt = (u32) mask_ctx->bfs_cnt;
+    }
+
+    // innerloops
+
+    for (u32 innerloop_pos = 0; innerloop_pos < innerloop_cnt; innerloop_pos += innerloop_step)
+    {
+      while (status_ctx->devices_status == STATUS_PAUSED) sleep (1);
+
+      u32 fast_iteration = 0;
+
+      u32 innerloop_left = innerloop_cnt - innerloop_pos;
+
+      if (innerloop_left > innerloop_step)
+      {
+        innerloop_left = innerloop_step;
+
+        fast_iteration = 1;
+      }
+
+      hc_thread_mutex_lock (status_ctx->mux_display);
+
+      device_param->innerloop_pos  = innerloop_pos;
+      device_param->innerloop_left = innerloop_left;
+
+      device_param->kernel_params_buf32[30] = (u32) innerloop_left;
+
+      device_param->outerloop_multi = (double) innerloop_cnt / (double) (innerloop_pos + innerloop_left);
+
+      hc_thread_mutex_unlock (status_ctx->mux_display);
+
+      if (hashes->salts_shown[salt_pos] == 1)
+      {
+        status_ctx->words_progress_done[salt_pos] += (u64) pws_cnt * innerloop_left;
+
+        continue;
+      }
+
+      // initialize and copy amplifiers
+
+      if (user_options->slow_candidates == true)
+      {
+      }
+      else
+      {
+        if (user_options_extra->attack_kern == ATTACK_KERN_STRAIGHT)
+        {
+          if (device_param->is_cuda == true)
+          {
+            const int CU_rc = hc_cuMemcpyDtoD (hashcat_ctx, device_param->cuda_d_rules_c, device_param->cuda_d_rules + (innerloop_pos * sizeof (kernel_rule_t)), innerloop_left * sizeof (kernel_rule_t));
+
+            if (CU_rc == -1) return -1;
+          }
+
+          if (device_param->is_opencl == true)
+          {
+            const int CL_rc = hc_clEnqueueCopyBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_rules, device_param->opencl_d_rules_c, innerloop_pos * sizeof (kernel_rule_t), 0, innerloop_left * sizeof (kernel_rule_t), 0, NULL, NULL);
+
+            if (CL_rc == -1) return -1;
+          }
+        }
+        else if (user_options_extra->attack_kern == ATTACK_KERN_COMBI)
+        {
+          if (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL)
+          {
+            if (user_options->attack_mode == ATTACK_MODE_COMBI)
+            {
+              char *line_buf = device_param->scratch_buf;
+
+              u32 i = 0;
+
+              while (i < innerloop_left)
+              {
+                if (feof (combs_fp)) break;
+
+                size_t line_len = fgetl (combs_fp, line_buf);
+
+                line_len = convert_from_hex (hashcat_ctx, line_buf, line_len);
+
+                if (line_len > PW_MAX) continue;
+
+                char *line_buf_new = line_buf;
+
+                char rule_buf_out[RP_PASSWORD_SIZE];
+
+                if (run_rule_engine (user_options_extra->rule_len_r, user_options->rule_buf_r))
+                {
+                  if (line_len >= RP_PASSWORD_SIZE) continue;
+
+                  memset (rule_buf_out, 0, sizeof (rule_buf_out));
+
+                  const int rule_len_out = _old_apply_rule (user_options->rule_buf_r, user_options_extra->rule_len_r, line_buf, (u32) line_len, rule_buf_out);
+
+                  if (rule_len_out < 0)
+                  {
+                    status_ctx->words_progress_rejected[salt_pos] += pws_cnt;
+
+                    continue;
+                  }
+
+                  line_len = rule_len_out;
+
+                  line_buf_new = rule_buf_out;
+                }
+
+                // do the on-the-fly encoding
+
+                if (iconv_enabled == true)
+                {
+                  char  *iconv_ptr = iconv_tmp;
+                  size_t iconv_sz  = HCBUFSIZ_TINY;
+
+                  const size_t iconv_rc = iconv (iconv_ctx, &line_buf_new, &line_len, &iconv_ptr, &iconv_sz);
+
+                  if (iconv_rc == (size_t) -1) continue;
+
+                  line_buf_new = iconv_tmp;
+                  line_len     = HCBUFSIZ_TINY - iconv_sz;
+                }
+
+                line_len = MIN (line_len, PW_MAX);
+
+                u8 *ptr = (u8 *) device_param->combs_buf[i].i;
+
+                memcpy (ptr, line_buf_new, line_len);
+
+                memset (ptr + line_len, 0, PW_MAX - line_len);
+
+                if (hashconfig->opts_type & OPTS_TYPE_PT_UPPER)
+                {
+                  uppercase (ptr, line_len);
+                }
+
+                if (combinator_ctx->combs_mode == COMBINATOR_MODE_BASE_LEFT)
+                {
+                  if (hashconfig->opts_type & OPTS_TYPE_PT_ADD80)
+                  {
+                    ptr[line_len] = 0x80;
+                  }
+
+                  if (hashconfig->opts_type & OPTS_TYPE_PT_ADD06)
+                  {
+                    ptr[line_len] = 0x06;
+                  }
+
+                  if (hashconfig->opts_type & OPTS_TYPE_PT_ADD01)
+                  {
+                    ptr[line_len] = 0x01;
+                  }
+                }
+
+                device_param->combs_buf[i].pw_len = (u32) line_len;
+
+                i++;
+              }
+
+              for (u32 j = i; j < innerloop_left; j++)
+              {
+                memset (&device_param->combs_buf[j], 0, sizeof (pw_t));
+              }
+
+              innerloop_left = i;
+
+              if (device_param->is_cuda == true)
+              {
+                const int CU_rc = hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_combs_c, device_param->combs_buf, innerloop_left * sizeof (pw_t));
+
+                if (CU_rc == -1) return -1;
+              }
+
+              if (device_param->is_opencl == true)
+              {
+                const int CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_combs_c, CL_TRUE, 0, innerloop_left * sizeof (pw_t), device_param->combs_buf, 0, NULL, NULL);
+
+                if (CL_rc == -1) return -1;
+              }
+            }
+            else if (user_options->attack_mode == ATTACK_MODE_HYBRID1)
+            {
+              u64 off = innerloop_pos;
+
+              device_param->kernel_params_mp_buf64[3] = off;
+
+              int CL_rc;
+
+              CL_rc = run_kernel_mp (hashcat_ctx, device_param, KERN_RUN_MP, innerloop_left);
+
+              if (CL_rc == -1) return -1;
+
+              if (device_param->is_cuda == true)
+              {
+                const int CU_rc = hc_cuMemcpyDtoD (hashcat_ctx, device_param->cuda_d_combs_c, device_param->cuda_d_combs, innerloop_left * sizeof (pw_t));
+
+                if (CU_rc == -1) return -1;
+              }
+
+              if (device_param->is_opencl == true)
+              {
+                CL_rc = hc_clEnqueueCopyBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_combs, device_param->opencl_d_combs_c, 0, 0, innerloop_left * sizeof (pw_t), 0, NULL, NULL);
+
+                if (CL_rc == -1) return -1;
+              }
+            }
+            else if (user_options->attack_mode == ATTACK_MODE_HYBRID2)
+            {
+              u64 off = innerloop_pos;
+
+              device_param->kernel_params_mp_buf64[3] = off;
+
+              const int rc_kernel_mp = run_kernel_mp (hashcat_ctx, device_param, KERN_RUN_MP, innerloop_left);
+
+              if (rc_kernel_mp == -1) return -1;
+
+              if (device_param->is_cuda == true)
+              {
+                const int CU_rc = hc_cuMemcpyDtoD (hashcat_ctx, device_param->cuda_d_combs_c, device_param->cuda_d_combs, innerloop_left * sizeof (pw_t));
+
+                if (CU_rc == -1) return -1;
+              }
+
+              if (device_param->is_opencl == true)
+              {
+                const int CL_rc = hc_clEnqueueCopyBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_combs, device_param->opencl_d_combs_c, 0, 0, innerloop_left * sizeof (pw_t), 0, NULL, NULL);
+
+                if (CL_rc == -1) return -1;
+              }
+            }
+          }
+          else
+          {
+            if ((user_options->attack_mode == ATTACK_MODE_COMBI) || (user_options->attack_mode == ATTACK_MODE_HYBRID2))
+            {
+              char *line_buf = device_param->scratch_buf;
+
+              u32 i = 0;
+
+              while (i < innerloop_left)
+              {
+                if (feof (combs_fp)) break;
+
+                size_t line_len = fgetl (combs_fp, line_buf);
+
+                line_len = convert_from_hex (hashcat_ctx, line_buf, line_len);
+
+                if (line_len > PW_MAX) continue;
+
+                char *line_buf_new = line_buf;
+
+                char rule_buf_out[RP_PASSWORD_SIZE];
+
+                if (run_rule_engine (user_options_extra->rule_len_r, user_options->rule_buf_r))
+                {
+                  if (line_len >= RP_PASSWORD_SIZE) continue;
+
+                  memset (rule_buf_out, 0, sizeof (rule_buf_out));
+
+                  const int rule_len_out = _old_apply_rule (user_options->rule_buf_r, user_options_extra->rule_len_r, line_buf, (u32) line_len, rule_buf_out);
+
+                  if (rule_len_out < 0)
+                  {
+                    status_ctx->words_progress_rejected[salt_pos] += pws_cnt;
+
+                    continue;
+                  }
+
+                  line_len = rule_len_out;
+
+                  line_buf_new = rule_buf_out;
+                }
+
+                // do the on-the-fly encoding
+
+                if (iconv_enabled == true)
+                {
+                  char  *iconv_ptr = iconv_tmp;
+                  size_t iconv_sz  = HCBUFSIZ_TINY;
+
+                  const size_t iconv_rc = iconv (iconv_ctx, &line_buf_new, &line_len, &iconv_ptr, &iconv_sz);
+
+                  if (iconv_rc == (size_t) -1) continue;
+
+                  line_buf_new = iconv_tmp;
+                  line_len     = HCBUFSIZ_TINY - iconv_sz;
+                }
+
+                line_len = MIN (line_len, PW_MAX);
+
+                u8 *ptr = (u8 *) device_param->combs_buf[i].i;
+
+                memcpy (ptr, line_buf_new, line_len);
+
+                memset (ptr + line_len, 0, PW_MAX - line_len);
+
+                if (hashconfig->opts_type & OPTS_TYPE_PT_UPPER)
+                {
+                  uppercase (ptr, line_len);
+                }
+
+                /*
+                if (combinator_ctx->combs_mode == COMBINATOR_MODE_BASE_LEFT)
+                {
+                  if (hashconfig->opts_type & OPTS_TYPE_PT_ADD80)
+                  {
+                    ptr[line_len] = 0x80;
+                  }
+
+                  if (hashconfig->opts_type & OPTS_TYPE_PT_ADD06)
+                  {
+                    ptr[line_len] = 0x06;
+                  }
+
+                  if (hashconfig->opts_type & OPTS_TYPE_PT_ADD01)
+                  {
+                    ptr[line_len] = 0x01;
+                  }
+                }
+                */
+
+                device_param->combs_buf[i].pw_len = (u32) line_len;
+
+                i++;
+              }
+
+              for (u32 j = i; j < innerloop_left; j++)
+              {
+                memset (&device_param->combs_buf[j], 0, sizeof (pw_t));
+              }
+
+              innerloop_left = i;
+
+              if (device_param->is_cuda == true)
+              {
+                const int CU_rc = hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_combs_c, device_param->combs_buf, innerloop_left * sizeof (pw_t));
+
+                if (CU_rc == -1) return -1;
+              }
+
+              if (device_param->is_opencl == true)
+              {
+                const int CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_combs_c, CL_TRUE, 0, innerloop_left * sizeof (pw_t), device_param->combs_buf, 0, NULL, NULL);
+
+                if (CL_rc == -1) return -1;
+              }
+            }
+            else if (user_options->attack_mode == ATTACK_MODE_HYBRID1)
+            {
+              u64 off = innerloop_pos;
+
+              device_param->kernel_params_mp_buf64[3] = off;
+
+              const int rc_kernel_mp = run_kernel_mp (hashcat_ctx, device_param, KERN_RUN_MP, innerloop_left);
+
+              if (rc_kernel_mp == -1) return -1;
+
+              if (device_param->is_cuda == true)
+              {
+                const int CU_rc = hc_cuMemcpyDtoD (hashcat_ctx, device_param->cuda_d_combs_c, device_param->cuda_d_combs, innerloop_left * sizeof (pw_t));
+
+                if (CU_rc == -1) return -1;
+              }
+
+              if (device_param->is_opencl == true)
+              {
+                const int CL_rc = hc_clEnqueueCopyBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_combs, device_param->opencl_d_combs_c, 0, 0, innerloop_left * sizeof (pw_t), 0, NULL, NULL);
+
+                if (CL_rc == -1) return -1;
+              }
+            }
+          }
+        }
+        else if (user_options_extra->attack_kern == ATTACK_KERN_BF)
+        {
+          u64 off = innerloop_pos;
+
+          device_param->kernel_params_mp_r_buf64[3] = off;
+
+          const int rc_kernel_mp = run_kernel_mp (hashcat_ctx, device_param, KERN_RUN_MP_R, innerloop_left);
+
+          if (rc_kernel_mp == -1) return -1;
+
+          if (device_param->is_cuda == true)
+          {
+            const int CU_rc = hc_cuMemcpyDtoD (hashcat_ctx, device_param->cuda_d_bfs_c, device_param->cuda_d_bfs, innerloop_left * sizeof (bf_t));
+
+            if (CU_rc == -1) return -1;
+          }
+
+          if (device_param->is_opencl == true)
+          {
+            const int CL_rc = hc_clEnqueueCopyBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_bfs, device_param->opencl_d_bfs_c, 0, 0, innerloop_left * sizeof (bf_t), 0, NULL, NULL);
+
+            if (CL_rc == -1) return -1;
+          }
+        }
+      }
+
+      const int rc = choose_kernel (hashcat_ctx, device_param, highest_pw_len, pws_cnt, fast_iteration, salt_pos);
+
+      if (rc == -1) return -1;
+
+      /**
+       * benchmark was aborted because too long kernel runtime (slow hashes only)
+       */
+
+      if ((user_options->speed_only == true) && (device_param->speed_only_finish == true))
+      {
+        // nothing to do in that case
+      }
+      else
+      {
+        /**
+         * speed
+         */
+
+        if (status_ctx->run_thread_level2 == true)
+        {
+          const u64 perf_sum_all = (u64) pws_cnt * innerloop_left;
+
+          const double speed_msec = hc_timer_get (device_param->timer_speed);
+
+          hc_timer_set (&device_param->timer_speed);
+
+          u32 speed_pos = device_param->speed_pos;
+
+          device_param->speed_cnt[speed_pos] = perf_sum_all;
+
+          device_param->speed_msec[speed_pos] = speed_msec;
+
+          speed_pos++;
+
+          if (speed_pos == SPEED_CACHE)
+          {
+            speed_pos = 0;
+          }
+
+          device_param->speed_pos = speed_pos;
+
+          /**
+           * progress
+           */
+
+          hc_thread_mutex_lock (status_ctx->mux_counter);
+
+          status_ctx->words_progress_done[salt_pos] += perf_sum_all;
+
+          hc_thread_mutex_unlock (status_ctx->mux_counter);
+        }
+      }
+
+      /**
+       * benchmark, part2
+       */
+
+      if (user_options->speed_only == true)
+      {
+        // let's abort this so that the user doesn't have to wait too long on the result
+        // for slow hashes it's fine anyway as boost mode should be turned on
+
+        if (hashconfig->attack_exec == ATTACK_EXEC_OUTSIDE_KERNEL)
+        {
+          device_param->speed_only_finish = true;
+
+          break;
+        }
+        else
+        {
+          double total_msec = device_param->speed_msec[0];
+
+          for (u32 speed_pos = 1; speed_pos < device_param->speed_pos; speed_pos++)
+          {
+            total_msec += device_param->speed_msec[speed_pos];
+          }
+
+          if (user_options->slow_candidates == true)
+          {
+            if ((total_msec > 4000) || (device_param->speed_pos == SPEED_CACHE - 1))
+            {
+              const u32 speed_pos = device_param->speed_pos;
+
+              if (speed_pos)
+              {
+                device_param->speed_cnt[0]  = device_param->speed_cnt[speed_pos - 1];
+                device_param->speed_msec[0] = device_param->speed_msec[speed_pos - 1];
+              }
+
+              device_param->speed_pos = 0;
+
+              device_param->speed_only_finish = true;
+
+              break;
+            }
+          }
+          else
+          {
+            // it's unclear if 4s is enough to turn on boost mode for all backend device
+
+            if ((total_msec > 4000) || (device_param->speed_pos == SPEED_CACHE - 1))
+            {
+              device_param->speed_only_finish = true;
+
+              break;
+            }
+          }
+        }
+      }
+
+      if (device_param->speed_only_finish == true) break;
+
+      /**
+       * result
+       */
+
+      check_cracked (hashcat_ctx, device_param, salt_pos);
+
+      if (status_ctx->run_thread_level2 == false) break;
+    }
+
+    if (user_options->speed_only == true) break;
+
+    //status screen makes use of this, can't reset here
+    //device_param->innerloop_msec = 0;
+    //device_param->innerloop_pos  = 0;
+    //device_param->innerloop_left = 0;
+
+    if (status_ctx->run_thread_level2 == false) break;
+  }
+
+  //status screen makes use of this, can't reset here
+  //device_param->outerloop_msec = 0;
+  //device_param->outerloop_pos  = 0;
+  //device_param->outerloop_left = 0;
+
+  if (user_options->speed_only == true)
+  {
+    double total_msec = device_param->speed_msec[0];
+
+    for (u32 speed_pos = 1; speed_pos < device_param->speed_pos; speed_pos++)
+    {
+      total_msec += device_param->speed_msec[speed_pos];
+    }
+
+    device_param->outerloop_msec = total_msec * hashes->salts_cnt * device_param->outerloop_multi;
+
+    device_param->speed_only_finish = true;
+  }
+
+  return 0;
+}
+
+int backend_ctx_init (hashcat_ctx_t *hashcat_ctx)
+{
+  backend_ctx_t  *backend_ctx  = hashcat_ctx->backend_ctx;
+  user_options_t *user_options = hashcat_ctx->user_options;
+
+  backend_ctx->enabled = false;
+
+  if (user_options->example_hashes == true) return 0;
+  if (user_options->keyspace       == true) return 0;
+  if (user_options->left           == true) return 0;
+  if (user_options->show           == true) return 0;
+  if (user_options->usage          == true) return 0;
+  if (user_options->version        == true) return 0;
+
+  hc_device_param_t *devices_param = (hc_device_param_t *) hccalloc (DEVICES_MAX, sizeof (hc_device_param_t));
+
+  backend_ctx->devices_param = devices_param;
+
+  /**
+   * Load and map CUDA library calls, then init CUDA
+   */
+
+  CUDA_PTR *cuda = (CUDA_PTR *) hcmalloc (sizeof (CUDA_PTR));
+
+  backend_ctx->cuda = cuda;
+
+  int rc_cuda_init = cuda_init (hashcat_ctx);
+
+  if (rc_cuda_init == -1)
+  {
+    cuda_close (hashcat_ctx);
+  }
+
+  /**
+   * Load and map NVRTC library calls
+   */
+
+  NVRTC_PTR *nvrtc = (NVRTC_PTR *) hcmalloc (sizeof (NVRTC_PTR));
+
+  backend_ctx->nvrtc = nvrtc;
+
+  int rc_nvrtc_init = nvrtc_init (hashcat_ctx);
+
+  if (rc_nvrtc_init == -1)
+  {
+    nvrtc_close (hashcat_ctx);
+  }
+
+  /**
+   * Check if both CUDA and NVRTC were load successful
+   */
+
+  if ((rc_cuda_init == 0) && (rc_nvrtc_init == 0))
+  {
+    // cuda version
+
+    int cuda_driver_version = 0;
+
+    const int rc_cuDriverGetVersion = hc_cuDriverGetVersion (hashcat_ctx, &cuda_driver_version);
+
+    if (rc_cuDriverGetVersion == -1) return -1;
+
+    backend_ctx->cuda_driver_version = cuda_driver_version;
+  }
+  else
+  {
+    rc_cuda_init  = -1;
+    rc_nvrtc_init = -1;
+
+    cuda_close  (hashcat_ctx);
+    nvrtc_close (hashcat_ctx);
+  }
+
+  /**
+   * Load and map OpenCL library calls
+   */
+
+  OCL_PTR *ocl = (OCL_PTR *) hcmalloc (sizeof (OCL_PTR));
+
+  backend_ctx->ocl = ocl;
+
+  const int rc_ocl_init = ocl_init (hashcat_ctx);
+
+  if (rc_ocl_init == -1)
+  {
+    ocl_close (hashcat_ctx);
+  }
+
+  /**
+   * return if both CUDA and OpenCL initialization failed
+   */
+
+  if ((rc_cuda_init == -1) && (rc_ocl_init == -1))
+  {
+    event_log_error (hashcat_ctx, "ATTENTION! No OpenCL or CUDA installation found.");
+
+    event_log_warning (hashcat_ctx, "You are probably missing the CUDA or OpenCL runtime installation.");
+    event_log_warning (hashcat_ctx, NULL);
+
+    #if defined (__linux__)
+    event_log_warning (hashcat_ctx, "* AMD GPUs on Linux require this driver:");
+    event_log_warning (hashcat_ctx, "  \"RadeonOpenCompute (ROCm)\" Software Platform (1.6.180 or later)");
+    #elif defined (_WIN)
+    event_log_warning (hashcat_ctx, "* AMD GPUs on Windows require this driver:");
+    event_log_warning (hashcat_ctx, "  \"AMD Radeon Software Crimson Edition\" (15.12 or later)");
+    #endif
+
+    event_log_warning (hashcat_ctx, "* Intel CPUs require this runtime:");
+    event_log_warning (hashcat_ctx, "  \"OpenCL Runtime for Intel Core and Intel Xeon Processors\" (16.1.1 or later)");
+
+    #if defined (__linux__)
+    event_log_warning (hashcat_ctx, "* Intel GPUs on Linux require this driver:");
+    event_log_warning (hashcat_ctx, "  \"OpenCL 2.0 GPU Driver Package for Linux\" (2.0 or later)");
+    #elif defined (_WIN)
+    event_log_warning (hashcat_ctx, "* Intel GPUs on Windows require this driver:");
+    event_log_warning (hashcat_ctx, "  \"OpenCL Driver for Intel Iris and Intel HD Graphics\"");
+    #endif
+
+    event_log_warning (hashcat_ctx, "* NVIDIA GPUs require this runtime and/or driver (both):");
+    event_log_warning (hashcat_ctx, "  \"NVIDIA Driver\" (418.56 or later)");
+    event_log_warning (hashcat_ctx, "  \"CUDA Toolkit\" (10.1 or later)");
+    event_log_warning (hashcat_ctx, NULL);
+
+    return -1;
+  }
+
+  /**
+   * Some permission pre-check, because AMDGPU-PRO Driver crashes if the user has no permission to do this
+   */
+
+  const int rc_ocl_check = ocl_check_dri (hashcat_ctx);
+
+  if (rc_ocl_check == -1) return -1;
+
+  /**
+   * Backend device selection
+   */
+
+  u64 backend_devices_filter;
+
+  const bool rc_backend_devices_filter = setup_backend_devices_filter (hashcat_ctx, user_options->backend_devices, &backend_devices_filter);
+
+  if (rc_backend_devices_filter == false) return -1;
+
+  backend_ctx->backend_devices_filter = backend_devices_filter;
+
+  /**
+   * OpenCL device type selection
+   */
+
+  cl_device_type opencl_device_types_filter;
+
+  const bool rc_opencl_device_types_filter = setup_opencl_device_types_filter (hashcat_ctx, user_options->opencl_device_types, &opencl_device_types_filter);
+
+  if (rc_opencl_device_types_filter == false) return -1;
+
+  backend_ctx->opencl_device_types_filter = opencl_device_types_filter;
+
+  /**
+   * CUDA API: init
+   */
+
+  if (backend_ctx->cuda)
+  {
+    const int rc_cuInit = hc_cuInit (hashcat_ctx, 0);
+
+    if (rc_cuInit == -1)
+    {
+      cuda_close (hashcat_ctx);
+    }
+  }
+
+  /**
+   * OpenCL API: init
+   */
+
+  if (backend_ctx->ocl)
+  {
+    #define FREE_OPENCL_CTX_ON_ERROR          \
+    {                                         \
+      hcfree (opencl_platforms);              \
+      hcfree (opencl_platforms_devices);      \
+      hcfree (opencl_platforms_devices_cnt);  \
+      hcfree (opencl_platforms_name);         \
+      hcfree (opencl_platforms_vendor);       \
+      hcfree (opencl_platforms_vendor_id);    \
+      hcfree (opencl_platforms_version);      \
+    }
+
+    cl_platform_id *opencl_platforms             = (cl_platform_id *) hccalloc (CL_PLATFORMS_MAX, sizeof (cl_platform_id));
+    cl_uint         opencl_platforms_cnt         = 0;
+    cl_device_id  **opencl_platforms_devices     = (cl_device_id **)  hccalloc (CL_PLATFORMS_MAX, sizeof (cl_device_id *));
+    cl_uint        *opencl_platforms_devices_cnt = (cl_uint *)        hccalloc (CL_PLATFORMS_MAX, sizeof (cl_uint));
+    char          **opencl_platforms_name        = (char **)          hccalloc (CL_PLATFORMS_MAX, sizeof (char *));
+    char          **opencl_platforms_vendor      = (char **)          hccalloc (CL_PLATFORMS_MAX, sizeof (char *));
+    cl_uint        *opencl_platforms_vendor_id   = (cl_uint *)        hccalloc (CL_PLATFORMS_MAX, sizeof (cl_uint));
+    char          **opencl_platforms_version     = (char **)          hccalloc (CL_PLATFORMS_MAX, sizeof (char *));
+
+    int CL_rc = hc_clGetPlatformIDs (hashcat_ctx, CL_PLATFORMS_MAX, opencl_platforms, &opencl_platforms_cnt);
+
+    if (CL_rc == -1)
+    {
+      opencl_platforms_cnt = 0;
+
+      FREE_OPENCL_CTX_ON_ERROR;
+
+      ocl_close (hashcat_ctx);
+    }
+
+    if (opencl_platforms_cnt)
+    {
+      for (u32 opencl_platforms_idx = 0; opencl_platforms_idx < opencl_platforms_cnt; opencl_platforms_idx++)
+      {
+        cl_platform_id opencl_platform = opencl_platforms[opencl_platforms_idx];
+
+        size_t param_value_size = 0;
+
+        // platform vendor
+
+        int CL_rc;
+
+        CL_rc = hc_clGetPlatformInfo (hashcat_ctx, opencl_platform, CL_PLATFORM_VENDOR, 0, NULL, &param_value_size);
+
+        if (CL_rc == -1) return -1;
+
+        char *opencl_platform_vendor = (char *) hcmalloc (param_value_size);
+
+        CL_rc = hc_clGetPlatformInfo (hashcat_ctx, opencl_platform, CL_PLATFORM_VENDOR, param_value_size, opencl_platform_vendor, NULL);
+
+        if (CL_rc == -1) return -1;
+
+        opencl_platforms_vendor[opencl_platforms_idx] = opencl_platform_vendor;
+
+        // platform name
+
+        CL_rc = hc_clGetPlatformInfo (hashcat_ctx, opencl_platform, CL_PLATFORM_NAME, 0, NULL, &param_value_size);
+
+        if (CL_rc == -1) return -1;
+
+        char *opencl_platform_name = (char *) hcmalloc (param_value_size);
+
+        CL_rc = hc_clGetPlatformInfo (hashcat_ctx, opencl_platform, CL_PLATFORM_NAME, param_value_size, opencl_platform_name, NULL);
+
+        if (CL_rc == -1) return -1;
+
+        opencl_platforms_name[opencl_platforms_idx] = opencl_platform_name;
+
+        // platform version
+
+        CL_rc = hc_clGetPlatformInfo (hashcat_ctx, opencl_platform, CL_PLATFORM_VERSION, 0, NULL, &param_value_size);
+
+        if (CL_rc == -1) return -1;
+
+        char *opencl_platform_version = (char *) hcmalloc (param_value_size);
+
+        CL_rc = hc_clGetPlatformInfo (hashcat_ctx, opencl_platform, CL_PLATFORM_VERSION, param_value_size, opencl_platform_version, NULL);
+
+        if (CL_rc == -1) return -1;
+
+        opencl_platforms_version[opencl_platforms_idx] = opencl_platform_version;
+
+        // find our own platform vendor because pocl and mesa are pushing original vendor_id through opencl
+        // this causes trouble with vendor id based macros
+        // we'll assign generic to those without special optimization available
+
+        cl_uint opencl_platform_vendor_id = 0;
+
+        if (strcmp (opencl_platform_vendor, CL_VENDOR_AMD1) == 0)
+        {
+          opencl_platform_vendor_id = VENDOR_ID_AMD;
+        }
+        else if (strcmp (opencl_platform_vendor, CL_VENDOR_AMD2) == 0)
+        {
+          opencl_platform_vendor_id = VENDOR_ID_AMD;
+        }
+        else if (strcmp (opencl_platform_vendor, CL_VENDOR_AMD_USE_INTEL) == 0)
+        {
+          opencl_platform_vendor_id = VENDOR_ID_AMD_USE_INTEL;
+        }
+        else if (strcmp (opencl_platform_vendor, CL_VENDOR_APPLE) == 0)
+        {
+          opencl_platform_vendor_id = VENDOR_ID_APPLE;
+        }
+        else if (strcmp (opencl_platform_vendor, CL_VENDOR_INTEL_BEIGNET) == 0)
+        {
+          opencl_platform_vendor_id = VENDOR_ID_INTEL_BEIGNET;
+        }
+        else if (strcmp (opencl_platform_vendor, CL_VENDOR_INTEL_SDK) == 0)
+        {
+          opencl_platform_vendor_id = VENDOR_ID_INTEL_SDK;
+        }
+        else if (strcmp (opencl_platform_vendor, CL_VENDOR_MESA) == 0)
+        {
+          opencl_platform_vendor_id = VENDOR_ID_MESA;
+        }
+        else if (strcmp (opencl_platform_vendor, CL_VENDOR_NV) == 0)
+        {
+          opencl_platform_vendor_id = VENDOR_ID_NV;
+        }
+        else if (strcmp (opencl_platform_vendor, CL_VENDOR_POCL) == 0)
+        {
+          opencl_platform_vendor_id = VENDOR_ID_POCL;
+        }
+        else
+        {
+          opencl_platform_vendor_id = VENDOR_ID_GENERIC;
+        }
+
+        opencl_platforms_vendor_id[opencl_platforms_idx] = opencl_platform_vendor_id;
+
+        cl_device_id *opencl_platform_devices = (cl_device_id *) hccalloc (DEVICES_MAX, sizeof (cl_device_id));
+
+        cl_uint opencl_platform_devices_cnt = 0;
+
+        CL_rc = hc_clGetDeviceIDs (hashcat_ctx, opencl_platform, CL_DEVICE_TYPE_ALL, DEVICES_MAX, opencl_platform_devices, &opencl_platform_devices_cnt);
+
+        if (CL_rc == -1)
+        {
+          event_log_error (hashcat_ctx, "clGetDeviceIDs(): %s", val2cstr_cl (CL_rc));
+
+          return -1;
+        }
+
+        opencl_platforms_devices[opencl_platforms_idx] = opencl_platform_devices;
+
+        opencl_platforms_devices_cnt[opencl_platforms_idx] = opencl_platform_devices_cnt;
+      }
+
+      if (user_options->opencl_device_types == NULL)
+      {
+        /**
+         * OpenCL device types:
+         *   In case the user did not specify --opencl-device-types and the user runs hashcat in a system with only a CPU only he probably want to use that CPU.
+         */
+
+        cl_device_type opencl_device_types_all = 0;
+
+        for (u32 opencl_platforms_idx = 0; opencl_platforms_idx < opencl_platforms_cnt; opencl_platforms_idx++)
+        {
+          cl_device_id *opencl_platform_devices     = opencl_platforms_devices[opencl_platforms_idx];
+          cl_uint       opencl_platform_devices_cnt = opencl_platforms_devices_cnt[opencl_platforms_idx];
+
+          for (u32 opencl_platform_devices_idx = 0; opencl_platform_devices_idx < opencl_platform_devices_cnt; opencl_platform_devices_idx++)
+          {
+            cl_device_id opencl_device = opencl_platform_devices[opencl_platform_devices_idx];
+
+            cl_device_type opencl_device_type;
+
+            CL_rc = hc_clGetDeviceInfo (hashcat_ctx, opencl_device, CL_DEVICE_TYPE, sizeof (opencl_device_type), &opencl_device_type, NULL);
+
+            if (CL_rc == -1)
+            {
+              FREE_OPENCL_CTX_ON_ERROR;
+
+              return -1;
+            }
+
+            opencl_device_types_all |= opencl_device_type;
+          }
+        }
+
+        // In such a case, automatically enable CPU device type support, since it's disabled by default.
+
+        if ((opencl_device_types_all & (CL_DEVICE_TYPE_GPU | CL_DEVICE_TYPE_ACCELERATOR)) == 0)
+        {
+          opencl_device_types_filter |= CL_DEVICE_TYPE_CPU;
+        }
+
+        // In another case, when the user uses --stdout, using CPU devices is much faster to setup
+        // If we have a CPU device, force it to be used
+
+        if (user_options->stdout_flag == true)
+        {
+          if (opencl_device_types_all & CL_DEVICE_TYPE_CPU)
+          {
+            opencl_device_types_filter = CL_DEVICE_TYPE_CPU;
+          }
+        }
+
+        backend_ctx->opencl_device_types_filter = opencl_device_types_filter;
+      }
+    }
+
+    backend_ctx->opencl_platforms             = opencl_platforms;
+    backend_ctx->opencl_platforms_cnt         = opencl_platforms_cnt;
+    backend_ctx->opencl_platforms_devices     = opencl_platforms_devices;
+    backend_ctx->opencl_platforms_devices_cnt = opencl_platforms_devices_cnt;
+    backend_ctx->opencl_platforms_name        = opencl_platforms_name;
+    backend_ctx->opencl_platforms_vendor      = opencl_platforms_vendor;
+    backend_ctx->opencl_platforms_vendor_id   = opencl_platforms_vendor_id;
+    backend_ctx->opencl_platforms_version     = opencl_platforms_version;
+
+    #undef FREE_OPENCL_CTX_ON_ERROR
+  }
+
+  /**
+   * Final checks
+   */
+
+  if ((backend_ctx->cuda == NULL) && (backend_ctx->ocl == NULL))
+  {
+    event_log_error (hashcat_ctx, "ATTENTION! No OpenCL-compatible or CUDA-compatible platform found.");
+
+    event_log_warning (hashcat_ctx, "You are probably missing the OpenCL or CUDA runtime installation.");
+    event_log_warning (hashcat_ctx, NULL);
+
+    #if defined (__linux__)
+    event_log_warning (hashcat_ctx, "* AMD GPUs on Linux require this driver:");
+    event_log_warning (hashcat_ctx, "  \"RadeonOpenCompute (ROCm)\" Software Platform (1.6.180 or later)");
+    #elif defined (_WIN)
+    event_log_warning (hashcat_ctx, "* AMD GPUs on Windows require this driver:");
+    event_log_warning (hashcat_ctx, "  \"AMD Radeon Software Crimson Edition\" (15.12 or later)");
+    #endif
+
+    event_log_warning (hashcat_ctx, "* Intel CPUs require this runtime:");
+    event_log_warning (hashcat_ctx, "  \"OpenCL Runtime for Intel Core and Intel Xeon Processors\" (16.1.1 or later)");
+
+    #if defined (__linux__)
+    event_log_warning (hashcat_ctx, "* Intel GPUs on Linux require this driver:");
+    event_log_warning (hashcat_ctx, "  \"OpenCL 2.0 GPU Driver Package for Linux\" (2.0 or later)");
+    #elif defined (_WIN)
+    event_log_warning (hashcat_ctx, "* Intel GPUs on Windows require this driver:");
+    event_log_warning (hashcat_ctx, "  \"OpenCL Driver for Intel Iris and Intel HD Graphics\"");
+    #endif
+
+    event_log_warning (hashcat_ctx, "* NVIDIA GPUs require this runtime and/or driver (both):");
+    event_log_warning (hashcat_ctx, "  \"NVIDIA Driver\" (418.56 or later)");
+    event_log_warning (hashcat_ctx, "  \"CUDA Toolkit\" (10.1 or later)");
+    event_log_warning (hashcat_ctx, NULL);
+
+    return -1;
+  }
+
+  backend_ctx->enabled = true;
+
+  return 0;
+}
+
+void backend_ctx_destroy (hashcat_ctx_t *hashcat_ctx)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  if (backend_ctx->enabled == false) return;
+
+  hcfree (backend_ctx->devices_param);
+
+  if (backend_ctx->ocl)
+  {
+    hcfree (backend_ctx->opencl_platforms);
+    hcfree (backend_ctx->opencl_platforms_devices);
+    hcfree (backend_ctx->opencl_platforms_devices_cnt);
+    hcfree (backend_ctx->opencl_platforms_name);
+    hcfree (backend_ctx->opencl_platforms_vendor);
+    hcfree (backend_ctx->opencl_platforms_vendor_id);
+    hcfree (backend_ctx->opencl_platforms_version);
+  }
+
+  nvrtc_close (hashcat_ctx);
+  cuda_close  (hashcat_ctx);
+  ocl_close   (hashcat_ctx);
+
+  memset (backend_ctx, 0, sizeof (backend_ctx_t));
+}
+
+int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
+{
+  backend_ctx_t  *backend_ctx  = hashcat_ctx->backend_ctx;
+  user_options_t *user_options = hashcat_ctx->user_options;
+
+  if (backend_ctx->enabled == false) return 0;
+
+  hc_device_param_t *devices_param = backend_ctx->devices_param;
+
+  bool need_adl     = false;
+  bool need_nvml    = false;
+  bool need_nvapi   = false;
+  bool need_sysfs   = false;
+
+  int backend_devices_idx = 0;
+
+  int cuda_devices_cnt    = 0;
+  int cuda_devices_active = 0;
+
+  int CL_rc;
+  int CU_rc;
+
+  if (backend_ctx->cuda)
+  {
+    // device count
+
+    const int rc_cuDeviceGetCount = hc_cuDeviceGetCount (hashcat_ctx, &cuda_devices_cnt);
+
+    if (rc_cuDeviceGetCount == -1)
+    {
+      cuda_close (hashcat_ctx);
+    }
+
+    backend_ctx->cuda_devices_cnt = cuda_devices_cnt;
+
+    // device specific
+
+    for (int cuda_devices_idx = 0; cuda_devices_idx < cuda_devices_cnt; cuda_devices_idx++, backend_devices_idx++)
+    {
+      const u32 device_id = backend_devices_idx;
+
+      hc_device_param_t *device_param = &devices_param[backend_devices_idx];
+
+      device_param->device_id = device_id;
+
+      backend_ctx->backend_device_from_cuda[cuda_devices_idx] = backend_devices_idx;
+
+      CUdevice cuda_device;
+
+      CU_rc = hc_cuDeviceGet (hashcat_ctx, &cuda_device, cuda_devices_idx);
+
+      if (CU_rc == -1) return -1;
+
+      device_param->cuda_device = cuda_device;
+
+      device_param->is_cuda = true;
+
+      // device_name
+
+      char *device_name = (char *) hcmalloc (HCBUFSIZ_TINY);
+
+      CU_rc = hc_cuDeviceGetName (hashcat_ctx, device_name, HCBUFSIZ_TINY, cuda_device);
+
+      if (CU_rc == -1) return -1;
+
+      device_param->device_name = device_name;
+
+      hc_string_trim_leading (device_name);
+
+      hc_string_trim_trailing (device_name);
+
+      // device_processors
+
+      int device_processors = 0;
+
+      CU_rc = hc_cuDeviceGetAttribute (hashcat_ctx, &device_processors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, cuda_device);
+
+      if (CU_rc == -1) return -1;
+
+      device_param->device_processors = device_processors;
+
+      // device_global_mem, device_maxmem_alloc, device_available_mem
+
+      size_t bytes = 0;
+
+      CU_rc = hc_cuDeviceTotalMem (hashcat_ctx, &bytes, cuda_device);
+
+      if (CU_rc == -1) return -1;
+
+      device_param->device_global_mem = (u64) bytes;
+
+      device_param->device_maxmem_alloc = (u64) bytes;
+
+      device_param->device_available_mem = 0;
+
+      // warp size
+
+      int cuda_warp_size = 0;
+
+      CU_rc = hc_cuDeviceGetAttribute (hashcat_ctx, &cuda_warp_size, CU_DEVICE_ATTRIBUTE_WARP_SIZE, cuda_device);
+
+      if (CU_rc == -1) return -1;
+
+      device_param->cuda_warp_size = cuda_warp_size;
+
+      // sm_minor, sm_major
+
+      int sm_major = 0;
+      int sm_minor = 0;
+
+      CU_rc = hc_cuDeviceGetAttribute (hashcat_ctx, &sm_major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuda_device);
+
+      if (CU_rc == -1) return -1;
+
+      CU_rc = hc_cuDeviceGetAttribute (hashcat_ctx, &sm_minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuda_device);
+
+      if (CU_rc == -1) return -1;
+
+      device_param->sm_major = sm_major;
+      device_param->sm_minor = sm_minor;
+
+      // device_maxworkgroup_size
+
+      int device_maxworkgroup_size = 0;
+
+      CU_rc = hc_cuDeviceGetAttribute (hashcat_ctx, &device_maxworkgroup_size, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuda_device);
+
+      if (CU_rc == -1) return -1;
+
+      device_param->device_maxworkgroup_size = device_maxworkgroup_size;
+
+      // max_clock_frequency
+
+      int device_maxclock_frequency = 0;
+
+      CU_rc = hc_cuDeviceGetAttribute (hashcat_ctx, &device_maxclock_frequency, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, cuda_device);
+
+      if (CU_rc == -1) return -1;
+
+      device_param->device_maxclock_frequency = device_maxclock_frequency / 1000;
+
+      // pcie_bus, pcie_device, pcie_function
+
+      int pci_bus_id_nv  = 0;
+      int pci_slot_id_nv = 0;
+
+      CU_rc = hc_cuDeviceGetAttribute (hashcat_ctx, &pci_bus_id_nv, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, cuda_device);
+
+      if (CU_rc == -1) return -1;
+
+      CU_rc = hc_cuDeviceGetAttribute (hashcat_ctx, &pci_slot_id_nv, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, cuda_device);
+
+      if (CU_rc == -1) return -1;
+
+      device_param->pcie_bus      = (u8) (pci_bus_id_nv);
+      device_param->pcie_device   = (u8) (pci_slot_id_nv >> 3);
+      device_param->pcie_function = (u8) (pci_slot_id_nv & 7);
+
+      // kernel_exec_timeout
+
+      int kernel_exec_timeout = 0;
+
+      CU_rc = hc_cuDeviceGetAttribute (hashcat_ctx, &kernel_exec_timeout, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, cuda_device);
+
+      if (CU_rc == -1) return -1;
+
+      device_param->kernel_exec_timeout = kernel_exec_timeout;
+
+      // max_shared_memory_per_block
+
+      int max_shared_memory_per_block = 0;
+
+      CU_rc = hc_cuDeviceGetAttribute (hashcat_ctx, &max_shared_memory_per_block, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, cuda_device);
+
+      if (CU_rc == -1) return -1;
+
+      if (max_shared_memory_per_block < 32768)
+      {
+        event_log_error (hashcat_ctx, "* Device #%u: This device's shared buffer size is too small.", device_id + 1);
+
+        device_param->skipped = true;
+      }
+
+      device_param->device_local_mem_size = max_shared_memory_per_block;
+
+      // device_max_constant_buffer_size
+
+      int device_max_constant_buffer_size = 0;
+
+      CU_rc = hc_cuDeviceGetAttribute (hashcat_ctx, &device_max_constant_buffer_size, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, cuda_device);
+
+      if (CU_rc == -1) return -1;
+
+      if (device_max_constant_buffer_size < 65536)
+      {
+        event_log_error (hashcat_ctx, "* Device #%u: This device's local mem size is too small.", device_id + 1);
+
+        device_param->skipped = true;
+      }
+
+      // some attributes have to be hardcoded because they are used for instance in the build options
+
+      device_param->device_local_mem_type     = CL_LOCAL;
+      device_param->opencl_device_type        = CL_DEVICE_TYPE_GPU;
+      device_param->opencl_device_vendor_id   = VENDOR_ID_NV;
+      device_param->opencl_platform_vendor_id = VENDOR_ID_NV;
+
+      // or in the cached kernel checksum
+
+      device_param->opencl_device_version     = "";
+      device_param->opencl_driver_version     = "";
+
+      // or just to make sure they are not NULL
+
+      device_param->opencl_device_vendor     = "";
+      device_param->opencl_device_c_version  = "";
+
+      // skipped
+
+      if ((backend_ctx->backend_devices_filter & (1ULL << device_id)) == 0)
+      {
+        device_param->skipped = true;
+      }
+
+      if ((backend_ctx->opencl_device_types_filter & CL_DEVICE_TYPE_GPU) == 0)
+      {
+        device_param->skipped = true;
+      }
+
+      if ((device_param->opencl_platform_vendor_id == VENDOR_ID_NV) && (device_param->opencl_device_vendor_id == VENDOR_ID_NV))
+      {
+        need_nvml = true;
+
+        #if defined (_WIN) || defined (__CYGWIN__)
+        need_nvapi = true;
+        #endif
+      }
+
+      // CPU burning loop damper
+      // Value is given as number between 0-100
+      // By default 8%
+      // in theory not needed with CUDA
+
+      device_param->spin_damp = (double) user_options->spin_damp / 100;
+
+      // common driver check
+
+      if (device_param->skipped == false)
+      {
+        if ((user_options->force == false) && (user_options->backend_info == false))
+        {
+          // CUDA does not support query nvidia driver version, therefore no driver checks here
+          // IF needed, could be retrieved using nvmlSystemGetDriverVersion()
+
+          if (device_param->sm_major < 5)
+          {
+            if (user_options->quiet == false) event_log_warning (hashcat_ctx, "* Device #%u: This hardware has outdated CUDA compute capability (%u.%u).", device_id + 1, device_param->sm_major, device_param->sm_minor);
+            if (user_options->quiet == false) event_log_warning (hashcat_ctx, "             For modern OpenCL performance, upgrade to hardware that supports");
+            if (user_options->quiet == false) event_log_warning (hashcat_ctx, "             CUDA compute capability version 5.0 (Maxwell) or higher.");
+          }
+
+          if (device_param->kernel_exec_timeout != 0)
+          {
+            if (user_options->quiet == false) event_log_warning (hashcat_ctx, "* Device #%u: WARNING! Kernel exec timeout is not disabled.", device_id + 1);
+            if (user_options->quiet == false) event_log_warning (hashcat_ctx, "             This may cause \"CL_OUT_OF_RESOURCES\" or related errors.");
+            if (user_options->quiet == false) event_log_warning (hashcat_ctx, "             To disable the timeout, see: https://hashcat.net/q/timeoutpatch");
+          }
+        }
+
+        /**
+         * activate device
+         */
+
+        cuda_devices_active++;
+      }
+
+      CUcontext cuda_context;
+
+      const int rc_cuCtxCreate = hc_cuCtxCreate (hashcat_ctx, &cuda_context, CU_CTX_SCHED_BLOCKING_SYNC, device_param->cuda_device);
+
+      if (rc_cuCtxCreate == -1) return -1;
+
+      const int rc_cuCtxSetCurrent = hc_cuCtxSetCurrent (hashcat_ctx, cuda_context);
+
+      if (rc_cuCtxSetCurrent == -1) return -1;
+
+      // bcrypt optimization?
+      //const int rc_cuCtxSetCacheConfig = hc_cuCtxSetCacheConfig (hashcat_ctx, CU_FUNC_CACHE_PREFER_SHARED);
+      //
+      //if (rc_cuCtxSetCacheConfig == -1) return -1;
+
+      const bool has_bfe = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"bfe.u32 %0, 0, 0, 0;\" : \"=r\"(r)); }");
+
+      device_param->has_bfe = has_bfe;
+
+      const bool has_lop3 = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"lop3.b32 %0, 0, 0, 0, 0;\" : \"=r\"(r)); }");
+
+      device_param->has_lop3 = has_lop3;
+
+      const bool has_mov64 = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned long long r; unsigned int a; unsigned int b; asm volatile (\"mov.b64 %0, {%1, %2};\" : \"=l\"(r) : \"r\"(a), \"r\"(b)); }");
+
+      device_param->has_mov64 = has_mov64;
+
+      const bool has_prmt = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"prmt.b32 %0, 0, 0, 0;\" : \"=r\"(r)); }");
+
+      device_param->has_prmt = has_prmt;
+
+      // device_available_mem
+
+      size_t free  = 0;
+      size_t total = 0;
+
+      const int rc_cuMemGetInfo = hc_cuMemGetInfo (hashcat_ctx, &free, &total);
+
+      if (rc_cuMemGetInfo == -1) return -1;
+
+      device_param->device_available_mem = (u64) free;
+
+      const int rc_cuCtxDestroy = hc_cuCtxDestroy (hashcat_ctx, cuda_context);
+
+      if (rc_cuCtxDestroy == -1) return -1;
+    }
+  }
+
+  backend_ctx->cuda_devices_cnt     = cuda_devices_cnt;
+  backend_ctx->cuda_devices_active  = cuda_devices_active;
+
+  int opencl_devices_cnt    = 0;
+  int opencl_devices_active = 0;
+
+  if (backend_ctx->ocl)
+  {
+    /**
+     * OpenCL devices: simply push all devices from all platforms into the same device array
+     */
+
+    cl_uint         opencl_platforms_cnt         = backend_ctx->opencl_platforms_cnt;
+    cl_device_id  **opencl_platforms_devices     = backend_ctx->opencl_platforms_devices;
+    cl_uint        *opencl_platforms_devices_cnt = backend_ctx->opencl_platforms_devices_cnt;
+    cl_uint        *opencl_platforms_vendor_id   = backend_ctx->opencl_platforms_vendor_id;
+
+    for (u32 opencl_platforms_idx = 0; opencl_platforms_idx < opencl_platforms_cnt; opencl_platforms_idx++)
+    {
+      cl_device_id   *opencl_platform_devices     = opencl_platforms_devices[opencl_platforms_idx];
+      cl_uint         opencl_platform_devices_cnt = opencl_platforms_devices_cnt[opencl_platforms_idx];
+      cl_uint         opencl_platform_vendor_id   = opencl_platforms_vendor_id[opencl_platforms_idx];
+
+      for (u32 opencl_platform_devices_idx = 0; opencl_platform_devices_idx < opencl_platform_devices_cnt; opencl_platform_devices_idx++, backend_devices_idx++, opencl_devices_cnt++)
+      {
+        const u32 device_id = backend_devices_idx;
+
+        hc_device_param_t *device_param = &devices_param[device_id];
+
+        device_param->device_id = device_id;
+
+        backend_ctx->backend_device_from_opencl[opencl_devices_cnt] = backend_devices_idx;
+
+        backend_ctx->backend_device_from_opencl_platform[opencl_platforms_idx][opencl_platform_devices_idx] = backend_devices_idx;
+
+        device_param->opencl_platform_vendor_id = opencl_platform_vendor_id;
+
+        device_param->opencl_device = opencl_platform_devices[opencl_platform_devices_idx];
+
+        //device_param->opencl_platform = opencl_platform;
+
+        device_param->is_opencl = true;
+
+        size_t param_value_size = 0;
+
+        // opencl_device_type
+
+        cl_device_type opencl_device_type;
+
+        CL_rc = hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_TYPE, sizeof (opencl_device_type), &opencl_device_type, NULL);
+
+        if (CL_rc == -1) return -1;
+
+        opencl_device_type &= ~CL_DEVICE_TYPE_DEFAULT;
+
+        device_param->opencl_device_type = opencl_device_type;
+
+        // device_name
+
+        CL_rc = hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_NAME, 0, NULL, &param_value_size);
+
+        if (CL_rc == -1) return -1;
+
+        char *device_name = (char *) hcmalloc (param_value_size);
+
+        CL_rc = hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_NAME, param_value_size, device_name, NULL);
+
+        if (CL_rc == -1) return -1;
+
+        device_param->device_name = device_name;
+
+        hc_string_trim_leading (device_param->device_name);
+
+        hc_string_trim_trailing (device_param->device_name);
+
+        // device_vendor
+
+        CL_rc = hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_VENDOR, 0, NULL, &param_value_size);
+
+        if (CL_rc == -1) return -1;
+
+        char *opencl_device_vendor = (char *) hcmalloc (param_value_size);
+
+        CL_rc = hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_VENDOR, param_value_size, opencl_device_vendor, NULL);
+
+        if (CL_rc == -1) return -1;
+
+        device_param->opencl_device_vendor = opencl_device_vendor;
+
+        cl_uint opencl_device_vendor_id = 0;
+
+        if (strcmp (opencl_device_vendor, CL_VENDOR_AMD1) == 0)
+        {
+          opencl_device_vendor_id = VENDOR_ID_AMD;
+        }
+        else if (strcmp (opencl_device_vendor, CL_VENDOR_AMD2) == 0)
+        {
+          opencl_device_vendor_id = VENDOR_ID_AMD;
+        }
+        else if (strcmp (opencl_device_vendor, CL_VENDOR_AMD_USE_INTEL) == 0)
+        {
+          opencl_device_vendor_id = VENDOR_ID_AMD_USE_INTEL;
+        }
+        else if (strcmp (opencl_device_vendor, CL_VENDOR_APPLE) == 0)
+        {
+          opencl_device_vendor_id = VENDOR_ID_APPLE;
+        }
+        else if (strcmp (opencl_device_vendor, CL_VENDOR_APPLE_USE_AMD) == 0)
+        {
+          opencl_device_vendor_id = VENDOR_ID_AMD;
+        }
+        else if (strcmp (opencl_device_vendor, CL_VENDOR_APPLE_USE_NV) == 0)
+        {
+          opencl_device_vendor_id = VENDOR_ID_NV;
+        }
+        else if (strcmp (opencl_device_vendor, CL_VENDOR_APPLE_USE_INTEL) == 0)
+        {
+          opencl_device_vendor_id = VENDOR_ID_INTEL_SDK;
+        }
+        else if (strcmp (opencl_device_vendor, CL_VENDOR_INTEL_BEIGNET) == 0)
+        {
+          opencl_device_vendor_id = VENDOR_ID_INTEL_BEIGNET;
+        }
+        else if (strcmp (opencl_device_vendor, CL_VENDOR_INTEL_SDK) == 0)
+        {
+          opencl_device_vendor_id = VENDOR_ID_INTEL_SDK;
+        }
+        else if (strcmp (opencl_device_vendor, CL_VENDOR_MESA) == 0)
+        {
+          opencl_device_vendor_id = VENDOR_ID_MESA;
+        }
+        else if (strcmp (opencl_device_vendor, CL_VENDOR_NV) == 0)
+        {
+          opencl_device_vendor_id = VENDOR_ID_NV;
+        }
+        else if (strcmp (opencl_device_vendor, CL_VENDOR_POCL) == 0)
+        {
+          opencl_device_vendor_id = VENDOR_ID_POCL;
+        }
+        else
+        {
+          opencl_device_vendor_id = VENDOR_ID_GENERIC;
+        }
+
+        device_param->opencl_device_vendor_id = opencl_device_vendor_id;
+
+        // device_version
+
+        CL_rc = hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_VERSION, 0, NULL, &param_value_size);
+
+        if (CL_rc == -1) return -1;
+
+        char *opencl_device_version = (char *) hcmalloc (param_value_size);
+
+        CL_rc = hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_VERSION, param_value_size, opencl_device_version, NULL);
+
+        if (CL_rc == -1) return -1;
+
+        device_param->opencl_device_version = opencl_device_version;
+
+        // opencl_device_c_version
+
+        CL_rc = hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_OPENCL_C_VERSION, 0, NULL, &param_value_size);
+
+        if (CL_rc == -1) return -1;
+
+        char *opencl_device_c_version = (char *) hcmalloc (param_value_size);
+
+        CL_rc = hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_OPENCL_C_VERSION, param_value_size, opencl_device_c_version, NULL);
+
+        if (CL_rc == -1) return -1;
+
+        device_param->opencl_device_c_version = opencl_device_c_version;
+
+        // max_compute_units
+
+        cl_uint device_processors;
+
+        CL_rc = hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof (device_processors), &device_processors, NULL);
+
+        if (CL_rc == -1) return -1;
+
+        device_param->device_processors = device_processors;
+
+        // device_global_mem
+
+        cl_ulong device_global_mem;
+
+        CL_rc = hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof (device_global_mem), &device_global_mem, NULL);
+
+        if (CL_rc == -1) return -1;
+
+        device_param->device_global_mem = device_global_mem;
+
+        device_param->device_available_mem = 0;
+
+        // device_maxmem_alloc
+
+        cl_ulong device_maxmem_alloc;
+
+        CL_rc = hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof (device_maxmem_alloc), &device_maxmem_alloc, NULL);
+
+        if (CL_rc == -1) return -1;
+
+        device_param->device_maxmem_alloc = device_maxmem_alloc;
+
+        // note we'll limit to 2gb, otherwise this causes all kinds of weird errors because of possible integer overflows in opencl runtimes
+        // testwise disabling that
+        //device_param->device_maxmem_alloc = MIN (device_maxmem_alloc, 0x7fffffff);
+
+        // max_work_group_size
+
+        size_t device_maxworkgroup_size;
+
+        CL_rc = hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof (device_maxworkgroup_size), &device_maxworkgroup_size, NULL);
+
+        if (CL_rc == -1) return -1;
+
+        device_param->device_maxworkgroup_size = device_maxworkgroup_size;
+
+        // max_clock_frequency
+
+        cl_uint device_maxclock_frequency;
+
+        CL_rc = hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof (device_maxclock_frequency), &device_maxclock_frequency, NULL);
+
+        if (CL_rc == -1) return -1;
+
+        device_param->device_maxclock_frequency = device_maxclock_frequency;
+
+        // device_endian_little
+
+        cl_bool device_endian_little;
+
+        CL_rc = hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_ENDIAN_LITTLE, sizeof (device_endian_little), &device_endian_little, NULL);
+
+        if (CL_rc == -1) return -1;
+
+        if (device_endian_little == CL_FALSE)
+        {
+          event_log_error (hashcat_ctx, "* Device #%u: This device is not little-endian.", device_id + 1);
+
+          device_param->skipped = true;
+        }
+
+        // device_available
+
+        cl_bool device_available;
+
+        CL_rc = hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_AVAILABLE, sizeof (device_available), &device_available, NULL);
+
+        if (CL_rc == -1) return -1;
+
+        if (device_available == CL_FALSE)
+        {
+          event_log_error (hashcat_ctx, "* Device #%u: This device is not available.", device_id + 1);
+
+          device_param->skipped = true;
+        }
+
+        // device_compiler_available
+
+        cl_bool device_compiler_available;
+
+        CL_rc = hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_COMPILER_AVAILABLE, sizeof (device_compiler_available), &device_compiler_available, NULL);
+
+        if (CL_rc == -1) return -1;
+
+        if (device_compiler_available == CL_FALSE)
+        {
+          event_log_error (hashcat_ctx, "* Device #%u: No compiler is available for this device.", device_id + 1);
+
+          device_param->skipped = true;
+        }
+
+        // device_execution_capabilities
+
+        cl_device_exec_capabilities device_execution_capabilities;
+
+        CL_rc = hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_EXECUTION_CAPABILITIES, sizeof (device_execution_capabilities), &device_execution_capabilities, NULL);
+
+        if (CL_rc == -1) return -1;
+
+        if ((device_execution_capabilities & CL_EXEC_KERNEL) == 0)
+        {
+          event_log_error (hashcat_ctx, "* Device #%u: This device does not support executing kernels.", device_id + 1);
+
+          device_param->skipped = true;
+        }
+
+        // device_extensions
+
+        size_t device_extensions_size;
+
+        CL_rc = hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_EXTENSIONS, 0, NULL, &device_extensions_size);
+
+        if (CL_rc == -1) return -1;
+
+        char *device_extensions = hcmalloc (device_extensions_size + 1);
+
+        CL_rc = hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_EXTENSIONS, device_extensions_size, device_extensions, NULL);
+
+        if (CL_rc == -1) return -1;
+
+        if (strstr (device_extensions, "base_atomics") == 0)
+        {
+          event_log_error (hashcat_ctx, "* Device #%u: This device does not support base atomics.", device_id + 1);
+
+          device_param->skipped = true;
+        }
+
+        if (strstr (device_extensions, "byte_addressable_store") == 0)
+        {
+          event_log_error (hashcat_ctx, "* Device #%u: This device does not support byte-addressable store.", device_id + 1);
+
+          device_param->skipped = true;
+        }
+
+        hcfree (device_extensions);
+
+        // device_max_constant_buffer_size
+
+        cl_ulong device_max_constant_buffer_size;
+
+        CL_rc = hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof (device_max_constant_buffer_size), &device_max_constant_buffer_size, NULL);
+
+        if (CL_rc == -1) return -1;
+
+        if (device_max_constant_buffer_size < 65536)
+        {
+          event_log_error (hashcat_ctx, "* Device #%u: This device's constant buffer size is too small.", device_id + 1);
+
+          device_param->skipped = true;
+        }
+
+        // device_local_mem_size
+
+        cl_ulong device_local_mem_size;
+
+        CL_rc = hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof (device_local_mem_size), &device_local_mem_size, NULL);
+
+        if (CL_rc == -1) return -1;
+
+        if (device_local_mem_size < 32768)
+        {
+          event_log_error (hashcat_ctx, "* Device #%u: This device's local mem size is too small.", device_id + 1);
+
+          device_param->skipped = true;
+        }
+
+        device_param->device_local_mem_size = device_local_mem_size;
+
+        // device_local_mem_type
+
+        cl_device_local_mem_type device_local_mem_type;
+
+        CL_rc = hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_LOCAL_MEM_TYPE, sizeof (device_local_mem_type), &device_local_mem_type, NULL);
+
+        if (CL_rc == -1) return -1;
+
+        device_param->device_local_mem_type = device_local_mem_type;
+
+        // If there's both an Intel CPU and an AMD OpenCL runtime it's a tricky situation
+        // Both platforms support CPU device types and therefore both will try to use 100% of the physical resources
+        // This results in both utilizing it for 50%
+        // However, Intel has much better SIMD control over their own hardware
+        // It makes sense to give them full control over their own hardware
+
+        if (opencl_device_type & CL_DEVICE_TYPE_CPU)
+        {
+          if (device_param->opencl_device_vendor_id == VENDOR_ID_AMD_USE_INTEL)
+          {
+            if (user_options->force == false)
+            {
+              if (user_options->quiet == false) event_log_warning (hashcat_ctx, "* Device #%u: Not a native Intel OpenCL runtime. Expect massive speed loss.", device_id + 1);
+              if (user_options->quiet == false) event_log_warning (hashcat_ctx, "             You can use --force to override, but do not report related errors.");
+
+              device_param->skipped = true;
+            }
+          }
+        }
+
+        // Since some times we get reports from users about not working hashcat, dropping error messages like:
+        // CL_INVALID_COMMAND_QUEUE and CL_OUT_OF_RESOURCES
+        // Turns out that this is caused by Intel OpenCL runtime handling their GPU devices
+        // Disable such devices unless the user forces to use it
+
+        #if !defined (__APPLE__)
+        if (opencl_device_type & CL_DEVICE_TYPE_GPU)
+        {
+          if ((device_param->opencl_device_vendor_id == VENDOR_ID_INTEL_SDK) || (device_param->opencl_device_vendor_id == VENDOR_ID_INTEL_BEIGNET))
+          {
+            if (user_options->force == false)
+            {
+              if (user_options->quiet == false) event_log_warning (hashcat_ctx, "* Device #%u: Intel's OpenCL runtime (GPU only) is currently broken.", device_id + 1);
+              if (user_options->quiet == false) event_log_warning (hashcat_ctx, "             We are waiting for updated OpenCL drivers from Intel.");
+              if (user_options->quiet == false) event_log_warning (hashcat_ctx, "             You can use --force to override, but do not report related errors.");
+
+              device_param->skipped = true;
+            }
+          }
+        }
+        #endif // __APPLE__
+
+        // skipped
+
+        if ((backend_ctx->backend_devices_filter & (1ULL << device_id)) == 0)
+        {
+          device_param->skipped = true;
+        }
+
+        if ((backend_ctx->opencl_device_types_filter & (opencl_device_type)) == 0)
+        {
+          device_param->skipped = true;
+        }
+
+        // driver_version
+
+        CL_rc = hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DRIVER_VERSION, 0, NULL, &param_value_size);
+
+        if (CL_rc == -1) return -1;
+
+        char *opencl_driver_version = (char *) hcmalloc (param_value_size);
+
+        CL_rc = hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DRIVER_VERSION, param_value_size, opencl_driver_version, NULL);
+
+        if (CL_rc == -1) return -1;
+
+        device_param->opencl_driver_version = opencl_driver_version;
+
+        // vendor specific
+
+        if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU)
+        {
+          if ((device_param->opencl_platform_vendor_id == VENDOR_ID_AMD) && (device_param->opencl_device_vendor_id == VENDOR_ID_AMD))
+          {
+            need_adl = true;
+
+            #if defined (__linux__)
+            need_sysfs = true;
+            #endif
+          }
+
+          if ((device_param->opencl_platform_vendor_id == VENDOR_ID_NV) && (device_param->opencl_device_vendor_id == VENDOR_ID_NV))
+          {
+            need_nvml = true;
+
+            #if defined (_WIN) || defined (__CYGWIN__)
+            need_nvapi = true;
+            #endif
+          }
+        }
+
+        if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU)
+        {
+          if ((device_param->opencl_platform_vendor_id == VENDOR_ID_AMD) && (device_param->opencl_device_vendor_id == VENDOR_ID_AMD))
+          {
+            cl_device_topology_amd amdtopo;
+
+            CL_rc = hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_TOPOLOGY_AMD, sizeof (amdtopo), &amdtopo, NULL);
+
+            if (CL_rc == -1) return -1;
+
+            device_param->pcie_bus      = amdtopo.pcie.bus;
+            device_param->pcie_device   = amdtopo.pcie.device;
+            device_param->pcie_function = amdtopo.pcie.function;
+          }
+
+          if ((device_param->opencl_platform_vendor_id == VENDOR_ID_NV) && (device_param->opencl_device_vendor_id == VENDOR_ID_NV))
+          {
+            cl_uint pci_bus_id_nv;  // is cl_uint the right type for them??
+            cl_uint pci_slot_id_nv;
+
+            CL_rc = hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_PCI_BUS_ID_NV, sizeof (pci_bus_id_nv), &pci_bus_id_nv, NULL);
+
+            if (CL_rc == -1) return -1;
+
+            CL_rc = hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_PCI_SLOT_ID_NV, sizeof (pci_slot_id_nv), &pci_slot_id_nv, NULL);
+
+            if (CL_rc == -1) return -1;
+
+            device_param->pcie_bus      = (u8) (pci_bus_id_nv);
+            device_param->pcie_device   = (u8) (pci_slot_id_nv >> 3);
+            device_param->pcie_function = (u8) (pci_slot_id_nv & 7);
+
+            int sm_minor = 0;
+            int sm_major = 0;
+
+            CL_rc = hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, sizeof (sm_minor), &sm_minor, NULL);
+
+            if (CL_rc == -1) return -1;
+
+            CL_rc = hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, sizeof (sm_major), &sm_major, NULL);
+
+            if (CL_rc == -1) return -1;
+
+            device_param->sm_minor = sm_minor;
+            device_param->sm_major = sm_major;
+
+            cl_uint kernel_exec_timeout = 0;
+
+            CL_rc = hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV, sizeof (kernel_exec_timeout), &kernel_exec_timeout, NULL);
+
+            if (CL_rc == -1) return -1;
+
+            device_param->kernel_exec_timeout = kernel_exec_timeout;
+
+            // CPU burning loop damper
+            // Value is given as number between 0-100
+            // By default 8%
+
+            device_param->spin_damp = (double) user_options->spin_damp / 100;
+
+            // recommend CUDA
+
+            if ((backend_ctx->cuda == NULL) || (backend_ctx->nvrtc == NULL))
+            {
+              event_log_warning (hashcat_ctx, "* Device #%u: CUDA SDK Toolkit installation NOT detected.", device_id + 1);
+              event_log_warning (hashcat_ctx, "             CUDA SDK Toolkit installation required for proper device support and utilization");
+              event_log_warning (hashcat_ctx, "             Falling back to OpenCL Runtime");
+
+              event_log_warning (hashcat_ctx, NULL);
+            }
+          }
+        }
+
+        // common driver check
+
+        if (device_param->skipped == false)
+        {
+          if ((user_options->force == false) && (user_options->backend_info == false))
+          {
+            if (opencl_device_type & CL_DEVICE_TYPE_CPU)
+            {
+              if (device_param->opencl_platform_vendor_id == VENDOR_ID_INTEL_SDK)
+              {
+                bool intel_warn = false;
+
+                // Intel OpenCL runtime 18
+
+                int opencl_driver1 = 0;
+                int opencl_driver2 = 0;
+                int opencl_driver3 = 0;
+                int opencl_driver4 = 0;
+
+                const int res18 = sscanf (device_param->opencl_driver_version, "%u.%u.%u.%u", &opencl_driver1, &opencl_driver2, &opencl_driver3, &opencl_driver4);
+
+                if (res18 == 4)
+                {
+                  // so far all versions 18 are ok
+                }
+                else
+                {
+                  // Intel OpenCL runtime 16
+
+                  float opencl_version = 0;
+                  int   opencl_build   = 0;
+
+                  const int res16 = sscanf (device_param->opencl_device_version, "OpenCL %f (Build %d)", &opencl_version, &opencl_build);
+
+                  if (res16 == 2)
+                  {
+                    if (opencl_build < 25) intel_warn = true;
+                  }
+                }
+
+                if (intel_warn == true)
+                {
+                  event_log_error (hashcat_ctx, "* Device #%u: Outdated or broken Intel OpenCL runtime '%s' detected!", device_id + 1, device_param->opencl_driver_version);
+
+                  event_log_warning (hashcat_ctx, "You are STRONGLY encouraged to use the officially supported NVIDIA driver.");
+                  event_log_warning (hashcat_ctx, "See hashcat.net for officially supported NVIDIA drivers.");
+                  event_log_warning (hashcat_ctx, "See also: https://hashcat.net/faq/wrongdriver");
+                  event_log_warning (hashcat_ctx, "You can use --force to override this, but do not report related errors.");
+                  event_log_warning (hashcat_ctx, NULL);
+
+                  return -1;
+                }
+              }
+            }
+            else if (opencl_device_type & CL_DEVICE_TYPE_GPU)
+            {
+              if (device_param->opencl_platform_vendor_id == VENDOR_ID_AMD)
+              {
+                bool amd_warn = true;
+
+                #if defined (__linux__)
+                // AMDGPU-PRO Driver 16.40 and higher
+                if (strtoul (device_param->opencl_driver_version, NULL, 10) >= 2117) amd_warn = false;
+                // AMDGPU-PRO Driver 16.50 is known to be broken
+                if (strtoul (device_param->opencl_driver_version, NULL, 10) == 2236) amd_warn = true;
+                // AMDGPU-PRO Driver 16.60 is known to be broken
+                if (strtoul (device_param->opencl_driver_version, NULL, 10) == 2264) amd_warn = true;
+                // AMDGPU-PRO Driver 17.10 is known to be broken
+                if (strtoul (device_param->opencl_driver_version, NULL, 10) == 2348) amd_warn = true;
+                // AMDGPU-PRO Driver 17.20 (2416) is fine, doesn't need check will match >= 2117
+                #elif defined (_WIN)
+                // AMD Radeon Software 14.9 and higher, should be updated to 15.12
+                if (strtoul (device_param->opencl_driver_version, NULL, 10) >= 1573) amd_warn = false;
+                #else
+                // we have no information about other os
+                if (amd_warn == true) amd_warn = false;
+                #endif
+
+                if (amd_warn == true)
+                {
+                  event_log_error (hashcat_ctx, "* Device #%u: Outdated or broken AMD driver '%s' detected!", device_id + 1, device_param->opencl_driver_version);
+
+                  event_log_warning (hashcat_ctx, "You are STRONGLY encouraged to use the officially supported AMD driver.");
+                  event_log_warning (hashcat_ctx, "See hashcat.net for officially supported AMD drivers.");
+                  event_log_warning (hashcat_ctx, "See also: https://hashcat.net/faq/wrongdriver");
+                  event_log_warning (hashcat_ctx, "You can use --force to override this, but do not report related errors.");
+                  event_log_warning (hashcat_ctx, NULL);
+
+                  return -1;
+                }
+              }
+
+              if (device_param->opencl_platform_vendor_id == VENDOR_ID_NV)
+              {
+                int nv_warn = true;
+
+                int version_maj = 0;
+                int version_min = 0;
+
+                const int r = sscanf (device_param->opencl_driver_version, "%d.%d", &version_maj, &version_min);
+
+                if (r == 2)
+                {
+                  if (version_maj >= 367)
+                  {
+                    if (version_maj == 418)
+                    {
+                      // older 418.x versions are known to be broken.
+                      // for instance, NVIDIA-Linux-x86_64-418.43.run
+                      // run ./hashcat -b -m 2501 results in self-test fail
+
+                      if (version_min >= 56)
+                      {
+                        nv_warn = false;
+                      }
+                    }
+                    else
+                    {
+                      nv_warn = false;
+                    }
+                  }
+                }
+                else
+                {
+                  // unknown version scheme, probably new driver version
+
+                  nv_warn = false;
+                }
+
+                if (nv_warn == true)
+                {
+                  event_log_warning (hashcat_ctx, "* Device #%u: Outdated or broken NVIDIA driver '%s' detected!", device_id + 1, device_param->opencl_driver_version);
+                  event_log_warning (hashcat_ctx, NULL);
+
+                  event_log_warning (hashcat_ctx, "You are STRONGLY encouraged to use the officially supported NVIDIA driver.");
+                  event_log_warning (hashcat_ctx, "See hashcat's homepage for officially supported NVIDIA drivers.");
+                  event_log_warning (hashcat_ctx, "See also: https://hashcat.net/faq/wrongdriver");
+                  event_log_warning (hashcat_ctx, "You can use --force to override this, but do not report related errors.");
+                  event_log_warning (hashcat_ctx, NULL);
+
+                  return -1;
+                }
+
+                if (device_param->sm_major < 5)
+                {
+                  if (user_options->quiet == false) event_log_warning (hashcat_ctx, "* Device #%u: This hardware has outdated CUDA compute capability (%u.%u).", device_id + 1, device_param->sm_major, device_param->sm_minor);
+                  if (user_options->quiet == false) event_log_warning (hashcat_ctx, "             For modern OpenCL performance, upgrade to hardware that supports");
+                  if (user_options->quiet == false) event_log_warning (hashcat_ctx, "             CUDA compute capability version 5.0 (Maxwell) or higher.");
+                }
+
+                if (device_param->kernel_exec_timeout != 0)
+                {
+                  if (user_options->quiet == false) event_log_warning (hashcat_ctx, "* Device #%u: WARNING! Kernel exec timeout is not disabled.", device_id + 1);
+                  if (user_options->quiet == false) event_log_warning (hashcat_ctx, "             This may cause \"CL_OUT_OF_RESOURCES\" or related errors.");
+                  if (user_options->quiet == false) event_log_warning (hashcat_ctx, "             To disable the timeout, see: https://hashcat.net/q/timeoutpatch");
+                }
+              }
+
+              if ((strstr (device_param->opencl_device_c_version, "beignet")) || (strstr (device_param->opencl_device_version, "beignet")))
+              {
+                event_log_error (hashcat_ctx, "* Device #%u: Intel beignet driver detected!", device_id + 1);
+
+                event_log_warning (hashcat_ctx, "The beignet driver has been marked as likely to fail kernel compilation.");
+                event_log_warning (hashcat_ctx, "You can use --force to override this, but do not report related errors.");
+                event_log_warning (hashcat_ctx, NULL);
+
+                return -1;
+              }
+            }
+          }
+
+          /**
+           * activate device
+           */
+
+          opencl_devices_active++;
+        }
+
+        /**
+         * create context for each device
+         */
+
+        cl_context context;
+
+        /*
+        cl_context_properties properties[3];
+
+        properties[0] = CL_CONTEXT_PLATFORM;
+        properties[1] = (cl_context_properties) device_param->opencl_platform;
+        properties[2] = 0;
+
+        CL_rc = hc_clCreateContext (hashcat_ctx, properties, 1, &device_param->opencl_device, NULL, NULL, &context);
+        */
+
+        CL_rc = hc_clCreateContext (hashcat_ctx, NULL, 1, &device_param->opencl_device, NULL, NULL, &context);
+
+        if (CL_rc == -1) return -1;
+
+        /**
+         * create command-queue
+         */
+
+        cl_command_queue command_queue;
+
+        CL_rc = hc_clCreateCommandQueue (hashcat_ctx, context, device_param->opencl_device, 0, &command_queue);
+
+        if (CL_rc == -1) return -1;
+
+        if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) && (device_param->opencl_platform_vendor_id == VENDOR_ID_AMD))
+        {
+          const bool has_vadd3 = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; __asm__ __volatile__ (\"V_ADD3_U32 %0, 0, 0, 0;\" : \"=v\"(r)); }");
+
+          device_param->has_vadd3 = has_vadd3;
+
+          const bool has_vbfe = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; __asm__ __volatile__ (\"V_BFE_U32 %0, 0, 0, 0;\" : \"=v\"(r)); }");
+
+          device_param->has_vbfe = has_vbfe;
+
+          const bool has_vperm = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; __asm__ __volatile__ (\"V_PERM_B32 %0, 0, 0, 0;\" : \"=v\"(r)); }");
+
+          device_param->has_vperm = has_vperm;
+        }
+
+        if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) && (device_param->opencl_platform_vendor_id == VENDOR_ID_NV))
+        {
+          const bool has_bfe = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"bfe.u32 %0, 0, 0, 0;\" : \"=r\"(r)); }");
+
+          device_param->has_bfe = has_bfe;
+
+          const bool has_lop3 = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"lop3.b32 %0, 0, 0, 0, 0;\" : \"=r\"(r)); }");
+
+          device_param->has_lop3 = has_lop3;
+
+          const bool has_mov64 = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { ulong r; uint a; uint b; asm volatile (\"mov.b64 %0, {%1, %2};\" : \"=l\"(r) : \"r\"(a), \"r\"(b)); }");
+
+          device_param->has_mov64 = has_mov64;
+
+          const bool has_prmt = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"prmt.b32 %0, 0, 0, 0;\" : \"=r\"(r)); }");
+
+          device_param->has_prmt = has_prmt;
+        }
+
+        // device_available_mem
+
+        #define MAX_ALLOC_CHECKS_CNT  8192
+        #define MAX_ALLOC_CHECKS_SIZE (64 * 1024 * 1024)
+
+        device_param->device_available_mem = device_param->device_global_mem - MAX_ALLOC_CHECKS_SIZE;
+
+        #if defined (_WIN)
+        if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) && (device_param->opencl_platform_vendor_id == VENDOR_ID_NV))
+        #else
+        if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) && ((device_param->opencl_platform_vendor_id == VENDOR_ID_NV) || (device_param->opencl_platform_vendor_id == VENDOR_ID_AMD)))
+        #endif
+        {
+          // OK, so the problem here is the following:
+          // There's just CL_DEVICE_GLOBAL_MEM_SIZE to ask OpenCL about the total memory on the device,
+          // but there's no way to ask for available memory on the device.
+          // In combination, most OpenCL runtimes implementation of clCreateBuffer()
+          // are doing so called lazy memory allocation on the device.
+          // Now, if the user has X11 (or a game or anything that takes a lot of GPU memory)
+          // running on the host we end up with an error type of this:
+          // clEnqueueNDRangeKernel(): CL_MEM_OBJECT_ALLOCATION_FAILURE
+          // The clEnqueueNDRangeKernel() is because of the lazy allocation
+          // The best way to workaround this problem is if we would be able to ask for available memory,
+          // The idea here is to try to evaluate available memory by allocating it till it errors
+
+          cl_mem *tmp_device = (cl_mem *) hccalloc (MAX_ALLOC_CHECKS_CNT, sizeof (cl_mem));
+
+          u64 c;
+
+          for (c = 0; c < MAX_ALLOC_CHECKS_CNT; c++)
+          {
+            if (((c + 1 + 1) * MAX_ALLOC_CHECKS_SIZE) >= device_param->device_global_mem) break;
+
+            cl_int CL_err;
+
+            OCL_PTR *ocl = backend_ctx->ocl;
+
+            tmp_device[c] = ocl->clCreateBuffer (context, CL_MEM_READ_WRITE, MAX_ALLOC_CHECKS_SIZE, NULL, &CL_err);
+
+            if (CL_err != CL_SUCCESS)
+            {
+              c--;
+
+              break;
+            }
+
+            // transfer only a few byte should be enough to force the runtime to actually allocate the memory
+
+            u8 tmp_host[8];
+
+            CL_err = ocl->clEnqueueReadBuffer  (command_queue, tmp_device[c], CL_TRUE, 0, sizeof (tmp_host), tmp_host, 0, NULL, NULL);
+
+            if (CL_err != CL_SUCCESS) break;
+
+            CL_err = ocl->clEnqueueWriteBuffer (command_queue, tmp_device[c], CL_TRUE, 0, sizeof (tmp_host), tmp_host, 0, NULL, NULL);
+
+            if (CL_err != CL_SUCCESS) break;
+
+            CL_err = ocl->clEnqueueReadBuffer  (command_queue, tmp_device[c], CL_TRUE, MAX_ALLOC_CHECKS_SIZE - sizeof (tmp_host), sizeof (tmp_host), tmp_host, 0, NULL, NULL);
+
+            if (CL_err != CL_SUCCESS) break;
+
+            CL_err = ocl->clEnqueueWriteBuffer (command_queue, tmp_device[c], CL_TRUE, MAX_ALLOC_CHECKS_SIZE - sizeof (tmp_host), sizeof (tmp_host), tmp_host, 0, NULL, NULL);
+
+            if (CL_err != CL_SUCCESS) break;
+          }
+
+          device_param->device_available_mem = c * MAX_ALLOC_CHECKS_SIZE;
+
+          // clean up
+
+          for (c = 0; c < MAX_ALLOC_CHECKS_CNT; c++)
+          {
+            if (((c + 1 + 1) * MAX_ALLOC_CHECKS_SIZE) >= device_param->device_global_mem) break;
+
+            if (tmp_device[c] != NULL)
+            {
+              CL_rc = hc_clReleaseMemObject (hashcat_ctx, tmp_device[c]);
+
+              if (CL_rc == -1) return -1;
+            }
+          }
+
+          hcfree (tmp_device);
+        }
+
+        hc_clReleaseCommandQueue (hashcat_ctx, command_queue);
+
+        hc_clReleaseContext (hashcat_ctx, context);
+      }
+    }
+  }
+
+  backend_ctx->opencl_devices_cnt     = opencl_devices_cnt;
+  backend_ctx->opencl_devices_active  = opencl_devices_active;
+
+  // all devices combined go into backend_* variables
+
+  backend_ctx->backend_devices_cnt    = cuda_devices_cnt    + opencl_devices_cnt;
+  backend_ctx->backend_devices_active = cuda_devices_active + opencl_devices_active;
+
+  // find duplicate devices (typically cuda and opencl!)
+  // using force here enables both devices, which is the worst possible outcome
+  // many users force by default, so this is not a good idea
+
+  //if (user_options->force == false)
+  //{
+  backend_ctx_find_alias_devices (hashcat_ctx);
+  //}
+
+  if (backend_ctx->backend_devices_active == 0)
+  {
+    event_log_error (hashcat_ctx, "No devices found/left.");
+
+    return -1;
+  }
+
+  // additional check to see if the user has chosen a device that is not within the range of available devices (i.e. larger than devices_cnt)
+
+  if (backend_ctx->backend_devices_filter != (u64) -1)
+  {
+    const u64 backend_devices_cnt_mask = ~(((u64) -1 >> backend_ctx->backend_devices_cnt) << backend_ctx->backend_devices_cnt);
+
+    if (backend_ctx->backend_devices_filter > backend_devices_cnt_mask)
+    {
+      event_log_error (hashcat_ctx, "An invalid device was specified using the --backend-devices parameter.");
+      event_log_error (hashcat_ctx, "The specified device was higher than the number of available devices (%u).", backend_ctx->backend_devices_cnt);
+
+      return -1;
+    }
+  }
+
+  backend_ctx->target_msec  = TARGET_MSEC_PROFILE[user_options->workload_profile - 1];
+
+  backend_ctx->need_adl     = need_adl;
+  backend_ctx->need_nvml    = need_nvml;
+  backend_ctx->need_nvapi   = need_nvapi;
+  backend_ctx->need_sysfs   = need_sysfs;
+
+  backend_ctx->comptime     = comptime;
+
+  return 0;
+}
+
+void backend_ctx_devices_destroy (hashcat_ctx_t *hashcat_ctx)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  if (backend_ctx->enabled == false) return;
+
+  for (u32 opencl_platforms_idx = 0; opencl_platforms_idx < backend_ctx->opencl_platforms_cnt; opencl_platforms_idx++)
+  {
+    hcfree (backend_ctx->opencl_platforms_devices[opencl_platforms_idx]);
+    hcfree (backend_ctx->opencl_platforms_name[opencl_platforms_idx]);
+    hcfree (backend_ctx->opencl_platforms_vendor[opencl_platforms_idx]);
+    hcfree (backend_ctx->opencl_platforms_version[opencl_platforms_idx]);
+  }
+
+  for (int backend_devices_idx = 0; backend_devices_idx < backend_ctx->backend_devices_cnt; backend_devices_idx++)
+  {
+    hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
+
+    if (device_param->skipped == true) continue;
+
+    hcfree (device_param->device_name);
+
+    if (device_param->is_opencl == true)
+    {
+      hcfree (device_param->opencl_driver_version);
+      hcfree (device_param->opencl_device_version);
+      hcfree (device_param->opencl_device_c_version);
+      hcfree (device_param->opencl_device_vendor);
+    }
+  }
+
+  backend_ctx->backend_devices_cnt    = 0;
+  backend_ctx->backend_devices_active = 0;
+  backend_ctx->cuda_devices_cnt       = 0;
+  backend_ctx->cuda_devices_active    = 0;
+  backend_ctx->opencl_devices_cnt     = 0;
+  backend_ctx->opencl_devices_active  = 0;
+
+  backend_ctx->need_adl    = false;
+  backend_ctx->need_nvml   = false;
+  backend_ctx->need_nvapi  = false;
+  backend_ctx->need_sysfs  = false;
+}
+
+void backend_ctx_devices_sync_tuning (hashcat_ctx_t *hashcat_ctx)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  if (backend_ctx->enabled == false) return;
+
+  for (int backend_devices_cnt_src = 0; backend_devices_cnt_src < backend_ctx->backend_devices_cnt; backend_devices_cnt_src++)
+  {
+    hc_device_param_t *device_param_src = &backend_ctx->devices_param[backend_devices_cnt_src];
+
+    if (device_param_src->skipped == true) continue;
+
+    if (device_param_src->skipped_warning == true) continue;
+
+    for (int backend_devices_cnt_dst = backend_devices_cnt_src + 1; backend_devices_cnt_dst < backend_ctx->backend_devices_cnt; backend_devices_cnt_dst++)
+    {
+      hc_device_param_t *device_param_dst = &backend_ctx->devices_param[backend_devices_cnt_dst];
+
+      if (device_param_dst->skipped == true) continue;
+
+      if (device_param_dst->skipped_warning == true) continue;
+
+      if (is_same_device_type (device_param_src, device_param_dst) == false) continue;
+
+      device_param_dst->kernel_accel   = device_param_src->kernel_accel;
+      device_param_dst->kernel_loops   = device_param_src->kernel_loops;
+      device_param_dst->kernel_threads = device_param_src->kernel_threads;
+
+      const u32 hardware_power = device_param_dst->device_processors * device_param_dst->kernel_threads;
+
+      device_param_dst->hardware_power = hardware_power;
+
+      const u32 kernel_power = device_param_dst->hardware_power * device_param_dst->kernel_accel;
+
+      device_param_dst->kernel_power = kernel_power;
+    }
+  }
+}
+
+void backend_ctx_devices_update_power (hashcat_ctx_t *hashcat_ctx)
+{
+  backend_ctx_t        *backend_ctx         = hashcat_ctx->backend_ctx;
+  status_ctx_t         *status_ctx          = hashcat_ctx->status_ctx;
+  user_options_extra_t *user_options_extra  = hashcat_ctx->user_options_extra;
+  user_options_t       *user_options        = hashcat_ctx->user_options;
+
+  if (backend_ctx->enabled == false) return;
+
+  u32 kernel_power_all = 0;
+
+  for (int backend_devices_idx = 0; backend_devices_idx < backend_ctx->backend_devices_cnt; backend_devices_idx++)
+  {
+    hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
+
+    if (device_param->skipped == true) continue;
+
+    if (device_param->skipped_warning == true) continue;
+
+    kernel_power_all += device_param->kernel_power;
+  }
+
+  backend_ctx->kernel_power_all = kernel_power_all;
+
+  /*
+   * Inform user about possible slow speeds
+   */
+
+  if ((user_options_extra->wordlist_mode == WL_MODE_FILE) || (user_options_extra->wordlist_mode == WL_MODE_MASK))
+  {
+    if (status_ctx->words_base < kernel_power_all)
+    {
+      if (user_options->quiet == false)
+      {
+        event_log_advice (hashcat_ctx, "The wordlist or mask that you are using is too small.");
+        event_log_advice (hashcat_ctx, "This means that hashcat cannot use the full parallel power of your device(s).");
+        event_log_advice (hashcat_ctx, "Unless you supply more work, your cracking speed will drop.");
+        event_log_advice (hashcat_ctx, "For tips on supplying more work, see: https://hashcat.net/faq/morework");
+        event_log_advice (hashcat_ctx, NULL);
+      }
+    }
+  }
+}
+
+void backend_ctx_devices_kernel_loops (hashcat_ctx_t *hashcat_ctx)
+{
+  combinator_ctx_t     *combinator_ctx      = hashcat_ctx->combinator_ctx;
+  hashconfig_t         *hashconfig          = hashcat_ctx->hashconfig;
+  hashes_t             *hashes              = hashcat_ctx->hashes;
+  mask_ctx_t           *mask_ctx            = hashcat_ctx->mask_ctx;
+  backend_ctx_t        *backend_ctx         = hashcat_ctx->backend_ctx;
+  straight_ctx_t       *straight_ctx        = hashcat_ctx->straight_ctx;
+  user_options_t       *user_options        = hashcat_ctx->user_options;
+  user_options_extra_t *user_options_extra  = hashcat_ctx->user_options_extra;
+
+  if (backend_ctx->enabled == false) return;
+
+  for (int backend_devices_idx = 0; backend_devices_idx < backend_ctx->backend_devices_cnt; backend_devices_idx++)
+  {
+    hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
+
+    if (device_param->skipped == true) continue;
+
+    if (device_param->skipped_warning == true) continue;
+
+    device_param->kernel_loops_min = device_param->kernel_loops_min_sav;
+    device_param->kernel_loops_max = device_param->kernel_loops_max_sav;
+
+    if (device_param->kernel_loops_min < device_param->kernel_loops_max)
+    {
+      u32 innerloop_cnt = 0;
+
+      if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
+      {
+        if (user_options->slow_candidates == true)
+        {
+          innerloop_cnt = 1;
+        }
+        else
+        {
+          if      (user_options_extra->attack_kern == ATTACK_KERN_STRAIGHT)  innerloop_cnt = MIN (KERNEL_RULES, (u32) straight_ctx->kernel_rules_cnt);
+          else if (user_options_extra->attack_kern == ATTACK_KERN_COMBI)     innerloop_cnt = MIN (KERNEL_COMBS, (u32) combinator_ctx->combs_cnt);
+          else if (user_options_extra->attack_kern == ATTACK_KERN_BF)        innerloop_cnt = MIN (KERNEL_BFS,   (u32) mask_ctx->bfs_cnt);
+        }
+      }
+      else
+      {
+        innerloop_cnt = hashes->salts_buf[0].salt_iter;
+      }
+
+      if ((innerloop_cnt >= device_param->kernel_loops_min) &&
+          (innerloop_cnt <= device_param->kernel_loops_max))
+      {
+        device_param->kernel_loops_max = innerloop_cnt;
+      }
+    }
+  }
+}
+
+static int get_cuda_kernel_wgs (hashcat_ctx_t *hashcat_ctx, CUfunction function, u32 *result)
+{
+  int max_threads_per_block;
+
+  const int rc_cuFuncGetAttribute = hc_cuFuncGetAttribute (hashcat_ctx, &max_threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function);
+
+  if (rc_cuFuncGetAttribute == -1) return -1;
+
+  *result = (u32) max_threads_per_block;
+
+  return 0;
+}
+
+static int get_cuda_kernel_local_mem_size (hashcat_ctx_t *hashcat_ctx, CUfunction function, u64 *result)
+{
+  int shared_size_bytes;
+
+  const int rc_cuFuncGetAttribute = hc_cuFuncGetAttribute (hashcat_ctx, &shared_size_bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, function);
+
+  if (rc_cuFuncGetAttribute == -1) return -1;
+
+  *result = (u64) shared_size_bytes;
+
+  return 0;
+}
+
+static int get_opencl_kernel_wgs (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, cl_kernel kernel, u32 *result)
+{
+  int CL_rc;
+
+  size_t work_group_size;
+
+  CL_rc = hc_clGetKernelWorkGroupInfo (hashcat_ctx, kernel, device_param->opencl_device, CL_KERNEL_WORK_GROUP_SIZE, sizeof (work_group_size), &work_group_size, NULL);
+
+  if (CL_rc == -1) return -1;
+
+  u32 kernel_threads = (u32) work_group_size;
+
+  size_t compile_work_group_size[3];
+
+  CL_rc = hc_clGetKernelWorkGroupInfo (hashcat_ctx, kernel, device_param->opencl_device, CL_KERNEL_COMPILE_WORK_GROUP_SIZE, sizeof (compile_work_group_size), &compile_work_group_size, NULL);
+
+  if (CL_rc == -1) return -1;
+
+  const size_t cwgs_total = compile_work_group_size[0] * compile_work_group_size[1] * compile_work_group_size[2];
+
+  if (cwgs_total > 0)
+  {
+    kernel_threads = MIN (kernel_threads, (u32) cwgs_total);
+  }
+
+  *result = kernel_threads;
+
+  return 0;
+}
+
+static int get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, cl_kernel kernel, u32 *result)
+{
+  int CL_rc;
+
+  size_t preferred_work_group_size_multiple;
+
+  CL_rc = hc_clGetKernelWorkGroupInfo (hashcat_ctx, kernel, device_param->opencl_device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof (preferred_work_group_size_multiple), &preferred_work_group_size_multiple, NULL);
+
+  if (CL_rc == -1) return -1;
+
+  *result = (u32) preferred_work_group_size_multiple;
+
+  return 0;
+}
+
+static int get_opencl_kernel_local_mem_size (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, cl_kernel kernel, u64 *result)
+{
+  int CL_rc;
+
+  cl_ulong local_mem_size;
+
+  CL_rc = hc_clGetKernelWorkGroupInfo (hashcat_ctx, kernel, device_param->opencl_device, CL_KERNEL_LOCAL_MEM_SIZE, sizeof (local_mem_size), &local_mem_size, NULL);
+
+  if (CL_rc == -1) return -1;
+
+  *result = local_mem_size;
+
+  return 0;
+}
+
+static u32 get_kernel_threads (const hc_device_param_t *device_param)
+{
+  // a module can force a fixed value
+
+  u32 kernel_threads_min = device_param->kernel_threads_min;
+  u32 kernel_threads_max = device_param->kernel_threads_max;
+
+  // for CPU we just do 1 ...
+
+  if (device_param->opencl_device_type & CL_DEVICE_TYPE_CPU)
+  {
+    if ((1 >= kernel_threads_min) && (1 <= kernel_threads_max))
+    {
+      kernel_threads_min = 1;
+      kernel_threads_max = 1;
+    }
+  }
+
+  // this is an upper limit, a good start, since our strategy is to reduce thread counts only
+
+  const u32 device_maxworkgroup_size = (u32) device_param->device_maxworkgroup_size;
+
+  if (device_maxworkgroup_size < kernel_threads_max)
+  {
+    kernel_threads_max = device_maxworkgroup_size;
+  }
+
+  u32 kernel_threads = kernel_threads_max;
+
+  // complicated kernel tend to confuse OpenCL runtime suggestions for maximum thread size
+  // let's workaround that by sticking to their device specific preferred thread size
+  // this section was replaced by autotune
+
+  /*
+  if (hashconfig->opts_type & OPTS_TYPE_PREFERED_THREAD)
+  {
+    if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
+    {
+      if (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL)
+      {
+        if (device_param->kernel_preferred_wgs_multiple1)
+        {
+          const u32 kernel_preferred_wgs_multiple1 = device_param->kernel_preferred_wgs_multiple1;
+
+          if ((kernel_preferred_wgs_multiple1 >= kernel_threads_min) && (kernel_preferred_wgs_multiple1 <= kernel_threads_max))
+          {
+            kernel_threads = kernel_preferred_wgs_multiple1;
+          }
+        }
+      }
+      else
+      {
+        if (device_param->kernel_preferred_wgs_multiple4)
+        {
+          const u32 kernel_preferred_wgs_multiple4 = device_param->kernel_preferred_wgs_multiple4;
+
+          if ((kernel_preferred_wgs_multiple4 >= kernel_threads_min) && (kernel_preferred_wgs_multiple4 <= kernel_threads_max))
+          {
+            kernel_threads = kernel_preferred_wgs_multiple4;
+          }
+        }
+      }
+    }
+    else
+    {
+      if (device_param->kernel_preferred_wgs_multiple2)
+      {
+        const u32 kernel_preferred_wgs_multiple2 = device_param->kernel_preferred_wgs_multiple2;
+
+        if ((kernel_preferred_wgs_multiple2 >= kernel_threads_min) && (kernel_preferred_wgs_multiple2 <= kernel_threads_max))
+        {
+          kernel_threads = kernel_preferred_wgs_multiple2;
+        }
+      }
+    }
+  }
+  else
+  {
+    if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
+    {
+      if (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL)
+      {
+        if (device_param->kernel_preferred_wgs_multiple1)
+        {
+          const u32 kernel_preferred_wgs_multiple1 = device_param->kernel_preferred_wgs_multiple1;
+
+          if ((kernel_preferred_wgs_multiple1 >= kernel_threads_min) && (kernel_preferred_wgs_multiple1 <= kernel_threads_max))
+          {
+            kernel_threads = kernel_preferred_wgs_multiple1;
+          }
+        }
+      }
+      else
+      {
+        if (device_param->kernel_preferred_wgs_multiple4)
+        {
+          const u32 kernel_preferred_wgs_multiple4 = device_param->kernel_preferred_wgs_multiple4;
+
+          if ((kernel_preferred_wgs_multiple4 >= kernel_threads_min) && (kernel_preferred_wgs_multiple4 <= kernel_threads_max))
+          {
+            kernel_threads = kernel_preferred_wgs_multiple4;
+          }
+        }
+      }
+    }
+    else
+    {
+      if (device_param->kernel_preferred_wgs_multiple2)
+      {
+        const u32 kernel_preferred_wgs_multiple2 = device_param->kernel_preferred_wgs_multiple2;
+
+        if ((kernel_preferred_wgs_multiple2 >= kernel_threads_min) && (kernel_preferred_wgs_multiple2 <= kernel_threads_max))
+        {
+          kernel_threads = kernel_preferred_wgs_multiple2;
+        }
+      }
+    }
+  }
+  */
+
+  return kernel_threads;
+}
+
+int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
+{
+  const bitmap_ctx_t         *bitmap_ctx          = hashcat_ctx->bitmap_ctx;
+  const folder_config_t      *folder_config       = hashcat_ctx->folder_config;
+  const hashconfig_t         *hashconfig          = hashcat_ctx->hashconfig;
+  const hashes_t             *hashes              = hashcat_ctx->hashes;
+  const module_ctx_t         *module_ctx          = hashcat_ctx->module_ctx;
+        backend_ctx_t        *backend_ctx         = hashcat_ctx->backend_ctx;
+  const straight_ctx_t       *straight_ctx        = hashcat_ctx->straight_ctx;
+  const user_options_extra_t *user_options_extra  = hashcat_ctx->user_options_extra;
+  const user_options_t       *user_options        = hashcat_ctx->user_options;
+
+  if (backend_ctx->enabled == false) return 0;
+
+  u64 size_total_host_all = 0;
+
+  u32 hardware_power_all = 0;
+
+  int CU_rc;
+  int CL_rc;
+
+  for (int backend_devices_idx = 0; backend_devices_idx < backend_ctx->backend_devices_cnt; backend_devices_idx++)
+  {
+    /**
+     * host buffer
+     */
+
+    hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
+
+    if (device_param->skipped == true) continue;
+
+    EVENT_DATA (EVENT_BACKEND_DEVICE_INIT_PRE, &backend_devices_idx, sizeof (int));
+
+    const int device_id = device_param->device_id;
+
+    /**
+     * module depending checks
+     */
+
+    device_param->skipped_warning = false;
+
+    if (module_ctx->module_unstable_warning != MODULE_DEFAULT)
+    {
+      const bool unstable_warning = module_ctx->module_unstable_warning (hashconfig, user_options, user_options_extra, device_param);
+
+      if ((unstable_warning == true) && (user_options->force == false))
+      {
+        event_log_warning (hashcat_ctx, "* Device #%u: Skipping hash-mode %u - known CUDA/OpenCL Runtime/Driver issue (not a hashcat issue)", device_id + 1, hashconfig->hash_mode);
+        event_log_warning (hashcat_ctx, "             You can use --force to override, but do not report related errors.");
+
+        device_param->skipped_warning = true;
+
+        continue;
+      }
+    }
+
+    // vector_width
+
+    int vector_width;
+
+    if (user_options->backend_vector_width_chgd == false)
+    {
+      // tuning db
+
+      tuning_db_entry_t *tuningdb_entry;
+
+      if (user_options->slow_candidates == true)
+      {
+        tuningdb_entry = tuning_db_search (hashcat_ctx, device_param->device_name, device_param->opencl_device_type, 0, hashconfig->hash_mode);
+      }
+      else
+      {
+        tuningdb_entry = tuning_db_search (hashcat_ctx, device_param->device_name, device_param->opencl_device_type, user_options->attack_mode, hashconfig->hash_mode);
+      }
+
+      if (tuningdb_entry == NULL || tuningdb_entry->vector_width == -1)
+      {
+        if (hashconfig->opti_type & OPTI_TYPE_USES_BITS_64)
+        {
+          if (device_param->is_cuda == true)
+          {
+            // cuda does not support this query
+
+            vector_width = 1;
+          }
+
+          if (device_param->is_opencl == true)
+          {
+            CL_rc = hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG, sizeof (vector_width), &vector_width, NULL);
+
+            if (CL_rc == -1) return -1;
+          }
+        }
+        else
+        {
+          if (device_param->is_cuda == true)
+          {
+            // cuda does not support this query
+
+            vector_width = 1;
+          }
+
+          if (device_param->is_opencl == true)
+          {
+            CL_rc = hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_NATIVE_VECTOR_WIDTH_INT,  sizeof (vector_width), &vector_width, NULL);
+
+            if (CL_rc == -1) return -1;
+          }
+        }
+      }
+      else
+      {
+        vector_width = (cl_uint) tuningdb_entry->vector_width;
+      }
+    }
+    else
+    {
+      vector_width = user_options->backend_vector_width;
+    }
+
+    // We can't have SIMD in kernels where we have an unknown final password length
+    // It also turns out that pure kernels (that have a higher register pressure)
+    // actually run faster on scalar GPU (like 1080) without SIMD
+
+    if ((hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL) == 0)
+    {
+      if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU)
+      {
+        vector_width = 1;
+      }
+    }
+
+    if (vector_width > 16) vector_width = 16;
+
+    device_param->vector_width = vector_width;
+
+    /**
+     * kernel accel and loops tuning db adjustment
+     */
+
+    device_param->kernel_accel_min   = hashconfig->kernel_accel_min;
+    device_param->kernel_accel_max   = hashconfig->kernel_accel_max;
+    device_param->kernel_loops_min   = hashconfig->kernel_loops_min;
+    device_param->kernel_loops_max   = hashconfig->kernel_loops_max;
+    device_param->kernel_threads_min = hashconfig->kernel_threads_min;
+    device_param->kernel_threads_max = hashconfig->kernel_threads_max;
+
+    tuning_db_entry_t *tuningdb_entry = NULL;
+
+    if (user_options->slow_candidates == true)
+    {
+      tuningdb_entry = tuning_db_search (hashcat_ctx, device_param->device_name, device_param->opencl_device_type, 0, hashconfig->hash_mode);
+    }
+    else
+    {
+      tuningdb_entry = tuning_db_search (hashcat_ctx, device_param->device_name, device_param->opencl_device_type, user_options->attack_mode, hashconfig->hash_mode);
+    }
+
+    // user commandline option override tuning db
+    // but both have to stay inside the boundaries of the module
+
+    if (user_options->kernel_accel_chgd == true)
+    {
+      const u32 _kernel_accel = user_options->kernel_accel;
+
+      if ((_kernel_accel >= device_param->kernel_accel_min) && (_kernel_accel <= device_param->kernel_accel_max))
+      {
+        device_param->kernel_accel_min = _kernel_accel;
+        device_param->kernel_accel_max = _kernel_accel;
+      }
+    }
+    else
+    {
+      if (tuningdb_entry != NULL)
+      {
+        const u32 _kernel_accel = tuningdb_entry->kernel_accel;
+
+        if (_kernel_accel)
+        {
+          if ((_kernel_accel >= device_param->kernel_accel_min) && (_kernel_accel <= device_param->kernel_accel_max))
+          {
+            device_param->kernel_accel_min = _kernel_accel;
+            device_param->kernel_accel_max = _kernel_accel;
+          }
+        }
+      }
+    }
+
+    if (user_options->kernel_loops_chgd == true)
+    {
+      const u32 _kernel_loops = user_options->kernel_loops;
+
+      if ((_kernel_loops >= device_param->kernel_loops_min) && (_kernel_loops <= device_param->kernel_loops_max))
+      {
+        device_param->kernel_loops_min = _kernel_loops;
+        device_param->kernel_loops_max = _kernel_loops;
+      }
+    }
+    else
+    {
+      if (tuningdb_entry != NULL)
+      {
+        u32 _kernel_loops = tuningdb_entry->kernel_loops;
+
+        if (_kernel_loops)
+        {
+          if (user_options->workload_profile == 1)
+          {
+            _kernel_loops = (_kernel_loops > 8) ? _kernel_loops / 8 : 1;
+          }
+          else if (user_options->workload_profile == 2)
+          {
+            _kernel_loops = (_kernel_loops > 4) ? _kernel_loops / 4 : 1;
+          }
+
+          if ((_kernel_loops >= device_param->kernel_loops_min) && (_kernel_loops <= device_param->kernel_loops_max))
+          {
+            device_param->kernel_loops_min = _kernel_loops;
+            device_param->kernel_loops_max = _kernel_loops;
+          }
+        }
+      }
+    }
+
+    // there's no thread column in tuning db, stick to commandline if defined
+
+    if (user_options->kernel_threads_chgd == true)
+    {
+      const u32 _kernel_threads = user_options->kernel_threads;
+
+      if ((_kernel_threads >= device_param->kernel_threads_min) && (_kernel_threads <= device_param->kernel_threads_max))
+      {
+        device_param->kernel_threads_min = _kernel_threads;
+        device_param->kernel_threads_max = _kernel_threads;
+      }
+    }
+
+    if (user_options->slow_candidates == true)
+    {
+    }
+    else
+    {
+      // we have some absolute limits for fast hashes (because of limit constant memory), make sure not to overstep
+
+      if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
+      {
+        if (user_options_extra->attack_kern == ATTACK_KERN_STRAIGHT)
+        {
+          device_param->kernel_loops_min = MIN (device_param->kernel_loops_min, KERNEL_RULES);
+          device_param->kernel_loops_max = MIN (device_param->kernel_loops_max, KERNEL_RULES);
+        }
+        else if (user_options_extra->attack_kern == ATTACK_KERN_COMBI)
+        {
+          device_param->kernel_loops_min = MIN (device_param->kernel_loops_min, KERNEL_COMBS);
+          device_param->kernel_loops_max = MIN (device_param->kernel_loops_max, KERNEL_COMBS);
+        }
+        else if (user_options_extra->attack_kern == ATTACK_KERN_BF)
+        {
+          device_param->kernel_loops_min = MIN (device_param->kernel_loops_min, KERNEL_BFS);
+          device_param->kernel_loops_max = MIN (device_param->kernel_loops_max, KERNEL_BFS);
+        }
+      }
+    }
+
+    device_param->kernel_loops_min_sav = device_param->kernel_loops_min;
+    device_param->kernel_loops_max_sav = device_param->kernel_loops_max;
+
+    /**
+     * device properties
+     */
+
+    const u32 device_processors = device_param->device_processors;
+
+    /**
+     * create context for each device
+     */
+
+    if (device_param->is_cuda == true)
+    {
+      CU_rc = hc_cuCtxCreate (hashcat_ctx, &device_param->cuda_context, CU_CTX_SCHED_BLOCKING_SYNC, device_param->cuda_device);
+
+      if (CU_rc == -1) return -1;
+    }
+
+    if (device_param->is_opencl == true)
+    {
+      /*
+      cl_context_properties properties[3];
+
+      properties[0] = CL_CONTEXT_PLATFORM;
+      properties[1] = (cl_context_properties) device_param->opencl_platform;
+      properties[2] = 0;
+
+      CL_rc = hc_clCreateContext (hashcat_ctx, properties, 1, &device_param->opencl_device, NULL, NULL, &device_param->opencl_context);
+      */
+
+      CL_rc = hc_clCreateContext (hashcat_ctx, NULL, 1, &device_param->opencl_device, NULL, NULL, &device_param->opencl_context);
+
+      if (CL_rc == -1) return -1;
+
+      /**
+       * create command-queue
+       */
+
+      // not supported with NV
+      // device_param->opencl_command_queue = hc_clCreateCommandQueueWithProperties (hashcat_ctx, device_param->opencl_device, NULL);
+
+      CL_rc = hc_clCreateCommandQueue (hashcat_ctx, device_param->opencl_context, device_param->opencl_device, CL_QUEUE_PROFILING_ENABLE, &device_param->opencl_command_queue);
+
+      if (CL_rc == -1) return -1;
+    }
+
+    /**
+     * create stream for CUDA devices
+     */
+
+    if (device_param->is_cuda == true)
+    {
+      const int rc_cuStreamCreate = hc_cuStreamCreate (hashcat_ctx, &device_param->cuda_stream, CU_STREAM_DEFAULT);
+
+      if (rc_cuStreamCreate == -1) return -1;
+    }
+
+    /**
+     * create events for CUDA devices
+     */
+
+    if (device_param->is_cuda == true)
+    {
+      const int rc_cuEventCreate1 = hc_cuEventCreate (hashcat_ctx, &device_param->cuda_event1, CU_EVENT_DEFAULT);
+
+      if (rc_cuEventCreate1 == -1) return -1;
+
+      const int rc_cuEventCreate2 = hc_cuEventCreate (hashcat_ctx, &device_param->cuda_event2, CU_EVENT_DEFAULT);
+
+      if (rc_cuEventCreate2 == -1) return -1;
+    }
+
+    /**
+     * create input buffers on device : calculate size of fixed memory buffers
+     */
+
+    u64 size_root_css   = SP_PW_MAX *           sizeof (cs_t);
+    u64 size_markov_css = SP_PW_MAX * CHARSIZ * sizeof (cs_t);
+
+    device_param->size_root_css   = size_root_css;
+    device_param->size_markov_css = size_markov_css;
+
+    u64 size_results = sizeof (u32);
+
+    device_param->size_results = size_results;
+
+    u64 size_rules   = (u64) straight_ctx->kernel_rules_cnt * sizeof (kernel_rule_t);
+    u64 size_rules_c = (u64) KERNEL_RULES                   * sizeof (kernel_rule_t);
+
+    device_param->size_rules    = size_rules;
+    device_param->size_rules_c  = size_rules_c;
+
+    u64 size_plains  = (u64) hashes->digests_cnt * sizeof (plain_t);
+    u64 size_salts   = (u64) hashes->salts_cnt   * sizeof (salt_t);
+    u64 size_esalts  = (u64) hashes->digests_cnt * (u64) hashconfig->esalt_size;
+    u64 size_shown   = (u64) hashes->digests_cnt * sizeof (u32);
+    u64 size_digests = (u64) hashes->digests_cnt * (u64) hashconfig->dgst_size;
+
+    device_param->size_plains   = size_plains;
+    device_param->size_digests  = size_digests;
+    device_param->size_shown    = size_shown;
+    device_param->size_salts    = size_salts;
+    device_param->size_esalts   = size_esalts;
+
+    u64 size_combs = KERNEL_COMBS * sizeof (pw_t);
+    u64 size_bfs   = KERNEL_BFS   * sizeof (bf_t);
+    u64 size_tm    = 32           * sizeof (bs_word_t);
+
+    device_param->size_bfs      = size_bfs;
+    device_param->size_combs    = size_combs;
+    device_param->size_tm       = size_tm;
+
+    u64 size_st_digests = 1 * hashconfig->dgst_size;
+    u64 size_st_salts   = 1 * sizeof (salt_t);
+    u64 size_st_esalts  = 1 * hashconfig->esalt_size;
+
+    device_param->size_st_digests = size_st_digests;
+    device_param->size_st_salts   = size_st_salts;
+    device_param->size_st_esalts  = size_st_esalts;
+
+    u64 size_extra_buffer = 4;
+
+    if (module_ctx->module_extra_buffer_size != MODULE_DEFAULT)
+    {
+      const u64 extra_buffer_size = module_ctx->module_extra_buffer_size (hashconfig, user_options, user_options_extra, hashes, device_param);
+
+      if (extra_buffer_size == (u64) -1)
+      {
+        event_log_error (hashcat_ctx, "Invalid extra buffer size.");
+
+        return -1;
+      }
+
+      device_param->extra_buffer_size = extra_buffer_size;
+
+      size_extra_buffer = extra_buffer_size;
+    }
+
+    // kern type
+
+    u32 kern_type = (u32) hashconfig->kern_type;
+
+    if (module_ctx->module_kern_type_dynamic != MODULE_DEFAULT)
+    {
+      if (user_options->benchmark == true)
+      {
+      }
+      else
+      {
+        void        *digests_buf    = hashes->digests_buf;
+        salt_t      *salts_buf      = hashes->salts_buf;
+        void        *esalts_buf     = hashes->esalts_buf;
+        void        *hook_salts_buf = hashes->hook_salts_buf;
+        hashinfo_t **hash_info      = hashes->hash_info;
+
+        hashinfo_t *hash_info_ptr = NULL;
+
+        if (hash_info) hash_info_ptr = hash_info[0];
+
+        kern_type = (u32) module_ctx->module_kern_type_dynamic (hashconfig, digests_buf, salts_buf, esalts_buf, hook_salts_buf, hash_info_ptr);
+      }
+    }
+
+    // built options
+
+    const size_t build_options_sz = 4096;
+
+    char *build_options_buf = (char *) hcmalloc (build_options_sz);
+
+    int build_options_len = 0;
+
+    #if defined (_WIN)
+    build_options_len += snprintf (build_options_buf + build_options_len, build_options_sz - build_options_len, "-D KERNEL_STATIC -I OpenCL -I \"%s\" ", folder_config->cpath_real);
+    #else
+    build_options_len += snprintf (build_options_buf + build_options_len, build_options_sz - build_options_len, "-D KERNEL_STATIC -I OpenCL -I %s ", folder_config->cpath_real);
+    #endif
+
+    // we don't have sm_* on vendors not NV but it doesn't matter
+
+    #if defined (DEBUG)
+    build_options_len += snprintf (build_options_buf + build_options_len, build_options_sz - build_options_len, "-D LOCAL_MEM_TYPE=%u -D VENDOR_ID=%u -D CUDA_ARCH=%u -D HAS_VPERM=%u -D HAS_VADD3=%u -D HAS_VBFE=%u -D HAS_BFE=%u -D HAS_LOP3=%u -D HAS_MOV64=%u -D HAS_PRMT=%u -D VECT_SIZE=%u -D DEVICE_TYPE=%u -D DGST_R0=%u -D DGST_R1=%u -D DGST_R2=%u -D DGST_R3=%u -D DGST_ELEM=%u -D KERN_TYPE=%u -D ATTACK_EXEC=%u -D ATTACK_KERN=%u -D _unroll ", device_param->device_local_mem_type, device_param->opencl_platform_vendor_id, (device_param->sm_major * 100) + (device_param->sm_minor * 10), device_param->has_vperm, device_param->has_vadd3, device_param->has_vbfe, device_param->has_bfe, device_param->has_lop3, device_param->has_mov64, device_param->has_prmt, device_param->vector_width, (u32) device_param->opencl_device_type, hashconfig->dgst_pos0, hashconfig->dgst_pos1, hashconfig->dgst_pos2, hashconfig->dgst_pos3, hashconfig->dgst_size / 4, kern_type, hashconfig->attack_exec, user_options_extra->attack_kern);
+    #else
+    build_options_len += snprintf (build_options_buf + build_options_len, build_options_sz - build_options_len, "-D LOCAL_MEM_TYPE=%u -D VENDOR_ID=%u -D CUDA_ARCH=%u -D HAS_VPERM=%u -D HAS_VADD3=%u -D HAS_VBFE=%u -D HAS_BFE=%u -D HAS_LOP3=%u -D HAS_MOV64=%u -D HAS_PRMT=%u -D VECT_SIZE=%u -D DEVICE_TYPE=%u -D DGST_R0=%u -D DGST_R1=%u -D DGST_R2=%u -D DGST_R3=%u -D DGST_ELEM=%u -D KERN_TYPE=%u -D ATTACK_EXEC=%u -D ATTACK_KERN=%u -D _unroll -w ", device_param->device_local_mem_type, device_param->opencl_platform_vendor_id, (device_param->sm_major * 100) + (device_param->sm_minor * 10), device_param->has_vperm, device_param->has_vadd3, device_param->has_vbfe, device_param->has_bfe, device_param->has_lop3, device_param->has_mov64, device_param->has_prmt, device_param->vector_width, (u32) device_param->opencl_device_type, hashconfig->dgst_pos0, hashconfig->dgst_pos1, hashconfig->dgst_pos2, hashconfig->dgst_pos3, hashconfig->dgst_size / 4, kern_type, hashconfig->attack_exec, user_options_extra->attack_kern);
+    #endif
+
+    build_options_buf[build_options_len] = 0;
+
+    /*
+    if (device_param->opencl_device_type & CL_DEVICE_TYPE_CPU)
+    {
+      if (device_param->opencl_platform_vendor_id == VENDOR_ID_INTEL_SDK)
+      {
+        strncat (build_options_buf, " -cl-opt-disable", 16);
+      }
+    }
+    */
+
+    char *build_options_module_buf = (char *) hcmalloc (build_options_sz);
+
+    int build_options_module_len = 0;
+
+    build_options_module_len += snprintf (build_options_module_buf + build_options_module_len, build_options_sz - build_options_module_len, "%s ", build_options_buf);
+
+    if (module_ctx->module_jit_build_options != MODULE_DEFAULT)
+    {
+      char *jit_build_options = module_ctx->module_jit_build_options (hashconfig, user_options, user_options_extra, hashes, device_param);
+
+      if (jit_build_options != NULL)
+      {
+        build_options_module_len += snprintf (build_options_module_buf + build_options_module_len, build_options_sz - build_options_module_len, "%s", jit_build_options);
+
+        // this is a bit ugly
+        // would be better to have the module return the value as value
+
+        u32 fixed_local_size = 0;
+
+        if (sscanf (jit_build_options, "-D FIXED_LOCAL_SIZE=%u", &fixed_local_size) == 1)
+        {
+          device_param->kernel_threads_min = fixed_local_size;
+          device_param->kernel_threads_max = fixed_local_size;
+        }
+      }
+    }
+
+    build_options_module_buf[build_options_module_len] = 0;
+
+    #if defined (DEBUG)
+    if (user_options->quiet == false) event_log_warning (hashcat_ctx, "* Device #%u: build_options '%s'", device_id + 1, build_options_buf);
+    if (user_options->quiet == false) event_log_warning (hashcat_ctx, "* Device #%u: build_options_module '%s'", device_id + 1, build_options_module_buf);
+    #endif
+
+    /**
+     * device_name_chksum
+     */
+
+    char *device_name_chksum        = (char *) hcmalloc (HCBUFSIZ_TINY);
+    char *device_name_chksum_amp_mp = (char *) hcmalloc (HCBUFSIZ_TINY);
+
+    const size_t dnclen = snprintf (device_name_chksum, HCBUFSIZ_TINY, "%d-%d-%d-%u-%s-%s-%s-%d-%u",
+      backend_ctx->comptime,
+      backend_ctx->cuda_driver_version,
+      device_param->is_opencl,
+      device_param->opencl_platform_vendor_id,
+      device_param->device_name,
+      device_param->opencl_device_version,
+      device_param->opencl_driver_version,
+      device_param->vector_width,
+      hashconfig->kern_type);
+
+    const size_t dnclen_amp_mp = snprintf (device_name_chksum_amp_mp, HCBUFSIZ_TINY, "%d-%d-%d-%u-%s-%s-%s",
+      backend_ctx->comptime,
+      backend_ctx->cuda_driver_version,
+      device_param->is_opencl,
+      device_param->opencl_platform_vendor_id,
+      device_param->device_name,
+      device_param->opencl_device_version,
+      device_param->opencl_driver_version);
+
+    md5_ctx_t md5_ctx;
+
+    md5_init (&md5_ctx);
+    md5_update (&md5_ctx, (u32 *) device_name_chksum, dnclen);
+    md5_final (&md5_ctx);
+
+    snprintf (device_name_chksum, HCBUFSIZ_TINY, "%08x", md5_ctx.h[0]);
+
+    md5_init (&md5_ctx);
+    md5_update (&md5_ctx, (u32 *) device_name_chksum_amp_mp, dnclen_amp_mp);
+    md5_final (&md5_ctx);
+
+    snprintf (device_name_chksum_amp_mp, HCBUFSIZ_TINY, "%08x", md5_ctx.h[0]);
+
+    /**
+     * kernel cache
+     */
+
+    bool cache_disable = false;
+
+    // Seems to be completely broken on Apple + (Intel?) CPU
+    // To reproduce set cache_disable to false and run benchmark -b
+
+    if (device_param->opencl_platform_vendor_id == VENDOR_ID_APPLE)
+    {
+      if (device_param->opencl_device_type & CL_DEVICE_TYPE_CPU)
+      {
+        cache_disable = true;
+      }
+    }
+
+    if (module_ctx->module_jit_cache_disable != MODULE_DEFAULT)
+    {
+      cache_disable = module_ctx->module_jit_cache_disable (hashconfig, user_options, user_options_extra, hashes, device_param);
+    }
+
+    /**
+     * main kernel
+     */
+
+    {
+      /**
+       * kernel source filename
+       */
+
+      char source_file[256] = { 0 };
+
+      generate_source_kernel_filename (user_options->slow_candidates, hashconfig->attack_exec, user_options_extra->attack_kern, kern_type, hashconfig->opti_type, folder_config->shared_dir, source_file);
+
+      if (hc_path_read (source_file) == false)
+      {
+        event_log_error (hashcat_ctx, "%s: %s", source_file, strerror (errno));
+
+        return -1;
+      }
+
+      /**
+       * kernel cached filename
+       */
+
+      char cached_file[256] = { 0 };
+
+      generate_cached_kernel_filename (user_options->slow_candidates, hashconfig->attack_exec, user_options_extra->attack_kern, kern_type, hashconfig->opti_type, folder_config->profile_dir, device_name_chksum, cached_file);
+
+      bool cached = true;
+
+      if (cache_disable == true)
+      {
+        cached = false;
+      }
+
+      if (hc_path_read (cached_file) == false)
+      {
+        cached = false;
+      }
+
+      if (hc_path_is_empty (cached_file) == true)
+      {
+        cached = false;
+      }
+
+      /**
+       * kernel compile or load
+       */
+
+      size_t kernel_lengths_buf = 0;
+
+      size_t *kernel_lengths = &kernel_lengths_buf;
+
+      char *kernel_sources_buf = NULL;
+
+      char **kernel_sources = &kernel_sources_buf;
+
+      if (cached == false)
+      {
+        #if defined (DEBUG)
+        if (user_options->quiet == false) event_log_warning (hashcat_ctx, "* Device #%u: Kernel %s not found in cache! Building may take a while...", device_id + 1, filename_from_filepath (cached_file));
+        #endif
+
+        const bool rc_read_kernel = read_kernel_binary (hashcat_ctx, source_file, kernel_lengths, kernel_sources, true);
+
+        if (rc_read_kernel == false) return -1;
+
+        if (device_param->is_cuda == true)
+        {
+          nvrtcProgram program;
+
+          const int rc_nvrtcCreateProgram = hc_nvrtcCreateProgram (hashcat_ctx, &program, kernel_sources[0], "main_kernel", 0, NULL, NULL);
+
+          if (rc_nvrtcCreateProgram == -1) return -1;
+
+          char **nvrtc_options = (char **) hccalloc (3 + strlen (build_options_module_buf) + 1, sizeof (char *)); // ...
+
+          nvrtc_options[0] = "--device-as-default-execution-space";
+          nvrtc_options[1] = "--gpu-architecture";
+
+          hc_asprintf (&nvrtc_options[2], "compute_%d%d", device_param->sm_major, device_param->sm_minor);
+
+          char *nvrtc_options_string = hcstrdup (build_options_module_buf);
+
+          const int num_options = 3 + nvrtc_make_options_array_from_string (nvrtc_options_string, nvrtc_options + 3);
+
+          const int rc_nvrtcCompileProgram = hc_nvrtcCompileProgram (hashcat_ctx, program, num_options, (const char * const *) nvrtc_options);
+
+          size_t build_log_size = 0;
+
+          hc_nvrtcGetProgramLogSize (hashcat_ctx, program, &build_log_size);
+
+          #if defined (DEBUG)
+          if ((build_log_size > 1) || (rc_nvrtcCompileProgram == -1))
+          #else
+          if (rc_nvrtcCompileProgram == -1)
+          #endif
+          {
+            char *build_log = (char *) hcmalloc (build_log_size + 1);
+
+            const int rc_nvrtcGetProgramLog = hc_nvrtcGetProgramLog (hashcat_ctx, program, build_log);
+
+            if (rc_nvrtcGetProgramLog == -1) return -1;
+
+            puts (build_log);
+
+            hcfree (build_log);
+          }
+
+          if (rc_nvrtcCompileProgram == -1)
+          {
+            device_param->skipped_warning = true;
+
+            event_log_error (hashcat_ctx, "* Device #%u: Kernel %s build failed - proceeding without this device.", device_id + 1, source_file);
+
+            continue;
+          }
+
+          hcfree (nvrtc_options);
+          hcfree (nvrtc_options_string);
+
+          size_t binary_size;
+
+          const int rc_nvrtcGetPTXSize = hc_nvrtcGetPTXSize (hashcat_ctx, program, &binary_size);
+
+          if (rc_nvrtcGetPTXSize == -1) return -1;
+
+          char *binary = (char *) hcmalloc (binary_size);
+
+          const int nvrtcGetPTX = hc_nvrtcGetPTX (hashcat_ctx, program, binary);
+
+          if (nvrtcGetPTX == -1) return -1;
+
+          const int rc_nvrtcDestroyProgram = hc_nvrtcDestroyProgram (hashcat_ctx, &program);
+
+          if (rc_nvrtcDestroyProgram == -1) return -1;
+
+          // tbd: check for some useful options
+
+          const int rc_cuModuleLoadDataEx = hc_cuModuleLoadDataEx (hashcat_ctx, &device_param->cuda_module, binary, 0, NULL, NULL);
+
+          if (rc_cuModuleLoadDataEx == -1) return -1;
+
+          if (cache_disable == false)
+          {
+            const bool rc_write = write_kernel_binary (hashcat_ctx, cached_file, binary, binary_size);
+
+            if (rc_write == false) return -1;
+          }
+
+          hcfree (binary);
+        }
+
+        if (device_param->is_opencl == true)
+        {
+          CL_rc = hc_clCreateProgramWithSource (hashcat_ctx, device_param->opencl_context, 1, (const char **) kernel_sources, NULL, &device_param->opencl_program);
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = hc_clBuildProgram (hashcat_ctx, device_param->opencl_program, 1, &device_param->opencl_device, build_options_module_buf, NULL, NULL);
+
+          //if (CL_rc == -1) return -1;
+
+          size_t build_log_size = 0;
+
+          hc_clGetProgramBuildInfo (hashcat_ctx, device_param->opencl_program, device_param->opencl_device, CL_PROGRAM_BUILD_LOG, 0, NULL, &build_log_size);
+
+          //if (CL_rc == -1) return -1;
+
+          #if defined (DEBUG)
+          if ((build_log_size > 1) || (CL_rc == -1))
+          #else
+          if (CL_rc == -1)
+          #endif
+          {
+            char *build_log = (char *) hcmalloc (build_log_size + 1);
+
+            const int rc_clGetProgramBuildInfo = hc_clGetProgramBuildInfo (hashcat_ctx, device_param->opencl_program, device_param->opencl_device, CL_PROGRAM_BUILD_LOG, build_log_size, build_log, NULL);
+
+            if (rc_clGetProgramBuildInfo == -1) return -1;
+
+            puts (build_log);
+
+            hcfree (build_log);
+          }
+
+          if (CL_rc == -1)
+          {
+            device_param->skipped_warning = true;
+
+            event_log_error (hashcat_ctx, "* Device #%u: Kernel %s build failed - proceeding without this device.", device_id + 1, source_file);
+
+            continue;
+          }
+
+          if (cache_disable == false)
+          {
+            size_t binary_size;
+
+            CL_rc = hc_clGetProgramInfo (hashcat_ctx, device_param->opencl_program, CL_PROGRAM_BINARY_SIZES, sizeof (size_t), &binary_size, NULL);
+
+            if (CL_rc == -1) return -1;
+
+            char *binary = (char *) hcmalloc (binary_size);
+
+            CL_rc = hc_clGetProgramInfo (hashcat_ctx, device_param->opencl_program, CL_PROGRAM_BINARIES, sizeof (char *), &binary, NULL);
+
+            if (CL_rc == -1) return -1;
+
+            const bool rc_write = write_kernel_binary (hashcat_ctx, cached_file, binary, binary_size);
+
+            if (rc_write == false) return -1;
+
+            hcfree (binary);
+          }
+        }
+      }
+      else
+      {
+        const bool rc_read_kernel = read_kernel_binary (hashcat_ctx, cached_file, kernel_lengths, kernel_sources, false);
+
+        if (rc_read_kernel == false) return -1;
+
+        if (device_param->is_cuda == true)
+        {
+          const int rc_cuModuleLoadDataEx = hc_cuModuleLoadDataEx (hashcat_ctx, &device_param->cuda_module, kernel_sources[0], 0, NULL, NULL);
+
+          if (rc_cuModuleLoadDataEx == -1) return -1;
+        }
+
+        if (device_param->is_opencl == true)
+        {
+          CL_rc = hc_clCreateProgramWithBinary (hashcat_ctx, device_param->opencl_context, 1, &device_param->opencl_device, kernel_lengths, (const unsigned char **) kernel_sources, NULL, &device_param->opencl_program);
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = hc_clBuildProgram (hashcat_ctx, device_param->opencl_program, 1, &device_param->opencl_device, build_options_module_buf, NULL, NULL);
+
+          if (CL_rc == -1) return -1;
+        }
+      }
+
+      hcfree (kernel_sources[0]);
+    }
+
+    hcfree (build_options_module_buf);
+
+    /**
+     * word generator kernel
+     */
+
+    if (user_options->slow_candidates == true)
+    {
+    }
+    else
+    {
+      if (user_options->attack_mode != ATTACK_MODE_STRAIGHT)
+      {
+        /**
+         * kernel mp source filename
+         */
+
+        char source_file[256] = { 0 };
+
+        generate_source_kernel_mp_filename (hashconfig->opti_type, hashconfig->opts_type, folder_config->shared_dir, source_file);
+
+        if (hc_path_read (source_file) == false)
+        {
+          event_log_error (hashcat_ctx, "%s: %s", source_file, strerror (errno));
+
+          return -1;
+        }
+
+        /**
+         * kernel mp cached filename
+         */
+
+        char cached_file[256] = { 0 };
+
+        generate_cached_kernel_mp_filename (hashconfig->opti_type, hashconfig->opts_type, folder_config->profile_dir, device_name_chksum_amp_mp, cached_file);
+
+        bool cached = true;
+
+        if (cache_disable == true)
+        {
+          cached = false;
+        }
+
+        if (hc_path_read (cached_file) == false)
+        {
+          cached = false;
+        }
+
+        if (hc_path_is_empty (cached_file) == true)
+        {
+          cached = false;
+        }
+
+        /**
+         * kernel compile or load
+         */
+
+        size_t kernel_lengths_buf = 0;
+
+        size_t *kernel_lengths = &kernel_lengths_buf;
+
+        char *kernel_sources_buf = NULL;
+
+        char **kernel_sources = &kernel_sources_buf;
+
+        if (cached == false)
+        {
+          #if defined (DEBUG)
+          if (user_options->quiet == false) event_log_warning (hashcat_ctx, "* Device #%u: Kernel %s not found in cache! Building may take a while...", device_id + 1, filename_from_filepath (cached_file));
+          #endif
+
+          const bool rc_read_kernel = read_kernel_binary (hashcat_ctx, source_file, kernel_lengths, kernel_sources, true);
+
+          if (rc_read_kernel == false) return -1;
+
+          if (device_param->is_cuda == true)
+          {
+            nvrtcProgram program;
+
+            const int rc_nvrtcCreateProgram = hc_nvrtcCreateProgram (hashcat_ctx, &program, kernel_sources[0], "mp_kernel", 0, NULL, NULL);
+
+            if (rc_nvrtcCreateProgram == -1) return -1;
+
+            char **nvrtc_options = (char **) hccalloc (3 + strlen (build_options_buf) + 1, sizeof (char *)); // ...
+
+            nvrtc_options[0] = "--device-as-default-execution-space";
+            nvrtc_options[1] = "--gpu-architecture";
+
+            hc_asprintf (&nvrtc_options[2], "compute_%d%d", device_param->sm_major, device_param->sm_minor);
+
+            char *nvrtc_options_string = hcstrdup (build_options_buf);
+
+            const int num_options = 3 + nvrtc_make_options_array_from_string (nvrtc_options_string, nvrtc_options + 3);
+
+            const int rc_nvrtcCompileProgram = hc_nvrtcCompileProgram (hashcat_ctx, program, num_options, (const char * const *) nvrtc_options);
+
+            size_t build_log_size = 0;
+
+            hc_nvrtcGetProgramLogSize (hashcat_ctx, program, &build_log_size);
+
+            #if defined (DEBUG)
+            if ((build_log_size > 1) || (rc_nvrtcCompileProgram == -1))
+            #else
+            if (rc_nvrtcCompileProgram == -1)
+            #endif
+            {
+              char *build_log = (char *) hcmalloc (build_log_size + 1);
+
+              const int rc_nvrtcGetProgramLog = hc_nvrtcGetProgramLog (hashcat_ctx, program, build_log);
+
+              if (rc_nvrtcGetProgramLog == -1) return -1;
+
+              puts (build_log);
+
+              hcfree (build_log);
+            }
+
+            if (rc_nvrtcCompileProgram == -1)
+            {
+              device_param->skipped_warning = true;
+
+              event_log_error (hashcat_ctx, "* Device #%u: Kernel %s build failed - proceeding without this device.", device_id + 1, source_file);
+
+              continue;
+            }
+
+            hcfree (nvrtc_options);
+            hcfree (nvrtc_options_string);
+
+            size_t binary_size;
+
+            const int rc_nvrtcGetPTXSize = hc_nvrtcGetPTXSize (hashcat_ctx, program, &binary_size);
+
+            if (rc_nvrtcGetPTXSize == -1) return -1;
+
+            char *binary = (char *) hcmalloc (binary_size);
+
+            const int nvrtcGetPTX = hc_nvrtcGetPTX (hashcat_ctx, program, binary);
+
+            if (nvrtcGetPTX == -1) return -1;
+
+            const int rc_nvrtcDestroyProgram = hc_nvrtcDestroyProgram (hashcat_ctx, &program);
+
+            if (rc_nvrtcDestroyProgram == -1) return -1;
+
+            // tbd: check for some useful options
+
+            const int rc_cuModuleLoadDataEx = hc_cuModuleLoadDataEx (hashcat_ctx, &device_param->cuda_module_mp, binary, 0, NULL, NULL);
+
+            if (rc_cuModuleLoadDataEx == -1) return -1;
+
+            if (cache_disable == false)
+            {
+              const bool rc_write = write_kernel_binary (hashcat_ctx, cached_file, binary, binary_size);
+
+              if (rc_write == false) return -1;
+            }
+
+            hcfree (binary);
+          }
+
+          if (device_param->is_opencl == true)
+          {
+            CL_rc = hc_clCreateProgramWithSource (hashcat_ctx, device_param->opencl_context, 1, (const char **) kernel_sources, NULL, &device_param->opencl_program_mp);
+
+            if (CL_rc == -1) return -1;
+
+            CL_rc = hc_clBuildProgram (hashcat_ctx, device_param->opencl_program_mp, 1, &device_param->opencl_device, build_options_buf, NULL, NULL);
+
+            //if (CL_rc == -1) return -1;
+
+            size_t build_log_size = 0;
+
+            hc_clGetProgramBuildInfo (hashcat_ctx, device_param->opencl_program_mp, device_param->opencl_device, CL_PROGRAM_BUILD_LOG, 0, NULL, &build_log_size);
+
+            //if (CL_rc == -1) return -1;
+
+            #if defined (DEBUG)
+            if ((build_log_size > 1) || (CL_rc == -1))
+            #else
+            if (CL_rc == -1)
+            #endif
+            {
+              char *build_log = (char *) hcmalloc (build_log_size + 1);
+
+              const int rc_clGetProgramBuildInfo = hc_clGetProgramBuildInfo (hashcat_ctx, device_param->opencl_program_mp, device_param->opencl_device, CL_PROGRAM_BUILD_LOG, build_log_size, build_log, NULL);
+
+              if (rc_clGetProgramBuildInfo == -1) return -1;
+
+              puts (build_log);
+
+              hcfree (build_log);
+            }
+
+            if (CL_rc == -1)
+            {
+              device_param->skipped_warning = true;
+
+              event_log_error (hashcat_ctx, "* Device #%u: Kernel %s build failed - proceeding without this device.", device_id + 1, source_file);
+
+              continue;
+            }
+
+            if (cache_disable == false)
+            {
+              size_t binary_size;
+
+              CL_rc = hc_clGetProgramInfo (hashcat_ctx, device_param->opencl_program_mp, CL_PROGRAM_BINARY_SIZES, sizeof (size_t), &binary_size, NULL);
+
+              if (CL_rc == -1) return -1;
+
+              char *binary = (char *) hcmalloc (binary_size);
+
+              CL_rc = hc_clGetProgramInfo (hashcat_ctx, device_param->opencl_program_mp, CL_PROGRAM_BINARIES, sizeof (char *), &binary, NULL);
+
+              if (CL_rc == -1) return -1;
+
+              write_kernel_binary (hashcat_ctx, cached_file, binary, binary_size);
+
+              hcfree (binary);
+            }
+          }
+        }
+        else
+        {
+          const bool rc_read_kernel = read_kernel_binary (hashcat_ctx, cached_file, kernel_lengths, kernel_sources, false);
+
+          if (rc_read_kernel == false) return -1;
+
+          if (device_param->is_cuda == true)
+          {
+            const int rc_cuModuleLoadDataEx = hc_cuModuleLoadDataEx (hashcat_ctx, &device_param->cuda_module_mp, kernel_sources[0], 0, NULL, NULL);
+
+            if (rc_cuModuleLoadDataEx == -1) return -1;
+          }
+
+          if (device_param->is_opencl == true)
+          {
+            CL_rc = hc_clCreateProgramWithBinary (hashcat_ctx, device_param->opencl_context, 1, &device_param->opencl_device, kernel_lengths, (const unsigned char **) kernel_sources, NULL, &device_param->opencl_program_mp);
+
+            if (CL_rc == -1) return -1;
+
+            CL_rc = hc_clBuildProgram (hashcat_ctx, device_param->opencl_program_mp, 1, &device_param->opencl_device, build_options_buf, NULL, NULL);
+
+            if (CL_rc == -1) return -1;
+          }
+        }
+
+        hcfree (kernel_sources[0]);
+      }
+    }
+
+    /**
+     * amplifier kernel
+     */
+
+    if (user_options->slow_candidates == true)
+    {
+    }
+    else
+    {
+      if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
+      {
+
+      }
+      else
+      {
+        /**
+         * kernel amp source filename
+         */
+
+        char source_file[256] = { 0 };
+
+        generate_source_kernel_amp_filename (user_options_extra->attack_kern, folder_config->shared_dir, source_file);
+
+        if (hc_path_read (source_file) == false)
+        {
+          event_log_error (hashcat_ctx, "%s: %s", source_file, strerror (errno));
+
+          return -1;
+        }
+
+        /**
+         * kernel amp cached filename
+         */
+
+        char cached_file[256] = { 0 };
+
+        generate_cached_kernel_amp_filename (user_options_extra->attack_kern, folder_config->profile_dir, device_name_chksum_amp_mp, cached_file);
+
+        bool cached = true;
+
+        if (cache_disable == true)
+        {
+          cached = false;
+        }
+
+        if (hc_path_read (cached_file) == false)
+        {
+          cached = false;
+        }
+
+        if (hc_path_is_empty (cached_file) == true)
+        {
+          cached = false;
+        }
+
+        /**
+         * kernel compile or load
+         */
+
+        size_t kernel_lengths_buf = 0;
+
+        size_t *kernel_lengths = &kernel_lengths_buf;
+
+        char *kernel_sources_buf = NULL;
+
+        char **kernel_sources = &kernel_sources_buf;
+
+        if (cached == false)
+        {
+          #if defined (DEBUG)
+          if (user_options->quiet == false) event_log_warning (hashcat_ctx, "* Device #%u: Kernel %s not found in cache! Building may take a while...", device_id + 1, filename_from_filepath (cached_file));
+          #endif
+
+          const bool rc_read_kernel = read_kernel_binary (hashcat_ctx, source_file, kernel_lengths, kernel_sources, true);
+
+          if (rc_read_kernel == false) return -1;
+
+          if (device_param->is_cuda == true)
+          {
+            nvrtcProgram program;
+
+            const int rc_nvrtcCreateProgram = hc_nvrtcCreateProgram (hashcat_ctx, &program, kernel_sources[0], "mp_kernel", 0, NULL, NULL);
+
+            if (rc_nvrtcCreateProgram == -1) return -1;
+
+            char **nvrtc_options = (char **) hccalloc (3 + strlen (build_options_buf) + 1, sizeof (char *)); // ...
+
+            nvrtc_options[0] = "--device-as-default-execution-space";
+            nvrtc_options[1] = "--gpu-architecture";
+
+            hc_asprintf (&nvrtc_options[2], "compute_%d%d", device_param->sm_major, device_param->sm_minor);
+
+            char *nvrtc_options_string = hcstrdup (build_options_buf);
+
+            const int num_options = 3 + nvrtc_make_options_array_from_string (nvrtc_options_string, nvrtc_options + 3);
+
+            const int rc_nvrtcCompileProgram = hc_nvrtcCompileProgram (hashcat_ctx, program, num_options, (const char * const *) nvrtc_options);
+
+            size_t build_log_size = 0;
+
+            hc_nvrtcGetProgramLogSize (hashcat_ctx, program, &build_log_size);
+
+            #if defined (DEBUG)
+            if ((build_log_size > 1) || (rc_nvrtcCompileProgram == -1))
+            #else
+            if (rc_nvrtcCompileProgram == -1)
+            #endif
+            {
+              char *build_log = (char *) hcmalloc (build_log_size + 1);
+
+              const int rc_nvrtcGetProgramLog = hc_nvrtcGetProgramLog (hashcat_ctx, program, build_log);
+
+              if (rc_nvrtcGetProgramLog == -1) return -1;
+
+              puts (build_log);
+
+              hcfree (build_log);
+            }
+
+            if (rc_nvrtcCompileProgram == -1)
+            {
+              device_param->skipped_warning = true;
+
+              event_log_error (hashcat_ctx, "* Device #%u: Kernel %s build failed - proceeding without this device.", device_id + 1, source_file);
+
+              continue;
+            }
+
+            hcfree (nvrtc_options);
+            hcfree (nvrtc_options_string);
+
+            size_t binary_size;
+
+            const int rc_nvrtcGetPTXSize = hc_nvrtcGetPTXSize (hashcat_ctx, program, &binary_size);
+
+            if (rc_nvrtcGetPTXSize == -1) return -1;
+
+            char *binary = (char *) hcmalloc (binary_size);
+
+            const int nvrtcGetPTX = hc_nvrtcGetPTX (hashcat_ctx, program, binary);
+
+            if (nvrtcGetPTX == -1) return -1;
+
+            const int rc_nvrtcDestroyProgram = hc_nvrtcDestroyProgram (hashcat_ctx, &program);
+
+            if (rc_nvrtcDestroyProgram == -1) return -1;
+
+            // tbd: check for some useful options
+
+            const int rc_cuModuleLoadDataEx = hc_cuModuleLoadDataEx (hashcat_ctx, &device_param->cuda_module_amp, binary, 0, NULL, NULL);
+
+            if (rc_cuModuleLoadDataEx == -1) return -1;
+
+            if (cache_disable == false)
+            {
+              const bool rc_write = write_kernel_binary (hashcat_ctx, cached_file, binary, binary_size);
+
+              if (rc_write == false) return -1;
+            }
+
+            hcfree (binary);
+          }
+
+          if (device_param->is_opencl == true)
+          {
+            CL_rc = hc_clCreateProgramWithSource (hashcat_ctx, device_param->opencl_context, 1, (const char **) kernel_sources, NULL, &device_param->opencl_program_amp);
+
+            if (CL_rc == -1) return -1;
+
+            CL_rc = hc_clBuildProgram (hashcat_ctx, device_param->opencl_program_amp, 1, &device_param->opencl_device, build_options_buf, NULL, NULL);
+
+            //if (CL_rc == -1) return -1;
+
+            size_t build_log_size = 0;
+
+            hc_clGetProgramBuildInfo (hashcat_ctx, device_param->opencl_program_amp, device_param->opencl_device, CL_PROGRAM_BUILD_LOG, 0, NULL, &build_log_size);
+
+            //if (CL_rc == -1) return -1;
+
+            #if defined (DEBUG)
+            if ((build_log_size > 1) || (CL_rc == -1))
+            #else
+            if (CL_rc == -1)
+            #endif
+            {
+              char *build_log = (char *) hcmalloc (build_log_size + 1);
+
+              const int rc_clGetProgramBuildInfo = hc_clGetProgramBuildInfo (hashcat_ctx, device_param->opencl_program_amp, device_param->opencl_device, CL_PROGRAM_BUILD_LOG, build_log_size, build_log, NULL);
+
+              if (rc_clGetProgramBuildInfo == -1) return -1;
+
+              puts (build_log);
+
+              hcfree (build_log);
+            }
+
+            if (CL_rc == -1)
+            {
+              device_param->skipped_warning = true;
+
+              event_log_error (hashcat_ctx, "* Device #%u: Kernel %s build failed - proceeding without this device.", device_id + 1, source_file);
+
+              continue;
+            }
+
+            if (cache_disable == false)
+            {
+              size_t binary_size;
+
+              CL_rc = hc_clGetProgramInfo (hashcat_ctx, device_param->opencl_program_amp, CL_PROGRAM_BINARY_SIZES, sizeof (size_t), &binary_size, NULL);
+
+              if (CL_rc == -1) return -1;
+
+              char *binary = (char *) hcmalloc (binary_size);
+
+              CL_rc = hc_clGetProgramInfo (hashcat_ctx, device_param->opencl_program_amp, CL_PROGRAM_BINARIES, sizeof (char *), &binary, NULL);
+
+              if (CL_rc == -1) return -1;
+
+              write_kernel_binary (hashcat_ctx, cached_file, binary, binary_size);
+
+              hcfree (binary);
+            }
+          }
+        }
+        else
+        {
+          const bool rc_read_kernel = read_kernel_binary (hashcat_ctx, cached_file, kernel_lengths, kernel_sources, false);
+
+          if (rc_read_kernel == false) return -1;
+
+          if (device_param->is_cuda == true)
+          {
+            const int rc_cuModuleLoadDataEx = hc_cuModuleLoadDataEx (hashcat_ctx, &device_param->cuda_module_amp, kernel_sources[0], 0, NULL, NULL);
+
+            if (rc_cuModuleLoadDataEx == -1) return -1;
+          }
+
+          if (device_param->is_opencl == true)
+          {
+            CL_rc = hc_clCreateProgramWithBinary (hashcat_ctx, device_param->opencl_context, 1, &device_param->opencl_device, kernel_lengths, (const unsigned char **) kernel_sources, NULL, &device_param->opencl_program_amp);
+
+            if (CL_rc == -1) return -1;
+
+            CL_rc = hc_clBuildProgram (hashcat_ctx, device_param->opencl_program_amp, 1, &device_param->opencl_device, build_options_buf, NULL, NULL);
+
+            if (CL_rc == -1) return -1;
+          }
+        }
+
+        hcfree (kernel_sources[0]);
+
+        hcfree (build_options_buf);
+      }
+    }
+
+    hcfree (device_name_chksum);
+    hcfree (device_name_chksum_amp_mp);
+
+    // some algorithm collide too fast, make that impossible
+
+    if (user_options->benchmark == true)
+    {
+      ((u32 *) hashes->digests_buf)[0] = -1u;
+      ((u32 *) hashes->digests_buf)[1] = -1u;
+      ((u32 *) hashes->digests_buf)[2] = -1u;
+      ((u32 *) hashes->digests_buf)[3] = -1u;
+    }
+
+    /**
+     * global buffers
+     */
+
+    if (device_param->is_cuda == true)
+    {
+      CU_rc = hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_bitmap_s1_a,    bitmap_ctx->bitmap_size); if (CU_rc == -1) return -1;
+      CU_rc = hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_bitmap_s1_b,    bitmap_ctx->bitmap_size); if (CU_rc == -1) return -1;
+      CU_rc = hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_bitmap_s1_c,    bitmap_ctx->bitmap_size); if (CU_rc == -1) return -1;
+      CU_rc = hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_bitmap_s1_d,    bitmap_ctx->bitmap_size); if (CU_rc == -1) return -1;
+      CU_rc = hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_bitmap_s2_a,    bitmap_ctx->bitmap_size); if (CU_rc == -1) return -1;
+      CU_rc = hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_bitmap_s2_b,    bitmap_ctx->bitmap_size); if (CU_rc == -1) return -1;
+      CU_rc = hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_bitmap_s2_c,    bitmap_ctx->bitmap_size); if (CU_rc == -1) return -1;
+      CU_rc = hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_bitmap_s2_d,    bitmap_ctx->bitmap_size); if (CU_rc == -1) return -1;
+      CU_rc = hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_plain_bufs,     size_plains);             if (CU_rc == -1) return -1;
+      CU_rc = hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_digests_buf,    size_digests);            if (CU_rc == -1) return -1;
+      CU_rc = hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_digests_shown,  size_shown);              if (CU_rc == -1) return -1;
+      CU_rc = hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_salt_bufs,      size_salts);              if (CU_rc == -1) return -1;
+      CU_rc = hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_result,         size_results);            if (CU_rc == -1) return -1;
+      CU_rc = hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_extra0_buf,     size_extra_buffer / 4);   if (CU_rc == -1) return -1;
+      CU_rc = hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_extra1_buf,     size_extra_buffer / 4);   if (CU_rc == -1) return -1;
+      CU_rc = hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_extra2_buf,     size_extra_buffer / 4);   if (CU_rc == -1) return -1;
+      CU_rc = hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_extra3_buf,     size_extra_buffer / 4);   if (CU_rc == -1) return -1;
+      CU_rc = hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_st_digests_buf, size_st_digests);         if (CU_rc == -1) return -1;
+      CU_rc = hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_st_salts_buf,   size_st_salts);           if (CU_rc == -1) return -1;
+
+      CU_rc = hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_bitmap_s1_a, bitmap_ctx->bitmap_s1_a, bitmap_ctx->bitmap_size); if (CU_rc == -1) return -1;
+      CU_rc = hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_bitmap_s1_b, bitmap_ctx->bitmap_s1_b, bitmap_ctx->bitmap_size); if (CU_rc == -1) return -1;
+      CU_rc = hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_bitmap_s1_c, bitmap_ctx->bitmap_s1_c, bitmap_ctx->bitmap_size); if (CU_rc == -1) return -1;
+      CU_rc = hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_bitmap_s1_d, bitmap_ctx->bitmap_s1_d, bitmap_ctx->bitmap_size); if (CU_rc == -1) return -1;
+      CU_rc = hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_bitmap_s2_a, bitmap_ctx->bitmap_s2_a, bitmap_ctx->bitmap_size); if (CU_rc == -1) return -1;
+      CU_rc = hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_bitmap_s2_b, bitmap_ctx->bitmap_s2_b, bitmap_ctx->bitmap_size); if (CU_rc == -1) return -1;
+      CU_rc = hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_bitmap_s2_c, bitmap_ctx->bitmap_s2_c, bitmap_ctx->bitmap_size); if (CU_rc == -1) return -1;
+      CU_rc = hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_bitmap_s2_d, bitmap_ctx->bitmap_s2_d, bitmap_ctx->bitmap_size); if (CU_rc == -1) return -1;
+      CU_rc = hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_digests_buf, hashes->digests_buf,     size_digests);            if (CU_rc == -1) return -1;
+      CU_rc = hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_salt_bufs,   hashes->salts_buf,       size_salts);              if (CU_rc == -1) return -1;
+
+      /**
+       * special buffers
+       */
+
+      if (user_options->slow_candidates == true)
+      {
+        CU_rc = hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_rules_c, size_rules_c); if (CU_rc == -1) return -1;
+      }
+      else
+      {
+        if (user_options_extra->attack_kern == ATTACK_KERN_STRAIGHT)
+        {
+          CU_rc = hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_rules,   size_rules);   if (CU_rc == -1) return -1;
+
+          if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
+          {
+            size_t dummy;
+
+            CU_rc = hc_cuModuleGetGlobal (hashcat_ctx, &device_param->cuda_d_rules_c, &dummy, device_param->cuda_module, "generic_constant"); if (CU_rc == -1) return -1;
+          }
+          else
+          {
+            CU_rc = hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_rules_c, size_rules_c); if (CU_rc == -1) return -1;
+          }
+
+          CU_rc = hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_rules, straight_ctx->kernel_rules_buf, size_rules); if (CU_rc == -1) return -1;
+        }
+        else if (user_options_extra->attack_kern == ATTACK_KERN_COMBI)
+        {
+          CU_rc = hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_combs,          size_combs);      if (CU_rc == -1) return -1;
+          CU_rc = hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_combs_c,        size_combs);      if (CU_rc == -1) return -1;
+          CU_rc = hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_root_css_buf,   size_root_css);   if (CU_rc == -1) return -1;
+          CU_rc = hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_markov_css_buf, size_markov_css); if (CU_rc == -1) return -1;
+        }
+        else if (user_options_extra->attack_kern == ATTACK_KERN_BF)
+        {
+          CU_rc = hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_bfs,            size_bfs);        if (CU_rc == -1) return -1;
+          CU_rc = hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_root_css_buf,   size_root_css);   if (CU_rc == -1) return -1;
+          CU_rc = hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_markov_css_buf, size_markov_css); if (CU_rc == -1) return -1;
+
+          if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
+          {
+            size_t dummy;
+
+            CU_rc = hc_cuModuleGetGlobal (hashcat_ctx, &device_param->cuda_d_bfs_c, &dummy, device_param->cuda_module, "generic_constant"); if (CU_rc == -1) return -1;
+            CU_rc = hc_cuModuleGetGlobal (hashcat_ctx, &device_param->cuda_d_tm_c,  &dummy, device_param->cuda_module, "generic_constant"); if (CU_rc == -1) return -1;
+          }
+          else
+          {
+            CU_rc = hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_bfs_c,          size_bfs);        if (CU_rc == -1) return -1;
+            CU_rc = hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_tm_c,           size_tm);         if (CU_rc == -1) return -1;
+          }
+        }
+      }
+
+      if (size_esalts)
+      {
+        CU_rc = hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_esalt_bufs, size_esalts);
+
+        if (CU_rc == -1) return -1;
+
+        CU_rc = hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_esalt_bufs, hashes->esalts_buf, size_esalts);
+
+        if (CU_rc == -1) return -1;
+      }
+
+      if (hashconfig->st_hash != NULL)
+      {
+        CU_rc = hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_st_digests_buf, hashes->st_digests_buf, size_st_digests); if (CU_rc == -1) return -1;
+        CU_rc = hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_st_salts_buf,   hashes->st_salts_buf,   size_st_salts);   if (CU_rc == -1) return -1;
+
+        if (size_esalts)
+        {
+          CU_rc = hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_st_esalts_buf, size_st_esalts);
+
+          if (CU_rc == -1) return -1;
+
+          CU_rc = hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_st_esalts_buf, hashes->st_esalts_buf, size_st_esalts);
+
+          if (CU_rc == -1) return -1;
+        }
+      }
+    }
+
+    if (device_param->is_opencl == true)
+    {
+      CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY,   bitmap_ctx->bitmap_size, NULL, &device_param->opencl_d_bitmap_s1_a);    if (CL_rc == -1) return -1;
+      CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY,   bitmap_ctx->bitmap_size, NULL, &device_param->opencl_d_bitmap_s1_b);    if (CL_rc == -1) return -1;
+      CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY,   bitmap_ctx->bitmap_size, NULL, &device_param->opencl_d_bitmap_s1_c);    if (CL_rc == -1) return -1;
+      CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY,   bitmap_ctx->bitmap_size, NULL, &device_param->opencl_d_bitmap_s1_d);    if (CL_rc == -1) return -1;
+      CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY,   bitmap_ctx->bitmap_size, NULL, &device_param->opencl_d_bitmap_s2_a);    if (CL_rc == -1) return -1;
+      CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY,   bitmap_ctx->bitmap_size, NULL, &device_param->opencl_d_bitmap_s2_b);    if (CL_rc == -1) return -1;
+      CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY,   bitmap_ctx->bitmap_size, NULL, &device_param->opencl_d_bitmap_s2_c);    if (CL_rc == -1) return -1;
+      CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY,   bitmap_ctx->bitmap_size, NULL, &device_param->opencl_d_bitmap_s2_d);    if (CL_rc == -1) return -1;
+      CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE,  size_plains,             NULL, &device_param->opencl_d_plain_bufs);     if (CL_rc == -1) return -1;
+      CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY,   size_digests,            NULL, &device_param->opencl_d_digests_buf);    if (CL_rc == -1) return -1;
+      CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE,  size_shown,              NULL, &device_param->opencl_d_digests_shown);  if (CL_rc == -1) return -1;
+      CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY,   size_salts,              NULL, &device_param->opencl_d_salt_bufs);      if (CL_rc == -1) return -1;
+      CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE,  size_results,            NULL, &device_param->opencl_d_result);         if (CL_rc == -1) return -1;
+      CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE,  size_extra_buffer / 4,   NULL, &device_param->opencl_d_extra0_buf);     if (CL_rc == -1) return -1;
+      CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE,  size_extra_buffer / 4,   NULL, &device_param->opencl_d_extra1_buf);     if (CL_rc == -1) return -1;
+      CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE,  size_extra_buffer / 4,   NULL, &device_param->opencl_d_extra2_buf);     if (CL_rc == -1) return -1;
+      CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE,  size_extra_buffer / 4,   NULL, &device_param->opencl_d_extra3_buf);     if (CL_rc == -1) return -1;
+      CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY,   size_st_digests,         NULL, &device_param->opencl_d_st_digests_buf); if (CL_rc == -1) return -1;
+      CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY,   size_st_salts,           NULL, &device_param->opencl_d_st_salts_buf);   if (CL_rc == -1) return -1;
+
+      CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_bitmap_s1_a, CL_TRUE, 0, bitmap_ctx->bitmap_size, bitmap_ctx->bitmap_s1_a, 0, NULL, NULL); if (CL_rc == -1) return -1;
+      CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_bitmap_s1_b, CL_TRUE, 0, bitmap_ctx->bitmap_size, bitmap_ctx->bitmap_s1_b, 0, NULL, NULL); if (CL_rc == -1) return -1;
+      CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_bitmap_s1_c, CL_TRUE, 0, bitmap_ctx->bitmap_size, bitmap_ctx->bitmap_s1_c, 0, NULL, NULL); if (CL_rc == -1) return -1;
+      CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_bitmap_s1_d, CL_TRUE, 0, bitmap_ctx->bitmap_size, bitmap_ctx->bitmap_s1_d, 0, NULL, NULL); if (CL_rc == -1) return -1;
+      CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_bitmap_s2_a, CL_TRUE, 0, bitmap_ctx->bitmap_size, bitmap_ctx->bitmap_s2_a, 0, NULL, NULL); if (CL_rc == -1) return -1;
+      CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_bitmap_s2_b, CL_TRUE, 0, bitmap_ctx->bitmap_size, bitmap_ctx->bitmap_s2_b, 0, NULL, NULL); if (CL_rc == -1) return -1;
+      CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_bitmap_s2_c, CL_TRUE, 0, bitmap_ctx->bitmap_size, bitmap_ctx->bitmap_s2_c, 0, NULL, NULL); if (CL_rc == -1) return -1;
+      CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_bitmap_s2_d, CL_TRUE, 0, bitmap_ctx->bitmap_size, bitmap_ctx->bitmap_s2_d, 0, NULL, NULL); if (CL_rc == -1) return -1;
+      CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_digests_buf, CL_TRUE, 0, size_digests,            hashes->digests_buf,     0, NULL, NULL); if (CL_rc == -1) return -1;
+      CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_salt_bufs,   CL_TRUE, 0, size_salts,              hashes->salts_buf,       0, NULL, NULL); if (CL_rc == -1) return -1;
+
+      /**
+       * special buffers
+       */
+
+      if (user_options->slow_candidates == true)
+      {
+        CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_rules_c, NULL, &device_param->opencl_d_rules_c); if (CL_rc == -1) return -1;
+      }
+      else
+      {
+        if (user_options_extra->attack_kern == ATTACK_KERN_STRAIGHT)
+        {
+          CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_rules,   NULL, &device_param->opencl_d_rules);   if (CL_rc == -1) return -1;
+          CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_rules_c, NULL, &device_param->opencl_d_rules_c); if (CL_rc == -1) return -1;
+
+          CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_rules, CL_TRUE, 0, size_rules, straight_ctx->kernel_rules_buf, 0, NULL, NULL); if (CL_rc == -1) return -1;
+        }
+        else if (user_options_extra->attack_kern == ATTACK_KERN_COMBI)
+        {
+          CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_combs,      NULL, &device_param->opencl_d_combs);          if (CL_rc == -1) return -1;
+          CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_combs,      NULL, &device_param->opencl_d_combs_c);        if (CL_rc == -1) return -1;
+          CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_root_css,   NULL, &device_param->opencl_d_root_css_buf);   if (CL_rc == -1) return -1;
+          CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_markov_css, NULL, &device_param->opencl_d_markov_css_buf); if (CL_rc == -1) return -1;
+        }
+        else if (user_options_extra->attack_kern == ATTACK_KERN_BF)
+        {
+          CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_bfs,        NULL, &device_param->opencl_d_bfs);            if (CL_rc == -1) return -1;
+          CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_bfs,        NULL, &device_param->opencl_d_bfs_c);          if (CL_rc == -1) return -1;
+          CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_tm,         NULL, &device_param->opencl_d_tm_c);           if (CL_rc == -1) return -1;
+          CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_root_css,   NULL, &device_param->opencl_d_root_css_buf);   if (CL_rc == -1) return -1;
+          CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_markov_css, NULL, &device_param->opencl_d_markov_css_buf); if (CL_rc == -1) return -1;
+        }
+      }
+
+      if (size_esalts)
+      {
+        CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_esalts, NULL, &device_param->opencl_d_esalt_bufs);
+
+        if (CL_rc == -1) return -1;
+
+        CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_esalt_bufs, CL_TRUE, 0, size_esalts, hashes->esalts_buf, 0, NULL, NULL);
+
+        if (CL_rc == -1) return -1;
+      }
+
+      if (hashconfig->st_hash != NULL)
+      {
+        CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_st_digests_buf,  CL_TRUE, 0, size_st_digests,         hashes->st_digests_buf,  0, NULL, NULL); if (CL_rc == -1) return -1;
+        CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_st_salts_buf,    CL_TRUE, 0, size_st_salts,           hashes->st_salts_buf,    0, NULL, NULL); if (CL_rc == -1) return -1;
+
+        if (size_esalts)
+        {
+          CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_st_esalts, NULL, &device_param->opencl_d_st_esalts_buf);
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_st_esalts_buf, CL_TRUE, 0, size_st_esalts, hashes->st_esalts_buf, 0, NULL, NULL);
+
+          if (CL_rc == -1) return -1;
+        }
+      }
+    }
+
+    /**
+     * kernel args
+     */
+
+    device_param->kernel_params_buf32[24] = bitmap_ctx->bitmap_mask;
+    device_param->kernel_params_buf32[25] = bitmap_ctx->bitmap_shift1;
+    device_param->kernel_params_buf32[26] = bitmap_ctx->bitmap_shift2;
+    device_param->kernel_params_buf32[27] = 0; // salt_pos
+    device_param->kernel_params_buf32[28] = 0; // loop_pos
+    device_param->kernel_params_buf32[29] = 0; // loop_cnt
+    device_param->kernel_params_buf32[30] = 0; // kernel_rules_cnt
+    device_param->kernel_params_buf32[31] = 0; // digests_cnt
+    device_param->kernel_params_buf32[32] = 0; // digests_offset
+    device_param->kernel_params_buf32[33] = 0; // combs_mode
+    device_param->kernel_params_buf64[34] = 0; // gid_max
+
+    if (device_param->is_cuda == true)
+    {
+      device_param->kernel_params[ 0] = NULL; // &device_param->cuda_d_pws_buf;
+      device_param->kernel_params[ 1] = &device_param->cuda_d_rules_c;
+      device_param->kernel_params[ 2] = &device_param->cuda_d_combs_c;
+      device_param->kernel_params[ 3] = &device_param->cuda_d_bfs_c;
+      device_param->kernel_params[ 4] = NULL; // &device_param->cuda_d_tmps;
+      device_param->kernel_params[ 5] = NULL; // &device_param->cuda_d_hooks;
+      device_param->kernel_params[ 6] = &device_param->cuda_d_bitmap_s1_a;
+      device_param->kernel_params[ 7] = &device_param->cuda_d_bitmap_s1_b;
+      device_param->kernel_params[ 8] = &device_param->cuda_d_bitmap_s1_c;
+      device_param->kernel_params[ 9] = &device_param->cuda_d_bitmap_s1_d;
+      device_param->kernel_params[10] = &device_param->cuda_d_bitmap_s2_a;
+      device_param->kernel_params[11] = &device_param->cuda_d_bitmap_s2_b;
+      device_param->kernel_params[12] = &device_param->cuda_d_bitmap_s2_c;
+      device_param->kernel_params[13] = &device_param->cuda_d_bitmap_s2_d;
+      device_param->kernel_params[14] = &device_param->cuda_d_plain_bufs;
+      device_param->kernel_params[15] = &device_param->cuda_d_digests_buf;
+      device_param->kernel_params[16] = &device_param->cuda_d_digests_shown;
+      device_param->kernel_params[17] = &device_param->cuda_d_salt_bufs;
+      device_param->kernel_params[18] = &device_param->cuda_d_esalt_bufs;
+      device_param->kernel_params[19] = &device_param->cuda_d_result;
+      device_param->kernel_params[20] = &device_param->cuda_d_extra0_buf;
+      device_param->kernel_params[21] = &device_param->cuda_d_extra1_buf;
+      device_param->kernel_params[22] = &device_param->cuda_d_extra2_buf;
+      device_param->kernel_params[23] = &device_param->cuda_d_extra3_buf;
+    }
+
+    if (device_param->is_opencl == true)
+    {
+      device_param->kernel_params[ 0] = NULL; // &device_param->opencl_d_pws_buf;
+      device_param->kernel_params[ 1] = &device_param->opencl_d_rules_c;
+      device_param->kernel_params[ 2] = &device_param->opencl_d_combs_c;
+      device_param->kernel_params[ 3] = &device_param->opencl_d_bfs_c;
+      device_param->kernel_params[ 4] = NULL; // &device_param->opencl_d_tmps;
+      device_param->kernel_params[ 5] = NULL; // &device_param->opencl_d_hooks;
+      device_param->kernel_params[ 6] = &device_param->opencl_d_bitmap_s1_a;
+      device_param->kernel_params[ 7] = &device_param->opencl_d_bitmap_s1_b;
+      device_param->kernel_params[ 8] = &device_param->opencl_d_bitmap_s1_c;
+      device_param->kernel_params[ 9] = &device_param->opencl_d_bitmap_s1_d;
+      device_param->kernel_params[10] = &device_param->opencl_d_bitmap_s2_a;
+      device_param->kernel_params[11] = &device_param->opencl_d_bitmap_s2_b;
+      device_param->kernel_params[12] = &device_param->opencl_d_bitmap_s2_c;
+      device_param->kernel_params[13] = &device_param->opencl_d_bitmap_s2_d;
+      device_param->kernel_params[14] = &device_param->opencl_d_plain_bufs;
+      device_param->kernel_params[15] = &device_param->opencl_d_digests_buf;
+      device_param->kernel_params[16] = &device_param->opencl_d_digests_shown;
+      device_param->kernel_params[17] = &device_param->opencl_d_salt_bufs;
+      device_param->kernel_params[18] = &device_param->opencl_d_esalt_bufs;
+      device_param->kernel_params[19] = &device_param->opencl_d_result;
+      device_param->kernel_params[20] = &device_param->opencl_d_extra0_buf;
+      device_param->kernel_params[21] = &device_param->opencl_d_extra1_buf;
+      device_param->kernel_params[22] = &device_param->opencl_d_extra2_buf;
+      device_param->kernel_params[23] = &device_param->opencl_d_extra3_buf;
+    }
+
+    device_param->kernel_params[24] = &device_param->kernel_params_buf32[24];
+    device_param->kernel_params[25] = &device_param->kernel_params_buf32[25];
+    device_param->kernel_params[26] = &device_param->kernel_params_buf32[26];
+    device_param->kernel_params[27] = &device_param->kernel_params_buf32[27];
+    device_param->kernel_params[28] = &device_param->kernel_params_buf32[28];
+    device_param->kernel_params[29] = &device_param->kernel_params_buf32[29];
+    device_param->kernel_params[30] = &device_param->kernel_params_buf32[30];
+    device_param->kernel_params[31] = &device_param->kernel_params_buf32[31];
+    device_param->kernel_params[32] = &device_param->kernel_params_buf32[32];
+    device_param->kernel_params[33] = &device_param->kernel_params_buf32[33];
+    device_param->kernel_params[34] = &device_param->kernel_params_buf64[34];
+
+    if (user_options->slow_candidates == true)
+    {
+    }
+    else
+    {
+      device_param->kernel_params_mp_buf64[3] = 0;
+      device_param->kernel_params_mp_buf32[4] = 0;
+      device_param->kernel_params_mp_buf32[5] = 0;
+      device_param->kernel_params_mp_buf32[6] = 0;
+      device_param->kernel_params_mp_buf32[7] = 0;
+      device_param->kernel_params_mp_buf64[8] = 0;
+
+      if (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL)
+      {
+        if (device_param->is_cuda == true)
+        {
+          device_param->kernel_params_mp[0] = &device_param->cuda_d_combs;
+        }
+
+        if (device_param->is_opencl == true)
+        {
+          device_param->kernel_params_mp[0] = &device_param->opencl_d_combs;
+        }
+      }
+      else
+      {
+        if (user_options->attack_mode == ATTACK_MODE_HYBRID1)
+        {
+          if (device_param->is_cuda == true)
+          {
+            device_param->kernel_params_mp[0] = &device_param->cuda_d_combs;
+          }
+
+          if (device_param->is_opencl == true)
+          {
+            device_param->kernel_params_mp[0] = &device_param->opencl_d_combs;
+          }
+        }
+        else
+        {
+          device_param->kernel_params_mp[0] = NULL; // (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
+                                                    // ? &device_param->opencl_d_pws_buf
+                                                    // : &device_param->opencl_d_pws_amp_buf;
+        }
+      }
+
+      if (device_param->is_cuda == true)
+      {
+        device_param->kernel_params_mp[1] = &device_param->cuda_d_root_css_buf;
+        device_param->kernel_params_mp[2] = &device_param->cuda_d_markov_css_buf;
+      }
+
+      if (device_param->is_opencl == true)
+      {
+        device_param->kernel_params_mp[1] = &device_param->opencl_d_root_css_buf;
+        device_param->kernel_params_mp[2] = &device_param->opencl_d_markov_css_buf;
+      }
+
+      device_param->kernel_params_mp[3] = &device_param->kernel_params_mp_buf64[3];
+      device_param->kernel_params_mp[4] = &device_param->kernel_params_mp_buf32[4];
+      device_param->kernel_params_mp[5] = &device_param->kernel_params_mp_buf32[5];
+      device_param->kernel_params_mp[6] = &device_param->kernel_params_mp_buf32[6];
+      device_param->kernel_params_mp[7] = &device_param->kernel_params_mp_buf32[7];
+      device_param->kernel_params_mp[8] = &device_param->kernel_params_mp_buf64[8];
+
+      device_param->kernel_params_mp_l_buf64[3] = 0;
+      device_param->kernel_params_mp_l_buf32[4] = 0;
+      device_param->kernel_params_mp_l_buf32[5] = 0;
+      device_param->kernel_params_mp_l_buf32[6] = 0;
+      device_param->kernel_params_mp_l_buf32[7] = 0;
+      device_param->kernel_params_mp_l_buf32[8] = 0;
+      device_param->kernel_params_mp_l_buf64[9] = 0;
+
+      device_param->kernel_params_mp_l[0] = NULL; // (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
+                                                  // ? &device_param->opencl_d_pws_buf
+                                                  // : &device_param->opencl_d_pws_amp_buf;
+      if (device_param->is_cuda == true)
+      {
+        device_param->kernel_params_mp_l[1] = &device_param->cuda_d_root_css_buf;
+        device_param->kernel_params_mp_l[2] = &device_param->cuda_d_markov_css_buf;
+      }
+
+      if (device_param->is_opencl == true)
+      {
+        device_param->kernel_params_mp_l[1] = &device_param->opencl_d_root_css_buf;
+        device_param->kernel_params_mp_l[2] = &device_param->opencl_d_markov_css_buf;
+      }
+
+      device_param->kernel_params_mp_l[3] = &device_param->kernel_params_mp_l_buf64[3];
+      device_param->kernel_params_mp_l[4] = &device_param->kernel_params_mp_l_buf32[4];
+      device_param->kernel_params_mp_l[5] = &device_param->kernel_params_mp_l_buf32[5];
+      device_param->kernel_params_mp_l[6] = &device_param->kernel_params_mp_l_buf32[6];
+      device_param->kernel_params_mp_l[7] = &device_param->kernel_params_mp_l_buf32[7];
+      device_param->kernel_params_mp_l[8] = &device_param->kernel_params_mp_l_buf32[8];
+      device_param->kernel_params_mp_l[9] = &device_param->kernel_params_mp_l_buf64[9];
+
+      device_param->kernel_params_mp_r_buf64[3] = 0;
+      device_param->kernel_params_mp_r_buf32[4] = 0;
+      device_param->kernel_params_mp_r_buf32[5] = 0;
+      device_param->kernel_params_mp_r_buf32[6] = 0;
+      device_param->kernel_params_mp_r_buf32[7] = 0;
+      device_param->kernel_params_mp_r_buf64[8] = 0;
+
+      if (device_param->is_cuda == true)
+      {
+        device_param->kernel_params_mp_r[0] = &device_param->cuda_d_bfs;
+        device_param->kernel_params_mp_r[1] = &device_param->cuda_d_root_css_buf;
+        device_param->kernel_params_mp_r[2] = &device_param->cuda_d_markov_css_buf;
+      }
+
+      if (device_param->is_opencl == true)
+      {
+        device_param->kernel_params_mp_r[0] = &device_param->opencl_d_bfs;
+        device_param->kernel_params_mp_r[1] = &device_param->opencl_d_root_css_buf;
+        device_param->kernel_params_mp_r[2] = &device_param->opencl_d_markov_css_buf;
+      }
+
+      device_param->kernel_params_mp_r[3] = &device_param->kernel_params_mp_r_buf64[3];
+      device_param->kernel_params_mp_r[4] = &device_param->kernel_params_mp_r_buf32[4];
+      device_param->kernel_params_mp_r[5] = &device_param->kernel_params_mp_r_buf32[5];
+      device_param->kernel_params_mp_r[6] = &device_param->kernel_params_mp_r_buf32[6];
+      device_param->kernel_params_mp_r[7] = &device_param->kernel_params_mp_r_buf32[7];
+      device_param->kernel_params_mp_r[8] = &device_param->kernel_params_mp_r_buf64[8];
+
+      device_param->kernel_params_amp_buf32[5] = 0; // combs_mode
+      device_param->kernel_params_amp_buf64[6] = 0; // gid_max
+
+      if (device_param->is_cuda == true)
+      {
+        device_param->kernel_params_amp[0] = NULL; // &device_param->cuda_d_pws_buf;
+        device_param->kernel_params_amp[1] = NULL; // &device_param->cuda_d_pws_amp_buf;
+        device_param->kernel_params_amp[2] = &device_param->cuda_d_rules_c;
+        device_param->kernel_params_amp[3] = &device_param->cuda_d_combs_c;
+        device_param->kernel_params_amp[4] = &device_param->cuda_d_bfs_c;
+      }
+
+      if (device_param->is_opencl == true)
+      {
+        device_param->kernel_params_amp[0] = NULL; // &device_param->opencl_d_pws_buf;
+        device_param->kernel_params_amp[1] = NULL; // &device_param->opencl_d_pws_amp_buf;
+        device_param->kernel_params_amp[2] = &device_param->opencl_d_rules_c;
+        device_param->kernel_params_amp[3] = &device_param->opencl_d_combs_c;
+        device_param->kernel_params_amp[4] = &device_param->opencl_d_bfs_c;
+      }
+
+      device_param->kernel_params_amp[5] = &device_param->kernel_params_amp_buf32[5];
+      device_param->kernel_params_amp[6] = &device_param->kernel_params_amp_buf64[6];
+
+      if (device_param->is_cuda == true)
+      {
+        device_param->kernel_params_tm[0] = &device_param->cuda_d_bfs_c;
+        device_param->kernel_params_tm[1] = &device_param->cuda_d_tm_c;
+      }
+
+      if (device_param->is_opencl == true)
+      {
+        device_param->kernel_params_tm[0] = &device_param->opencl_d_bfs_c;
+        device_param->kernel_params_tm[1] = &device_param->opencl_d_tm_c;
+      }
+    }
+
+    device_param->kernel_params_memset_buf32[1] = 0; // value
+    device_param->kernel_params_memset_buf64[2] = 0; // gid_max
+
+    device_param->kernel_params_memset[0] = NULL;
+    device_param->kernel_params_memset[1] = &device_param->kernel_params_memset_buf32[1];
+    device_param->kernel_params_memset[2] = &device_param->kernel_params_memset_buf64[2];
+
+    device_param->kernel_params_atinit_buf64[1] = 0; // gid_max
+
+    device_param->kernel_params_atinit[0] = NULL;
+    device_param->kernel_params_atinit[1] = &device_param->kernel_params_atinit_buf64[1];
+
+    device_param->kernel_params_decompress_buf64[3] = 0; // gid_max
+
+    if (device_param->is_cuda == true)
+    {
+      device_param->kernel_params_decompress[0] = NULL; // &device_param->cuda_d_pws_idx;
+      device_param->kernel_params_decompress[1] = NULL; // &device_param->cuda_d_pws_comp_buf;
+      device_param->kernel_params_decompress[2] = NULL; // (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
+                                                        // ? &device_param->cuda_d_pws_buf
+                                                        // : &device_param->cuda_d_pws_amp_buf;
+    }
+
+    if (device_param->is_opencl == true)
+    {
+      device_param->kernel_params_decompress[0] = NULL; // &device_param->opencl_d_pws_idx;
+      device_param->kernel_params_decompress[1] = NULL; // &device_param->opencl_d_pws_comp_buf;
+      device_param->kernel_params_decompress[2] = NULL; // (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
+                                                        // ? &device_param->opencl_d_pws_buf
+                                                        // : &device_param->opencl_d_pws_amp_buf;
+    }
+
+    device_param->kernel_params_decompress[3] = &device_param->kernel_params_decompress_buf64[3];
+
+    /**
+     * kernel name
+     */
+
+    if (device_param->is_cuda == true)
+    {
+      char kernel_name[64] = { 0 };
+
+      if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
+      {
+        if (hashconfig->opti_type & OPTI_TYPE_SINGLE_HASH)
+        {
+          if (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL)
+          {
+            // kernel1
+
+            snprintf (kernel_name, sizeof (kernel_name), "m%05u_s%02d", kern_type, 4);
+
+            CL_rc = hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function1, device_param->cuda_module, kernel_name);
+
+            if (CL_rc == -1) return -1;
+
+            CL_rc = get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function1, &device_param->kernel_wgs1);
+
+            if (CL_rc == -1) return -1;
+
+            CL_rc = get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function1, &device_param->kernel_local_mem_size1);
+
+            if (CL_rc == -1) return -1;
+
+            device_param->kernel_preferred_wgs_multiple1 = device_param->cuda_warp_size;
+
+            if (CL_rc == -1) return -1;
+
+            // kernel2
+
+            snprintf (kernel_name, sizeof (kernel_name), "m%05u_s%02d", kern_type, 8);
+
+            CL_rc = hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function2, device_param->cuda_module, kernel_name);
+
+            if (CL_rc == -1) return -1;
+
+            CL_rc = get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function2, &device_param->kernel_wgs2);
+
+            if (CL_rc == -1) return -1;
+
+            CL_rc = get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function2, &device_param->kernel_local_mem_size2);
+
+            if (CL_rc == -1) return -1;
+
+            device_param->kernel_preferred_wgs_multiple2 = device_param->cuda_warp_size;
+
+            if (CL_rc == -1) return -1;
+
+            // kernel3
+
+            snprintf (kernel_name, sizeof (kernel_name), "m%05u_s%02d", kern_type, 16);
+
+            CL_rc = hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function3, device_param->cuda_module, kernel_name);
+
+            if (CL_rc == -1) return -1;
+
+            CL_rc = get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function3, &device_param->kernel_wgs3);
+
+            if (CL_rc == -1) return -1;
+
+            CL_rc = get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function3, &device_param->kernel_local_mem_size3);
+
+            if (CL_rc == -1) return -1;
+
+            device_param->kernel_preferred_wgs_multiple3 = device_param->cuda_warp_size;
+
+            if (CL_rc == -1) return -1;
+          }
+          else
+          {
+            snprintf (kernel_name, sizeof (kernel_name), "m%05u_sxx", kern_type);
+
+            CL_rc = hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function4, device_param->cuda_module, kernel_name);
+
+            if (CL_rc == -1) return -1;
+
+            CL_rc = get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function4, &device_param->kernel_wgs4);
+
+            if (CL_rc == -1) return -1;
+
+            CL_rc = get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function4, &device_param->kernel_local_mem_size4);
+
+            if (CL_rc == -1) return -1;
+
+            device_param->kernel_preferred_wgs_multiple4 = device_param->cuda_warp_size;
+
+            if (CL_rc == -1) return -1;
+          }
+        }
+        else
+        {
+          if (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL)
+          {
+            // kernel1
+
+            snprintf (kernel_name, sizeof (kernel_name), "m%05u_m%02d", kern_type, 4);
+
+            CL_rc = hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function1, device_param->cuda_module, kernel_name);
+
+            if (CL_rc == -1) return -1;
+
+            CL_rc = get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function1, &device_param->kernel_wgs1);
+
+            if (CL_rc == -1) return -1;
+
+            CL_rc = get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function1, &device_param->kernel_local_mem_size1);
+
+            if (CL_rc == -1) return -1;
+
+            device_param->kernel_preferred_wgs_multiple1 = device_param->cuda_warp_size;
+
+            if (CL_rc == -1) return -1;
+
+            // kernel2
+
+            snprintf (kernel_name, sizeof (kernel_name), "m%05u_m%02d", kern_type, 8);
+
+            CL_rc = hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function2, device_param->cuda_module, kernel_name);
+
+            if (CL_rc == -1) return -1;
+
+            CL_rc = get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function2, &device_param->kernel_wgs2);
+
+            if (CL_rc == -1) return -1;
+
+            CL_rc = get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function2, &device_param->kernel_local_mem_size2);
+
+            if (CL_rc == -1) return -1;
+
+            device_param->kernel_preferred_wgs_multiple2 = device_param->cuda_warp_size;
+
+            if (CL_rc == -1) return -1;
+
+            // kernel3
+
+            snprintf (kernel_name, sizeof (kernel_name), "m%05u_m%02d", kern_type, 16);
+
+            CL_rc = hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function3, device_param->cuda_module, kernel_name);
+
+            if (CL_rc == -1) return -1;
+
+            CL_rc = get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function3, &device_param->kernel_wgs3);
+
+            if (CL_rc == -1) return -1;
+
+            CL_rc = get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function3, &device_param->kernel_local_mem_size3);
+
+            if (CL_rc == -1) return -1;
+
+            device_param->kernel_preferred_wgs_multiple3 = device_param->cuda_warp_size;
+
+            if (CL_rc == -1) return -1;
+          }
+          else
+          {
+            snprintf (kernel_name, sizeof (kernel_name), "m%05u_mxx", kern_type);
+
+            CL_rc = hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function4, device_param->cuda_module, kernel_name);
+
+            if (CL_rc == -1) return -1;
+
+            CL_rc = get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function4, &device_param->kernel_wgs4);
+
+            if (CL_rc == -1) return -1;
+
+            CL_rc = get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function4, &device_param->kernel_local_mem_size4);
+
+            if (CL_rc == -1) return -1;
+
+            device_param->kernel_preferred_wgs_multiple4 = device_param->cuda_warp_size;
+
+            if (CL_rc == -1) return -1;
+          }
+        }
+
+        if (user_options->slow_candidates == true)
+        {
+        }
+        else
+        {
+          if (user_options->attack_mode == ATTACK_MODE_BF)
+          {
+            if (hashconfig->opts_type & OPTS_TYPE_PT_BITSLICE)
+            {
+              snprintf (kernel_name, sizeof (kernel_name), "m%05u_tm", kern_type);
+
+              CL_rc = hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function_tm, device_param->cuda_module, kernel_name);
+
+              if (CL_rc == -1) return -1;
+
+              CL_rc = get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function_tm, &device_param->kernel_wgs_tm);
+
+              if (CL_rc == -1) return -1;
+
+              CL_rc = get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function_tm, &device_param->kernel_local_mem_size_tm);
+
+              if (CL_rc == -1) return -1;
+
+              device_param->kernel_preferred_wgs_multiple_tm = device_param->cuda_warp_size;
+
+              if (CL_rc == -1) return -1;
+            }
+          }
+        }
+      }
+      else
+      {
+        // kernel1
+
+        snprintf (kernel_name, sizeof (kernel_name), "m%05u_init", kern_type);
+
+        CL_rc = hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function1, device_param->cuda_module, kernel_name);
+
+        if (CL_rc == -1) return -1;
+
+        CL_rc = get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function1, &device_param->kernel_wgs1);
+
+        if (CL_rc == -1) return -1;
+
+        CL_rc = get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function1, &device_param->kernel_local_mem_size1);
+
+        if (CL_rc == -1) return -1;
+
+        device_param->kernel_preferred_wgs_multiple1 = device_param->cuda_warp_size;
+
+        if (CL_rc == -1) return -1;
+
+        // kernel2
+
+        snprintf (kernel_name, sizeof (kernel_name), "m%05u_loop", kern_type);
+
+        CL_rc = hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function2, device_param->cuda_module, kernel_name);
+
+        if (CL_rc == -1) return -1;
+
+        CL_rc = get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function2, &device_param->kernel_wgs2);
+
+        if (CL_rc == -1) return -1;
+
+        CL_rc = get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function2, &device_param->kernel_local_mem_size2);
+
+        if (CL_rc == -1) return -1;
+
+        device_param->kernel_preferred_wgs_multiple2 = device_param->cuda_warp_size;
+
+        if (CL_rc == -1) return -1;
+
+        // kernel3
+
+        snprintf (kernel_name, sizeof (kernel_name), "m%05u_comp", kern_type);
+
+        CL_rc = hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function3, device_param->cuda_module, kernel_name);
+
+        if (CL_rc == -1) return -1;
+
+        CL_rc = get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function3, &device_param->kernel_wgs3);
+
+        if (CL_rc == -1) return -1;
+
+        CL_rc = get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function3, &device_param->kernel_local_mem_size3);
+
+        if (CL_rc == -1) return -1;
+
+        device_param->kernel_preferred_wgs_multiple3 = device_param->cuda_warp_size;
+
+        if (CL_rc == -1) return -1;
+
+        // kernel12
+
+        if (hashconfig->opts_type & OPTS_TYPE_HOOK12)
+        {
+          snprintf (kernel_name, sizeof (kernel_name), "m%05u_hook12", kern_type);
+
+          CL_rc = hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function12, device_param->cuda_module, kernel_name);
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function12, &device_param->kernel_wgs12);
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function12, &device_param->kernel_local_mem_size12);
+
+          if (CL_rc == -1) return -1;
+
+          device_param->kernel_preferred_wgs_multiple12 = device_param->cuda_warp_size;
+
+          if (CL_rc == -1) return -1;
+        }
+
+        // kernel23
+
+        if (hashconfig->opts_type & OPTS_TYPE_HOOK23)
+        {
+          snprintf (kernel_name, sizeof (kernel_name), "m%05u_hook23", kern_type);
+
+          CL_rc = hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function23, device_param->cuda_module, kernel_name);
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function23, &device_param->kernel_wgs23);
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function23, &device_param->kernel_local_mem_size23);
+
+          if (CL_rc == -1) return -1;
+
+          device_param->kernel_preferred_wgs_multiple23 = device_param->cuda_warp_size;
+
+          if (CL_rc == -1) return -1;
+        }
+
+        // init2
+
+        if (hashconfig->opts_type & OPTS_TYPE_INIT2)
+        {
+          snprintf (kernel_name, sizeof (kernel_name), "m%05u_init2", kern_type);
+
+          CL_rc = hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function_init2, device_param->cuda_module, kernel_name);
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function_init2, &device_param->kernel_wgs_init2);
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function_init2, &device_param->kernel_local_mem_size_init2);
+
+          if (CL_rc == -1) return -1;
+
+          device_param->kernel_preferred_wgs_multiple_init2 = device_param->cuda_warp_size;
+
+          if (CL_rc == -1) return -1;
+        }
+
+        // loop2
+
+        if (hashconfig->opts_type & OPTS_TYPE_LOOP2)
+        {
+          snprintf (kernel_name, sizeof (kernel_name), "m%05u_loop2", kern_type);
+
+          CL_rc = hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function_loop2, device_param->cuda_module, kernel_name);
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function_loop2, &device_param->kernel_wgs_loop2);
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function_loop2, &device_param->kernel_local_mem_size_loop2);
+
+          if (CL_rc == -1) return -1;
+
+          device_param->kernel_preferred_wgs_multiple_loop2 = device_param->cuda_warp_size;
+
+          if (CL_rc == -1) return -1;
+        }
+
+        // aux1
+
+        if (hashconfig->opts_type & OPTS_TYPE_AUX1)
+        {
+          snprintf (kernel_name, sizeof (kernel_name), "m%05u_aux1", kern_type);
+
+          CL_rc = hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function_aux1, device_param->cuda_module, kernel_name);
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function_aux1, &device_param->kernel_wgs_aux1);
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function_aux1, &device_param->kernel_local_mem_size_aux1);
+
+          if (CL_rc == -1) return -1;
+
+          device_param->kernel_preferred_wgs_multiple_aux1 = device_param->cuda_warp_size;
+
+          if (CL_rc == -1) return -1;
+        }
+
+        // aux2
+
+        if (hashconfig->opts_type & OPTS_TYPE_AUX2)
+        {
+          snprintf (kernel_name, sizeof (kernel_name), "m%05u_aux2", kern_type);
+
+          CL_rc = hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function_aux2, device_param->cuda_module, kernel_name);
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function_aux2, &device_param->kernel_wgs_aux2);
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function_aux2, &device_param->kernel_local_mem_size_aux2);
+
+          if (CL_rc == -1) return -1;
+
+          device_param->kernel_preferred_wgs_multiple_aux2 = device_param->cuda_warp_size;
+
+          if (CL_rc == -1) return -1;
+        }
+
+        // aux3
+
+        if (hashconfig->opts_type & OPTS_TYPE_AUX3)
+        {
+          snprintf (kernel_name, sizeof (kernel_name), "m%05u_aux3", kern_type);
+
+          CL_rc = hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function_aux3, device_param->cuda_module, kernel_name);
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function_aux3, &device_param->kernel_wgs_aux3);
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function_aux3, &device_param->kernel_local_mem_size_aux3);
+
+          if (CL_rc == -1) return -1;
+
+          device_param->kernel_preferred_wgs_multiple_aux3 = device_param->cuda_warp_size;
+
+          if (CL_rc == -1) return -1;
+        }
+
+        // aux4
+
+        if (hashconfig->opts_type & OPTS_TYPE_AUX4)
+        {
+          snprintf (kernel_name, sizeof (kernel_name), "m%05u_aux4", kern_type);
+
+          CL_rc = hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function_aux4, device_param->cuda_module, kernel_name);
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function_aux4, &device_param->kernel_wgs_aux4);
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function_aux4, &device_param->kernel_local_mem_size_aux4);
+
+          if (CL_rc == -1) return -1;
+
+          device_param->kernel_preferred_wgs_multiple_aux4 = device_param->cuda_warp_size;
+
+          if (CL_rc == -1) return -1;
+        }
+      }
+
+      // GPU memset
+
+      CL_rc = hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function_memset, device_param->cuda_module, "gpu_memset");
+
+      if (CL_rc == -1) return -1;
+
+      CL_rc = get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function_memset, &device_param->kernel_wgs_memset);
+
+      if (CL_rc == -1) return -1;
+
+      CL_rc = get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function_memset, &device_param->kernel_local_mem_size_memset);
+
+      if (CL_rc == -1) return -1;
+
+      device_param->kernel_preferred_wgs_multiple_memset = device_param->cuda_warp_size;
+
+      if (CL_rc == -1) return -1;
+
+      //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_memset, 0, sizeof (cl_mem),   device_param->kernel_params_memset[0]); if (CL_rc == -1) return -1;
+      //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_memset, 1, sizeof (cl_uint),  device_param->kernel_params_memset[1]); if (CL_rc == -1) return -1;
+      //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_memset, 2, sizeof (cl_ulong), device_param->kernel_params_memset[2]); if (CL_rc == -1) return -1;
+
+      // GPU autotune init
+
+      CL_rc = hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function_atinit, device_param->cuda_module, "gpu_atinit");
+
+      if (CL_rc == -1) return -1;
+
+      CL_rc = get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function_atinit, &device_param->kernel_wgs_atinit);
+
+      if (CL_rc == -1) return -1;
+
+      CL_rc = get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function_atinit, &device_param->kernel_local_mem_size_atinit);
+
+      if (CL_rc == -1) return -1;
+
+      device_param->kernel_preferred_wgs_multiple_atinit = device_param->cuda_warp_size;
+
+      if (CL_rc == -1) return -1;
+
+      // CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_atinit, 0, sizeof (cl_mem),   device_param->kernel_params_atinit[0]); if (CL_rc == -1) return -1;
+      // CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_atinit, 1, sizeof (cl_ulong), device_param->kernel_params_atinit[1]); if (CL_rc == -1) return -1;
+
+      // GPU decompress
+
+      CL_rc = hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function_decompress, device_param->cuda_module, "gpu_decompress");
+
+      if (CL_rc == -1) return -1;
+
+      CL_rc = get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function_decompress, &device_param->kernel_wgs_decompress);
+
+      if (CL_rc == -1) return -1;
+
+      CL_rc = get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function_decompress, &device_param->kernel_local_mem_size_decompress);
+
+      if (CL_rc == -1) return -1;
+
+      device_param->kernel_preferred_wgs_multiple_decompress = device_param->cuda_warp_size;
+
+      if (CL_rc == -1) return -1;
+
+      //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_decompress, 0, sizeof (cl_mem),   device_param->kernel_params_decompress[0]); if (CL_rc == -1) return -1;
+      //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_decompress, 1, sizeof (cl_mem),   device_param->kernel_params_decompress[1]); if (CL_rc == -1) return -1;
+      //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_decompress, 2, sizeof (cl_mem),   device_param->kernel_params_decompress[2]); if (CL_rc == -1) return -1;
+      //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_decompress, 3, sizeof (cl_ulong), device_param->kernel_params_decompress[3]); if (CL_rc == -1) return -1;
+
+      // MP start
+
+      if (user_options->slow_candidates == true)
+      {
+      }
+      else
+      {
+        if (user_options->attack_mode == ATTACK_MODE_BF)
+        {
+          // mp_l
+
+          CL_rc = hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function_mp_l, device_param->cuda_module_mp, "l_markov");
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function_mp_l, &device_param->kernel_wgs_mp_l);
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function_mp_l, &device_param->kernel_local_mem_size_mp_l);
+
+          if (CL_rc == -1) return -1;
+
+          device_param->kernel_preferred_wgs_multiple_mp_l = device_param->cuda_warp_size;
+
+          if (CL_rc == -1) return -1;
+
+          // mp_r
+
+          CL_rc = hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function_mp_r, device_param->cuda_module_mp, "r_markov");
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function_mp_r, &device_param->kernel_wgs_mp_r);
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function_mp_r, &device_param->kernel_local_mem_size_mp_r);
+
+          if (CL_rc == -1) return -1;
+
+          device_param->kernel_preferred_wgs_multiple_mp_r = device_param->cuda_warp_size;
+
+          if (CL_rc == -1) return -1;
+
+          if (hashconfig->opts_type & OPTS_TYPE_PT_BITSLICE)
+          {
+            //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_tm, 0, sizeof (cl_mem), device_param->kernel_params_tm[0]); if (CL_rc == -1) return -1;
+            //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_tm, 1, sizeof (cl_mem), device_param->kernel_params_tm[1]); if (CL_rc == -1) return -1;
+          }
+        }
+        else if (user_options->attack_mode == ATTACK_MODE_HYBRID1)
+        {
+          CL_rc = hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function_mp, device_param->cuda_module_mp, "C_markov");
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function_mp, &device_param->kernel_wgs_mp);
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function_mp, &device_param->kernel_local_mem_size_mp);
+
+          if (CL_rc == -1) return -1;
+
+          device_param->kernel_preferred_wgs_multiple_mp = device_param->cuda_warp_size;
+
+          if (CL_rc == -1) return -1;
+        }
+        else if (user_options->attack_mode == ATTACK_MODE_HYBRID2)
+        {
+          CL_rc = hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function_mp, device_param->cuda_module_mp, "C_markov");
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function_mp, &device_param->kernel_wgs_mp);
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function_mp, &device_param->kernel_local_mem_size_mp);
+
+          if (CL_rc == -1) return -1;
+
+          device_param->kernel_preferred_wgs_multiple_mp = device_param->cuda_warp_size;
+
+          if (CL_rc == -1) return -1;
+        }
+      }
+
+      if (user_options->slow_candidates == true)
+      {
+      }
+      else
+      {
+        if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
+        {
+          // nothing to do
+        }
+        else
+        {
+          CL_rc = hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function_amp, device_param->cuda_module_amp, "amp");
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function_amp, &device_param->kernel_wgs_amp);
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function_amp, &device_param->kernel_local_mem_size_amp);
+
+          if (CL_rc == -1) return -1;
+
+          device_param->kernel_preferred_wgs_multiple_amp = device_param->cuda_warp_size;
+
+          if (CL_rc == -1) return -1;
+        }
+
+        if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
+        {
+          // nothing to do
+        }
+        else
+        {
+          for (u32 i = 0; i < 5; i++)
+          {
+            //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_amp, i, sizeof (cl_mem), device_param->kernel_params_amp[i]);
+
+            //if (CL_rc == -1) return -1;
+          }
+
+          for (u32 i = 5; i < 6; i++)
+          {
+            //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_amp, i, sizeof (cl_uint), device_param->kernel_params_amp[i]);
+
+            //if (CL_rc == -1) return -1;
+          }
+
+          for (u32 i = 6; i < 7; i++)
+          {
+            //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_amp, i, sizeof (cl_ulong), device_param->kernel_params_amp[i]);
+
+            //if (CL_rc == -1) return -1;
+          }
+        }
+      }
+
+      // zero some data buffers
+
+      CU_rc = run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_plain_bufs,    device_param->size_plains);   if (CU_rc == -1) return -1;
+      CU_rc = run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_digests_shown, device_param->size_shown);    if (CU_rc == -1) return -1;
+      CU_rc = run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_result,        device_param->size_results);  if (CU_rc == -1) return -1;
+
+      /**
+       * special buffers
+       */
+
+      if (user_options->slow_candidates == true)
+      {
+        CU_rc = run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_rules_c, size_rules_c); if (CU_rc == -1) return -1;
+      }
+      else
+      {
+        if (user_options_extra->attack_kern == ATTACK_KERN_STRAIGHT)
+        {
+          CU_rc = run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_rules_c, size_rules_c); if (CU_rc == -1) return -1;
+        }
+        else if (user_options_extra->attack_kern == ATTACK_KERN_COMBI)
+        {
+          CU_rc = run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_combs,          size_combs);       if (CU_rc == -1) return -1;
+          CU_rc = run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_combs_c,        size_combs);       if (CU_rc == -1) return -1;
+          CU_rc = run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_root_css_buf,   size_root_css);    if (CU_rc == -1) return -1;
+          CU_rc = run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_markov_css_buf, size_markov_css);  if (CU_rc == -1) return -1;
+        }
+        else if (user_options_extra->attack_kern == ATTACK_KERN_BF)
+        {
+          CU_rc = run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_bfs,            size_bfs);         if (CU_rc == -1) return -1;
+          CU_rc = run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_bfs_c,          size_bfs);         if (CU_rc == -1) return -1;
+          CU_rc = run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_tm_c,           size_tm);          if (CU_rc == -1) return -1;
+          CU_rc = run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_root_css_buf,   size_root_css);    if (CU_rc == -1) return -1;
+          CU_rc = run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_markov_css_buf, size_markov_css);  if (CU_rc == -1) return -1;
+        }
+      }
+
+      if (user_options->slow_candidates == true)
+      {
+      }
+      else
+      {
+        if ((user_options->attack_mode == ATTACK_MODE_HYBRID1) || (user_options->attack_mode == ATTACK_MODE_HYBRID2))
+        {
+          /**
+           * prepare mp
+           */
+
+          if (user_options->attack_mode == ATTACK_MODE_HYBRID1)
+          {
+            device_param->kernel_params_mp_buf32[5] = 0;
+            device_param->kernel_params_mp_buf32[6] = 0;
+            device_param->kernel_params_mp_buf32[7] = 0;
+
+            if (hashconfig->opts_type & OPTS_TYPE_PT_ADD01)     device_param->kernel_params_mp_buf32[5] = full01;
+            if (hashconfig->opts_type & OPTS_TYPE_PT_ADD06)     device_param->kernel_params_mp_buf32[5] = full06;
+            if (hashconfig->opts_type & OPTS_TYPE_PT_ADD80)     device_param->kernel_params_mp_buf32[5] = full80;
+            if (hashconfig->opts_type & OPTS_TYPE_PT_ADDBITS14) device_param->kernel_params_mp_buf32[6] = 1;
+            if (hashconfig->opts_type & OPTS_TYPE_PT_ADDBITS15) device_param->kernel_params_mp_buf32[7] = 1;
+          }
+          else if (user_options->attack_mode == ATTACK_MODE_HYBRID2)
+          {
+            device_param->kernel_params_mp_buf32[5] = 0;
+            device_param->kernel_params_mp_buf32[6] = 0;
+            device_param->kernel_params_mp_buf32[7] = 0;
+          }
+
+          //for (u32 i = 0; i < 3; i++) { CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_mp, i, sizeof (cl_mem), device_param->kernel_params_mp[i]); if (CL_rc == -1) return -1; }
+        }
+        else if (user_options->attack_mode == ATTACK_MODE_BF)
+        {
+          /**
+           * prepare mp_r and mp_l
+           */
+
+          device_param->kernel_params_mp_l_buf32[6] = 0;
+          device_param->kernel_params_mp_l_buf32[7] = 0;
+          device_param->kernel_params_mp_l_buf32[8] = 0;
+
+          if (hashconfig->opts_type & OPTS_TYPE_PT_ADD01)     device_param->kernel_params_mp_l_buf32[6] = full01;
+          if (hashconfig->opts_type & OPTS_TYPE_PT_ADD06)     device_param->kernel_params_mp_l_buf32[6] = full06;
+          if (hashconfig->opts_type & OPTS_TYPE_PT_ADD80)     device_param->kernel_params_mp_l_buf32[6] = full80;
+          if (hashconfig->opts_type & OPTS_TYPE_PT_ADDBITS14) device_param->kernel_params_mp_l_buf32[7] = 1;
+          if (hashconfig->opts_type & OPTS_TYPE_PT_ADDBITS15) device_param->kernel_params_mp_l_buf32[8] = 1;
+
+          //for (u32 i = 0; i < 3; i++) { CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_mp_l, i, sizeof (cl_mem), device_param->kernel_params_mp_l[i]); if (CL_rc == -1) return -1; }
+          //for (u32 i = 0; i < 3; i++) { CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_mp_r, i, sizeof (cl_mem), device_param->kernel_params_mp_r[i]); if (CL_rc == -1) return -1; }
+        }
+      }
+    }
+
+    if (device_param->is_opencl == true)
+    {
+      char kernel_name[64] = { 0 };
+
+      if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
+      {
+        if (hashconfig->opti_type & OPTI_TYPE_SINGLE_HASH)
+        {
+          if (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL)
+          {
+            // kernel1
+
+            snprintf (kernel_name, sizeof (kernel_name), "m%05u_s%02d", kern_type, 4);
+
+            CL_rc = hc_clCreateKernel (hashcat_ctx, device_param->opencl_program, kernel_name, &device_param->opencl_kernel1);
+
+            if (CL_rc == -1) return -1;
+
+            CL_rc = get_opencl_kernel_wgs (hashcat_ctx, device_param, device_param->opencl_kernel1, &device_param->kernel_wgs1);
+
+            if (CL_rc == -1) return -1;
+
+            CL_rc = get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel1, &device_param->kernel_local_mem_size1);
+
+            if (CL_rc == -1) return -1;
+
+            CL_rc = get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel1, &device_param->kernel_preferred_wgs_multiple1);
+
+            if (CL_rc == -1) return -1;
+
+            // kernel2
+
+            snprintf (kernel_name, sizeof (kernel_name), "m%05u_s%02d", kern_type, 8);
+
+            CL_rc = hc_clCreateKernel (hashcat_ctx, device_param->opencl_program, kernel_name, &device_param->opencl_kernel2);
+
+            if (CL_rc == -1) return -1;
+
+            CL_rc = get_opencl_kernel_wgs (hashcat_ctx, device_param, device_param->opencl_kernel2, &device_param->kernel_wgs2);
+
+            if (CL_rc == -1) return -1;
+
+            CL_rc = get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel2, &device_param->kernel_local_mem_size2);
+
+            if (CL_rc == -1) return -1;
+
+            CL_rc = get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel2, &device_param->kernel_preferred_wgs_multiple2);
+
+            if (CL_rc == -1) return -1;
+
+            // kernel3
+
+            snprintf (kernel_name, sizeof (kernel_name), "m%05u_s%02d", kern_type, 16);
+
+            CL_rc = hc_clCreateKernel (hashcat_ctx, device_param->opencl_program, kernel_name, &device_param->opencl_kernel3);
+
+            if (CL_rc == -1) return -1;
+
+            CL_rc = get_opencl_kernel_wgs (hashcat_ctx, device_param, device_param->opencl_kernel3, &device_param->kernel_wgs3);
+
+            if (CL_rc == -1) return -1;
+
+            CL_rc = get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel3, &device_param->kernel_local_mem_size3);
+
+            if (CL_rc == -1) return -1;
+
+            CL_rc = get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel3, &device_param->kernel_preferred_wgs_multiple3);
+
+            if (CL_rc == -1) return -1;
+          }
+          else
+          {
+            snprintf (kernel_name, sizeof (kernel_name), "m%05u_sxx", kern_type);
+
+            CL_rc = hc_clCreateKernel (hashcat_ctx, device_param->opencl_program, kernel_name, &device_param->opencl_kernel4);
+
+            if (CL_rc == -1) return -1;
+
+            CL_rc = get_opencl_kernel_wgs (hashcat_ctx, device_param, device_param->opencl_kernel4, &device_param->kernel_wgs4);
+
+            if (CL_rc == -1) return -1;
+
+            CL_rc = get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel4, &device_param->kernel_local_mem_size4);
+
+            if (CL_rc == -1) return -1;
+
+            CL_rc = get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel4, &device_param->kernel_preferred_wgs_multiple4);
+
+            if (CL_rc == -1) return -1;
+          }
+        }
+        else
+        {
+          if (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL)
+          {
+            // kernel1
+
+            snprintf (kernel_name, sizeof (kernel_name), "m%05u_m%02d", kern_type, 4);
+
+            CL_rc = hc_clCreateKernel (hashcat_ctx, device_param->opencl_program, kernel_name, &device_param->opencl_kernel1);
+
+            if (CL_rc == -1) return -1;
+
+            CL_rc = get_opencl_kernel_wgs (hashcat_ctx, device_param, device_param->opencl_kernel1, &device_param->kernel_wgs1);
+
+            if (CL_rc == -1) return -1;
+
+            CL_rc = get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel1, &device_param->kernel_local_mem_size1);
+
+            if (CL_rc == -1) return -1;
+
+            CL_rc = get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel1, &device_param->kernel_preferred_wgs_multiple1);
+
+            if (CL_rc == -1) return -1;
+
+            // kernel2
+
+            snprintf (kernel_name, sizeof (kernel_name), "m%05u_m%02d", kern_type, 8);
+
+            CL_rc = hc_clCreateKernel (hashcat_ctx, device_param->opencl_program, kernel_name, &device_param->opencl_kernel2);
+
+            if (CL_rc == -1) return -1;
+
+            CL_rc = get_opencl_kernel_wgs (hashcat_ctx, device_param, device_param->opencl_kernel2, &device_param->kernel_wgs2);
+
+            if (CL_rc == -1) return -1;
+
+            CL_rc = get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel2, &device_param->kernel_local_mem_size2);
+
+            if (CL_rc == -1) return -1;
+
+            CL_rc = get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel2, &device_param->kernel_preferred_wgs_multiple2);
+
+            if (CL_rc == -1) return -1;
+
+            // kernel3
+
+            snprintf (kernel_name, sizeof (kernel_name), "m%05u_m%02d", kern_type, 16);
+
+            CL_rc = hc_clCreateKernel (hashcat_ctx, device_param->opencl_program, kernel_name, &device_param->opencl_kernel3);
+
+            if (CL_rc == -1) return -1;
+
+            CL_rc = get_opencl_kernel_wgs (hashcat_ctx, device_param, device_param->opencl_kernel3, &device_param->kernel_wgs3);
+
+            if (CL_rc == -1) return -1;
+
+            CL_rc = get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel3, &device_param->kernel_local_mem_size3);
+
+            if (CL_rc == -1) return -1;
+
+            CL_rc = get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel3, &device_param->kernel_preferred_wgs_multiple3);
+
+            if (CL_rc == -1) return -1;
+          }
+          else
+          {
+            snprintf (kernel_name, sizeof (kernel_name), "m%05u_mxx", kern_type);
+
+            CL_rc = hc_clCreateKernel (hashcat_ctx, device_param->opencl_program, kernel_name, &device_param->opencl_kernel4);
+
+            if (CL_rc == -1) return -1;
+
+            CL_rc = get_opencl_kernel_wgs (hashcat_ctx, device_param, device_param->opencl_kernel4, &device_param->kernel_wgs4);
+
+            if (CL_rc == -1) return -1;
+
+            CL_rc = get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel4, &device_param->kernel_local_mem_size4);
+
+            if (CL_rc == -1) return -1;
+
+            CL_rc = get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel4, &device_param->kernel_preferred_wgs_multiple4);
+
+            if (CL_rc == -1) return -1;
+          }
+        }
+
+        if (user_options->slow_candidates == true)
+        {
+        }
+        else
+        {
+          if (user_options->attack_mode == ATTACK_MODE_BF)
+          {
+            if (hashconfig->opts_type & OPTS_TYPE_PT_BITSLICE)
+            {
+              snprintf (kernel_name, sizeof (kernel_name), "m%05u_tm", kern_type);
+
+              CL_rc = hc_clCreateKernel (hashcat_ctx, device_param->opencl_program, kernel_name, &device_param->opencl_kernel_tm);
+
+              if (CL_rc == -1) return -1;
+
+              CL_rc = get_opencl_kernel_wgs (hashcat_ctx, device_param, device_param->opencl_kernel_tm, &device_param->kernel_wgs_tm);
+
+              if (CL_rc == -1) return -1;
+
+              CL_rc = get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_tm, &device_param->kernel_local_mem_size_tm);
+
+              if (CL_rc == -1) return -1;
+
+              CL_rc = get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel_tm, &device_param->kernel_preferred_wgs_multiple_tm);
+
+              if (CL_rc == -1) return -1;
+            }
+          }
+        }
+      }
+      else
+      {
+        // kernel1
+
+        snprintf (kernel_name, sizeof (kernel_name), "m%05u_init", kern_type);
+
+        CL_rc = hc_clCreateKernel (hashcat_ctx, device_param->opencl_program, kernel_name, &device_param->opencl_kernel1);
+
+        if (CL_rc == -1) return -1;
+
+        CL_rc = get_opencl_kernel_wgs (hashcat_ctx, device_param, device_param->opencl_kernel1, &device_param->kernel_wgs1);
+
+        if (CL_rc == -1) return -1;
+
+        CL_rc = get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel1, &device_param->kernel_local_mem_size1);
+
+        if (CL_rc == -1) return -1;
+
+        CL_rc = get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel1, &device_param->kernel_preferred_wgs_multiple1);
+
+        if (CL_rc == -1) return -1;
+
+        // kernel2
+
+        snprintf (kernel_name, sizeof (kernel_name), "m%05u_loop", kern_type);
+
+        CL_rc = hc_clCreateKernel (hashcat_ctx, device_param->opencl_program, kernel_name, &device_param->opencl_kernel2);
+
+        if (CL_rc == -1) return -1;
+
+        CL_rc = get_opencl_kernel_wgs (hashcat_ctx, device_param, device_param->opencl_kernel2, &device_param->kernel_wgs2);
+
+        if (CL_rc == -1) return -1;
+
+        CL_rc = get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel2, &device_param->kernel_local_mem_size2);
+
+        if (CL_rc == -1) return -1;
+
+        CL_rc = get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel2, &device_param->kernel_preferred_wgs_multiple2);
+
+        if (CL_rc == -1) return -1;
+
+        // kernel3
+
+        snprintf (kernel_name, sizeof (kernel_name), "m%05u_comp", kern_type);
+
+        CL_rc = hc_clCreateKernel (hashcat_ctx, device_param->opencl_program, kernel_name, &device_param->opencl_kernel3);
+
+        if (CL_rc == -1) return -1;
+
+        CL_rc = get_opencl_kernel_wgs (hashcat_ctx, device_param, device_param->opencl_kernel3, &device_param->kernel_wgs3);
+
+        if (CL_rc == -1) return -1;
+
+        CL_rc = get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel3, &device_param->kernel_local_mem_size3);
+
+        if (CL_rc == -1) return -1;
+
+        CL_rc = get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel3, &device_param->kernel_preferred_wgs_multiple3);
+
+        if (CL_rc == -1) return -1;
+
+        // kernel12
+
+        if (hashconfig->opts_type & OPTS_TYPE_HOOK12)
+        {
+          snprintf (kernel_name, sizeof (kernel_name), "m%05u_hook12", kern_type);
+
+          CL_rc = hc_clCreateKernel (hashcat_ctx, device_param->opencl_program, kernel_name, &device_param->opencl_kernel12);
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = get_opencl_kernel_wgs (hashcat_ctx, device_param, device_param->opencl_kernel12, &device_param->kernel_wgs12);
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel12, &device_param->kernel_local_mem_size12);
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel12, &device_param->kernel_preferred_wgs_multiple12);
+
+          if (CL_rc == -1) return -1;
+        }
+
+        // kernel23
+
+        if (hashconfig->opts_type & OPTS_TYPE_HOOK23)
+        {
+          snprintf (kernel_name, sizeof (kernel_name), "m%05u_hook23", kern_type);
+
+          CL_rc = hc_clCreateKernel (hashcat_ctx, device_param->opencl_program, kernel_name, &device_param->opencl_kernel23);
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = get_opencl_kernel_wgs (hashcat_ctx, device_param, device_param->opencl_kernel23, &device_param->kernel_wgs23);
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel23, &device_param->kernel_local_mem_size23);
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel23, &device_param->kernel_preferred_wgs_multiple23);
+
+          if (CL_rc == -1) return -1;
+        }
+
+        // init2
+
+        if (hashconfig->opts_type & OPTS_TYPE_INIT2)
+        {
+          snprintf (kernel_name, sizeof (kernel_name), "m%05u_init2", kern_type);
+
+          CL_rc = hc_clCreateKernel (hashcat_ctx, device_param->opencl_program, kernel_name, &device_param->opencl_kernel_init2);
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = get_opencl_kernel_wgs (hashcat_ctx, device_param, device_param->opencl_kernel_init2, &device_param->kernel_wgs_init2);
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_init2, &device_param->kernel_local_mem_size_init2);
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel_init2, &device_param->kernel_preferred_wgs_multiple_init2);
+
+          if (CL_rc == -1) return -1;
+        }
+
+        // loop2
+
+        if (hashconfig->opts_type & OPTS_TYPE_LOOP2)
+        {
+          snprintf (kernel_name, sizeof (kernel_name), "m%05u_loop2", kern_type);
+
+          CL_rc = hc_clCreateKernel (hashcat_ctx, device_param->opencl_program, kernel_name, &device_param->opencl_kernel_loop2);
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = get_opencl_kernel_wgs (hashcat_ctx, device_param, device_param->opencl_kernel_loop2, &device_param->kernel_wgs_loop2);
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_loop2, &device_param->kernel_local_mem_size_loop2);
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel_loop2, &device_param->kernel_preferred_wgs_multiple_loop2);
+
+          if (CL_rc == -1) return -1;
+        }
+
+        // aux1
+
+        if (hashconfig->opts_type & OPTS_TYPE_AUX1)
+        {
+          snprintf (kernel_name, sizeof (kernel_name), "m%05u_aux1", kern_type);
+
+          CL_rc = hc_clCreateKernel (hashcat_ctx, device_param->opencl_program, kernel_name, &device_param->opencl_kernel_aux1);
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = get_opencl_kernel_wgs (hashcat_ctx, device_param, device_param->opencl_kernel_aux1, &device_param->kernel_wgs_aux1);
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_aux1, &device_param->kernel_local_mem_size_aux1);
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel_aux1, &device_param->kernel_preferred_wgs_multiple_aux1);
+
+          if (CL_rc == -1) return -1;
+        }
+
+        // aux2
+
+        if (hashconfig->opts_type & OPTS_TYPE_AUX2)
+        {
+          snprintf (kernel_name, sizeof (kernel_name), "m%05u_aux2", kern_type);
+
+          CL_rc = hc_clCreateKernel (hashcat_ctx, device_param->opencl_program, kernel_name, &device_param->opencl_kernel_aux2);
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = get_opencl_kernel_wgs (hashcat_ctx, device_param, device_param->opencl_kernel_aux2, &device_param->kernel_wgs_aux2);
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_aux2, &device_param->kernel_local_mem_size_aux2);
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel_aux2, &device_param->kernel_preferred_wgs_multiple_aux2);
+
+          if (CL_rc == -1) return -1;
+        }
+
+        // aux3
+
+        if (hashconfig->opts_type & OPTS_TYPE_AUX3)
+        {
+          snprintf (kernel_name, sizeof (kernel_name), "m%05u_aux3", kern_type);
+
+          CL_rc = hc_clCreateKernel (hashcat_ctx, device_param->opencl_program, kernel_name, &device_param->opencl_kernel_aux3);
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = get_opencl_kernel_wgs (hashcat_ctx, device_param, device_param->opencl_kernel_aux3, &device_param->kernel_wgs_aux3);
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_aux3, &device_param->kernel_local_mem_size_aux3);
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel_aux3, &device_param->kernel_preferred_wgs_multiple_aux3);
+
+          if (CL_rc == -1) return -1;
+        }
+
+        // aux4
+
+        if (hashconfig->opts_type & OPTS_TYPE_AUX4)
+        {
+          snprintf (kernel_name, sizeof (kernel_name), "m%05u_aux4", kern_type);
+
+          CL_rc = hc_clCreateKernel (hashcat_ctx, device_param->opencl_program, kernel_name, &device_param->opencl_kernel_aux4);
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = get_opencl_kernel_wgs (hashcat_ctx, device_param, device_param->opencl_kernel_aux4, &device_param->kernel_wgs_aux4);
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_aux4, &device_param->kernel_local_mem_size_aux4);
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel_aux4, &device_param->kernel_preferred_wgs_multiple_aux4);
+
+          if (CL_rc == -1) return -1;
+        }
+      }
+
+      // GPU memset
+
+      CL_rc = hc_clCreateKernel (hashcat_ctx, device_param->opencl_program, "gpu_memset", &device_param->opencl_kernel_memset);
+
+      if (CL_rc == -1) return -1;
+
+      CL_rc = get_opencl_kernel_wgs (hashcat_ctx, device_param, device_param->opencl_kernel_memset, &device_param->kernel_wgs_memset);
+
+      if (CL_rc == -1) return -1;
+
+      CL_rc = get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_memset, &device_param->kernel_local_mem_size_memset);
+
+      if (CL_rc == -1) return -1;
+
+      CL_rc = get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel_memset, &device_param->kernel_preferred_wgs_multiple_memset);
+
+      if (CL_rc == -1) return -1;
+
+      CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_memset, 0, sizeof (cl_mem),   device_param->kernel_params_memset[0]); if (CL_rc == -1) return -1;
+      CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_memset, 1, sizeof (cl_uint),  device_param->kernel_params_memset[1]); if (CL_rc == -1) return -1;
+      CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_memset, 2, sizeof (cl_ulong), device_param->kernel_params_memset[2]); if (CL_rc == -1) return -1;
+
+      // GPU autotune init
+
+      CL_rc = hc_clCreateKernel (hashcat_ctx, device_param->opencl_program, "gpu_atinit", &device_param->opencl_kernel_atinit);
+
+      if (CL_rc == -1) return -1;
+
+      CL_rc = get_opencl_kernel_wgs (hashcat_ctx, device_param, device_param->opencl_kernel_atinit, &device_param->kernel_wgs_atinit);
+
+      if (CL_rc == -1) return -1;
+
+      CL_rc = get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_atinit, &device_param->kernel_local_mem_size_atinit);
+
+      if (CL_rc == -1) return -1;
+
+      CL_rc = get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel_atinit, &device_param->kernel_preferred_wgs_multiple_atinit);
+
+      if (CL_rc == -1) return -1;
+
+      CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_atinit, 0, sizeof (cl_mem),   device_param->kernel_params_atinit[0]); if (CL_rc == -1) return -1;
+      CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_atinit, 1, sizeof (cl_ulong), device_param->kernel_params_atinit[1]); if (CL_rc == -1) return -1;
+
+      // GPU decompress
+
+      CL_rc = hc_clCreateKernel (hashcat_ctx, device_param->opencl_program, "gpu_decompress", &device_param->opencl_kernel_decompress);
+
+      if (CL_rc == -1) return -1;
+
+      CL_rc = get_opencl_kernel_wgs (hashcat_ctx, device_param, device_param->opencl_kernel_decompress, &device_param->kernel_wgs_decompress);
+
+      if (CL_rc == -1) return -1;
+
+      CL_rc = get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_decompress, &device_param->kernel_local_mem_size_decompress);
+
+      if (CL_rc == -1) return -1;
+
+      CL_rc = get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel_decompress, &device_param->kernel_preferred_wgs_multiple_decompress);
+
+      if (CL_rc == -1) return -1;
+
+      CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_decompress, 0, sizeof (cl_mem),   device_param->kernel_params_decompress[0]); if (CL_rc == -1) return -1;
+      CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_decompress, 1, sizeof (cl_mem),   device_param->kernel_params_decompress[1]); if (CL_rc == -1) return -1;
+      CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_decompress, 2, sizeof (cl_mem),   device_param->kernel_params_decompress[2]); if (CL_rc == -1) return -1;
+      CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_decompress, 3, sizeof (cl_ulong), device_param->kernel_params_decompress[3]); if (CL_rc == -1) return -1;
+
+      // MP start
+
+      if (user_options->slow_candidates == true)
+      {
+      }
+      else
+      {
+        if (user_options->attack_mode == ATTACK_MODE_BF)
+        {
+          // mp_l
+
+          CL_rc = hc_clCreateKernel (hashcat_ctx, device_param->opencl_program_mp, "l_markov", &device_param->opencl_kernel_mp_l);
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = get_opencl_kernel_wgs (hashcat_ctx, device_param, device_param->opencl_kernel_mp_l, &device_param->kernel_wgs_mp_l);
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_mp_l, &device_param->kernel_local_mem_size_mp_l);
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel_mp_l, &device_param->kernel_preferred_wgs_multiple_mp_l);
+
+          if (CL_rc == -1) return -1;
+
+          // mp_r
+
+          CL_rc = hc_clCreateKernel (hashcat_ctx, device_param->opencl_program_mp, "r_markov", &device_param->opencl_kernel_mp_r);
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = get_opencl_kernel_wgs (hashcat_ctx, device_param, device_param->opencl_kernel_mp_r, &device_param->kernel_wgs_mp_r);
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_mp_r, &device_param->kernel_local_mem_size_mp_r);
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel_mp_r, &device_param->kernel_preferred_wgs_multiple_mp_r);
+
+          if (CL_rc == -1) return -1;
+
+          if (hashconfig->opts_type & OPTS_TYPE_PT_BITSLICE)
+          {
+            CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_tm, 0, sizeof (cl_mem), device_param->kernel_params_tm[0]); if (CL_rc == -1) return -1;
+            CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_tm, 1, sizeof (cl_mem), device_param->kernel_params_tm[1]); if (CL_rc == -1) return -1;
+          }
+        }
+        else if (user_options->attack_mode == ATTACK_MODE_HYBRID1)
+        {
+          CL_rc = hc_clCreateKernel (hashcat_ctx, device_param->opencl_program_mp, "C_markov", &device_param->opencl_kernel_mp);
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = get_opencl_kernel_wgs (hashcat_ctx, device_param, device_param->opencl_kernel_mp, &device_param->kernel_wgs_mp);
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_mp, &device_param->kernel_local_mem_size_mp);
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel_mp, &device_param->kernel_preferred_wgs_multiple_mp);
+
+          if (CL_rc == -1) return -1;
+        }
+        else if (user_options->attack_mode == ATTACK_MODE_HYBRID2)
+        {
+          CL_rc = hc_clCreateKernel (hashcat_ctx, device_param->opencl_program_mp, "C_markov", &device_param->opencl_kernel_mp);
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = get_opencl_kernel_wgs (hashcat_ctx, device_param, device_param->opencl_kernel_mp, &device_param->kernel_wgs_mp);
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_mp, &device_param->kernel_local_mem_size_mp);
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel_mp, &device_param->kernel_preferred_wgs_multiple_mp);
+
+          if (CL_rc == -1) return -1;
+        }
+      }
+
+      if (user_options->slow_candidates == true)
+      {
+      }
+      else
+      {
+        if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
+        {
+          // nothing to do
+        }
+        else
+        {
+          CL_rc = hc_clCreateKernel (hashcat_ctx, device_param->opencl_program_amp, "amp", &device_param->opencl_kernel_amp);
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = get_opencl_kernel_wgs (hashcat_ctx, device_param, device_param->opencl_kernel_amp, &device_param->kernel_wgs_amp);
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_amp, &device_param->kernel_local_mem_size_amp);
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel_amp, &device_param->kernel_preferred_wgs_multiple_amp);
+
+          if (CL_rc == -1) return -1;
+        }
+
+        if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
+        {
+          // nothing to do
+        }
+        else
+        {
+          for (u32 i = 0; i < 5; i++)
+          {
+            CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_amp, i, sizeof (cl_mem), device_param->kernel_params_amp[i]);
+
+            if (CL_rc == -1) return -1;
+          }
+
+          for (u32 i = 5; i < 6; i++)
+          {
+            CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_amp, i, sizeof (cl_uint), device_param->kernel_params_amp[i]);
+
+            if (CL_rc == -1) return -1;
+          }
+
+          for (u32 i = 6; i < 7; i++)
+          {
+            CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_amp, i, sizeof (cl_ulong), device_param->kernel_params_amp[i]);
+
+            if (CL_rc == -1) return -1;
+          }
+        }
+      }
+
+      // zero some data buffers
+
+      CL_rc = run_opencl_kernel_bzero (hashcat_ctx, device_param, device_param->opencl_d_plain_bufs,    device_param->size_plains);   if (CL_rc == -1) return -1;
+      CL_rc = run_opencl_kernel_bzero (hashcat_ctx, device_param, device_param->opencl_d_digests_shown, device_param->size_shown);    if (CL_rc == -1) return -1;
+      CL_rc = run_opencl_kernel_bzero (hashcat_ctx, device_param, device_param->opencl_d_result,        device_param->size_results);  if (CL_rc == -1) return -1;
+
+      /**
+       * special buffers
+       */
+
+      if (user_options->slow_candidates == true)
+      {
+        CL_rc = run_opencl_kernel_bzero (hashcat_ctx, device_param, device_param->opencl_d_rules_c, size_rules_c); if (CL_rc == -1) return -1;
+      }
+      else
+      {
+        if (user_options_extra->attack_kern == ATTACK_KERN_STRAIGHT)
+        {
+          CL_rc = run_opencl_kernel_bzero (hashcat_ctx, device_param, device_param->opencl_d_rules_c, size_rules_c); if (CL_rc == -1) return -1;
+        }
+        else if (user_options_extra->attack_kern == ATTACK_KERN_COMBI)
+        {
+          CL_rc = run_opencl_kernel_bzero (hashcat_ctx, device_param, device_param->opencl_d_combs,          size_combs);       if (CL_rc == -1) return -1;
+          CL_rc = run_opencl_kernel_bzero (hashcat_ctx, device_param, device_param->opencl_d_combs_c,        size_combs);       if (CL_rc == -1) return -1;
+          CL_rc = run_opencl_kernel_bzero (hashcat_ctx, device_param, device_param->opencl_d_root_css_buf,   size_root_css);    if (CL_rc == -1) return -1;
+          CL_rc = run_opencl_kernel_bzero (hashcat_ctx, device_param, device_param->opencl_d_markov_css_buf, size_markov_css);  if (CL_rc == -1) return -1;
+        }
+        else if (user_options_extra->attack_kern == ATTACK_KERN_BF)
+        {
+          CL_rc = run_opencl_kernel_bzero (hashcat_ctx, device_param, device_param->opencl_d_bfs,            size_bfs);         if (CL_rc == -1) return -1;
+          CL_rc = run_opencl_kernel_bzero (hashcat_ctx, device_param, device_param->opencl_d_bfs_c,          size_bfs);         if (CL_rc == -1) return -1;
+          CL_rc = run_opencl_kernel_bzero (hashcat_ctx, device_param, device_param->opencl_d_tm_c,           size_tm);          if (CL_rc == -1) return -1;
+          CL_rc = run_opencl_kernel_bzero (hashcat_ctx, device_param, device_param->opencl_d_root_css_buf,   size_root_css);    if (CL_rc == -1) return -1;
+          CL_rc = run_opencl_kernel_bzero (hashcat_ctx, device_param, device_param->opencl_d_markov_css_buf, size_markov_css);  if (CL_rc == -1) return -1;
+        }
+      }
+
+      if (user_options->slow_candidates == true)
+      {
+      }
+      else
+      {
+        if ((user_options->attack_mode == ATTACK_MODE_HYBRID1) || (user_options->attack_mode == ATTACK_MODE_HYBRID2))
+        {
+          /**
+           * prepare mp
+           */
+
+          if (user_options->attack_mode == ATTACK_MODE_HYBRID1)
+          {
+            device_param->kernel_params_mp_buf32[5] = 0;
+            device_param->kernel_params_mp_buf32[6] = 0;
+            device_param->kernel_params_mp_buf32[7] = 0;
+
+            if (hashconfig->opts_type & OPTS_TYPE_PT_ADD01)     device_param->kernel_params_mp_buf32[5] = full01;
+            if (hashconfig->opts_type & OPTS_TYPE_PT_ADD06)     device_param->kernel_params_mp_buf32[5] = full06;
+            if (hashconfig->opts_type & OPTS_TYPE_PT_ADD80)     device_param->kernel_params_mp_buf32[5] = full80;
+            if (hashconfig->opts_type & OPTS_TYPE_PT_ADDBITS14) device_param->kernel_params_mp_buf32[6] = 1;
+            if (hashconfig->opts_type & OPTS_TYPE_PT_ADDBITS15) device_param->kernel_params_mp_buf32[7] = 1;
+          }
+          else if (user_options->attack_mode == ATTACK_MODE_HYBRID2)
+          {
+            device_param->kernel_params_mp_buf32[5] = 0;
+            device_param->kernel_params_mp_buf32[6] = 0;
+            device_param->kernel_params_mp_buf32[7] = 0;
+          }
+
+          for (u32 i = 0; i < 3; i++) { CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_mp, i, sizeof (cl_mem), device_param->kernel_params_mp[i]); if (CL_rc == -1) return -1; }
+        }
+        else if (user_options->attack_mode == ATTACK_MODE_BF)
+        {
+          /**
+           * prepare mp_r and mp_l
+           */
+
+          device_param->kernel_params_mp_l_buf32[6] = 0;
+          device_param->kernel_params_mp_l_buf32[7] = 0;
+          device_param->kernel_params_mp_l_buf32[8] = 0;
+
+          if (hashconfig->opts_type & OPTS_TYPE_PT_ADD01)     device_param->kernel_params_mp_l_buf32[6] = full01;
+          if (hashconfig->opts_type & OPTS_TYPE_PT_ADD06)     device_param->kernel_params_mp_l_buf32[6] = full06;
+          if (hashconfig->opts_type & OPTS_TYPE_PT_ADD80)     device_param->kernel_params_mp_l_buf32[6] = full80;
+          if (hashconfig->opts_type & OPTS_TYPE_PT_ADDBITS14) device_param->kernel_params_mp_l_buf32[7] = 1;
+          if (hashconfig->opts_type & OPTS_TYPE_PT_ADDBITS15) device_param->kernel_params_mp_l_buf32[8] = 1;
+
+          for (u32 i = 0; i < 3; i++) { CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_mp_l, i, sizeof (cl_mem), device_param->kernel_params_mp_l[i]); if (CL_rc == -1) return -1; }
+          for (u32 i = 0; i < 3; i++) { CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_mp_r, i, sizeof (cl_mem), device_param->kernel_params_mp_r[i]); if (CL_rc == -1) return -1; }
+        }
+      }
+    }
+
+    // this is required because inside the kernels there is this:
+    // __local pw_t s_pws[64];
+
+    if (user_options->attack_mode == ATTACK_MODE_STRAIGHT)
+    {
+      if (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL)
+      {
+        // not required
+      }
+      else
+      {
+        device_param->kernel_threads_max = MIN (device_param->kernel_threads_max, 64);
+      }
+    }
+
+    /**
+     * now everything that depends on threads and accel, basically dynamic workload
+     */
+
+    const u32 kernel_threads = get_kernel_threads (device_param);
+
+    device_param->kernel_threads = kernel_threads;
+
+    device_param->hardware_power = device_processors * kernel_threads;
+
+    u32 kernel_accel_min = device_param->kernel_accel_min;
+    u32 kernel_accel_max = device_param->kernel_accel_max;
+
+    // find out if we would request too much memory on memory blocks which are based on kernel_accel
+
+    u64 size_pws      = 4;
+    u64 size_pws_amp  = 4;
+    u64 size_pws_comp = 4;
+    u64 size_pws_idx  = 4;
+    u64 size_pws_pre  = 4;
+    u64 size_pws_base = 4;
+    u64 size_tmps     = 4;
+    u64 size_hooks    = 4;
+    #ifdef WITH_BRAIN
+    u64 size_brain_link_in  = 4;
+    u64 size_brain_link_out = 4;
+    #endif
+
+    // instead of a thread limit we can also use a memory limit.
+    // this value should represent a reasonable amount of memory a host system has per GPU.
+    // note we're allocating 3 blocks of that size.
+
+    const u64 PWS_SPACE = 1024ull * 1024ull * 1024ull;
+
+    // sometimes device_available_mem and device_maxmem_alloc reported back from the opencl runtime are a bit inaccurate.
+    // let's add some extra space just to be sure.
+
+    const u64 EXTRA_SPACE = 64ull * 1024ull * 1024ull;
+
+    while (kernel_accel_max >= kernel_accel_min)
+    {
+      const u64 kernel_power_max = device_param->hardware_power * kernel_accel_max;
+
+      // size_pws
+
+      size_pws = (u64) kernel_power_max * sizeof (pw_t);
+
+      size_pws_amp = (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL) ? 1 : size_pws;
+
+      // size_pws_comp
+
+      size_pws_comp = (u64) kernel_power_max * (sizeof (u32) * 64);
+
+      // size_pws_idx
+
+      size_pws_idx = (u64) (kernel_power_max + 1) * sizeof (pw_idx_t);
+
+      // size_tmps
+
+      size_tmps = (u64) kernel_power_max * (hashconfig->tmp_size + hashconfig->extra_tmp_size);
+
+      // size_hooks
+
+      size_hooks = (u64) kernel_power_max * hashconfig->hook_size;
+
+      #ifdef WITH_BRAIN
+      // size_brains
+
+      size_brain_link_in  = (u64) kernel_power_max * 1;
+      size_brain_link_out = (u64) kernel_power_max * 8;
+      #endif
+
+      if (user_options->slow_candidates == true)
+      {
+        // size_pws_pre
+
+        size_pws_pre = (u64) kernel_power_max * sizeof (pw_pre_t);
+
+        // size_pws_base
+
+        size_pws_base = (u64) kernel_power_max * sizeof (pw_pre_t);
+      }
+
+      // now check if all device-memory sizes which depend on the kernel_accel_max amplifier are within its boundaries
+      // if not, decrease amplifier and try again
+
+      int memory_limit_hit = 0;
+
+      if (size_pws > PWS_SPACE) memory_limit_hit = 1;
+
+      if ((size_pws   + EXTRA_SPACE) > device_param->device_maxmem_alloc) memory_limit_hit = 1;
+      if ((size_tmps  + EXTRA_SPACE) > device_param->device_maxmem_alloc) memory_limit_hit = 1;
+      if ((size_hooks + EXTRA_SPACE) > device_param->device_maxmem_alloc) memory_limit_hit = 1;
+
+      const u64 size_total
+        = bitmap_ctx->bitmap_size
+        + bitmap_ctx->bitmap_size
+        + bitmap_ctx->bitmap_size
+        + bitmap_ctx->bitmap_size
+        + bitmap_ctx->bitmap_size
+        + bitmap_ctx->bitmap_size
+        + bitmap_ctx->bitmap_size
+        + bitmap_ctx->bitmap_size
+        + size_bfs
+        + size_combs
+        + size_digests
+        + size_esalts
+        + size_hooks
+        + size_markov_css
+        + size_plains
+        + size_pws
+        + size_pws_amp
+        + size_pws_comp
+        + size_pws_idx
+        + size_results
+        + size_root_css
+        + size_rules
+        + size_rules_c
+        + size_salts
+        + size_extra_buffer
+        + size_shown
+        + size_tm
+        + size_tmps
+        + size_st_digests
+        + size_st_salts
+        + size_st_esalts;
+
+      if ((size_total + EXTRA_SPACE) > device_param->device_available_mem) memory_limit_hit = 1;
+
+      if (memory_limit_hit == 1)
+      {
+        kernel_accel_max--;
+
+        continue;
+      }
+
+      const u64 size_total_host
+        = size_pws_comp
+        + size_pws_idx
+        + size_hooks
+        #ifdef WITH_BRAIN
+        + size_brain_link_in
+        + size_brain_link_out
+        #endif
+        + size_pws_pre
+        + size_pws_base;
+
+      size_total_host_all += size_total_host + EXTRA_SPACE;
+
+      break;
+    }
+
+    if (kernel_accel_max < kernel_accel_min)
+    {
+      event_log_error (hashcat_ctx, "* Device #%u: Not enough allocatable device memory for this attack.", device_id + 1);
+
+      return -1;
+    }
+
+    device_param->kernel_accel_min = kernel_accel_min;
+    device_param->kernel_accel_max = kernel_accel_max;
+
+    device_param->size_pws      = size_pws;
+    device_param->size_pws_amp  = size_pws_amp;
+    device_param->size_pws_comp = size_pws_comp;
+    device_param->size_pws_idx  = size_pws_idx;
+    device_param->size_pws_pre  = size_pws_pre;
+    device_param->size_pws_base = size_pws_base;
+    device_param->size_tmps     = size_tmps;
+    device_param->size_hooks    = size_hooks;
+    #ifdef WITH_BRAIN
+    device_param->size_brain_link_in  = size_brain_link_in;
+    device_param->size_brain_link_out = size_brain_link_out;
+    #endif
+
+    if (device_param->is_cuda == true)
+    {
+      CU_rc = hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_pws_buf,      size_pws);      if (CU_rc == -1) return -1;
+      CU_rc = hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_pws_amp_buf,  size_pws_amp);  if (CU_rc == -1) return -1;
+      CU_rc = hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_pws_comp_buf, size_pws_comp); if (CU_rc == -1) return -1;
+      CU_rc = hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_pws_idx,      size_pws_idx);  if (CU_rc == -1) return -1;
+      CU_rc = hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_tmps,         size_tmps);     if (CU_rc == -1) return -1;
+      CU_rc = hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_hooks,        size_hooks);    if (CU_rc == -1) return -1;
+
+      CU_rc = run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_pws_buf,       device_param->size_pws);      if (CU_rc == -1) return -1;
+      CU_rc = run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_pws_amp_buf,   device_param->size_pws_amp);  if (CU_rc == -1) return -1;
+      CU_rc = run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_pws_comp_buf,  device_param->size_pws_comp); if (CU_rc == -1) return -1;
+      CU_rc = run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_pws_idx,       device_param->size_pws_idx);  if (CU_rc == -1) return -1;
+      CU_rc = run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_tmps,          device_param->size_tmps);     if (CU_rc == -1) return -1;
+      CU_rc = run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_hooks,         device_param->size_hooks);    if (CU_rc == -1) return -1;
+    }
+
+    if (device_param->is_opencl == true)
+    {
+      CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE,  size_pws,      NULL, &device_param->opencl_d_pws_buf);      if (CL_rc == -1) return -1;
+      CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE,  size_pws_amp,  NULL, &device_param->opencl_d_pws_amp_buf);  if (CL_rc == -1) return -1;
+      CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY,   size_pws_comp, NULL, &device_param->opencl_d_pws_comp_buf); if (CL_rc == -1) return -1;
+      CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY,   size_pws_idx,  NULL, &device_param->opencl_d_pws_idx);      if (CL_rc == -1) return -1;
+      CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE,  size_tmps,     NULL, &device_param->opencl_d_tmps);         if (CL_rc == -1) return -1;
+      CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE,  size_hooks,    NULL, &device_param->opencl_d_hooks);        if (CL_rc == -1) return -1;
+
+      CL_rc = run_opencl_kernel_bzero (hashcat_ctx, device_param, device_param->opencl_d_pws_buf,       device_param->size_pws);      if (CL_rc == -1) return -1;
+      CL_rc = run_opencl_kernel_bzero (hashcat_ctx, device_param, device_param->opencl_d_pws_amp_buf,   device_param->size_pws_amp);  if (CL_rc == -1) return -1;
+      CL_rc = run_opencl_kernel_bzero (hashcat_ctx, device_param, device_param->opencl_d_pws_comp_buf,  device_param->size_pws_comp); if (CL_rc == -1) return -1;
+      CL_rc = run_opencl_kernel_bzero (hashcat_ctx, device_param, device_param->opencl_d_pws_idx,       device_param->size_pws_idx);  if (CL_rc == -1) return -1;
+      CL_rc = run_opencl_kernel_bzero (hashcat_ctx, device_param, device_param->opencl_d_tmps,          device_param->size_tmps);     if (CL_rc == -1) return -1;
+      CL_rc = run_opencl_kernel_bzero (hashcat_ctx, device_param, device_param->opencl_d_hooks,         device_param->size_hooks);    if (CL_rc == -1) return -1;
+    }
+
+    /**
+     * main host data
+     */
+
+    u32 *pws_comp = (u32 *) hcmalloc (size_pws_comp);
+
+    device_param->pws_comp = pws_comp;
+
+    pw_idx_t *pws_idx = (pw_idx_t *) hcmalloc (size_pws_idx);
+
+    device_param->pws_idx = pws_idx;
+
+    pw_t *combs_buf = (pw_t *) hccalloc (KERNEL_COMBS, sizeof (pw_t));
+
+    device_param->combs_buf = combs_buf;
+
+    void *hooks_buf = hcmalloc (size_hooks);
+
+    device_param->hooks_buf = hooks_buf;
+
+    char *scratch_buf = (char *) hcmalloc (HCBUFSIZ_LARGE);
+
+    device_param->scratch_buf = scratch_buf;
+
+    #ifdef WITH_BRAIN
+
+    u8 *brain_link_in_buf = (u8 *) hcmalloc (size_brain_link_in);
+
+    device_param->brain_link_in_buf = brain_link_in_buf;
+
+    u32 *brain_link_out_buf = (u32 *) hcmalloc (size_brain_link_out);
+
+    device_param->brain_link_out_buf = brain_link_out_buf;
+    #endif
+
+    pw_pre_t *pws_pre_buf = (pw_pre_t *) hcmalloc (size_pws_pre);
+
+    device_param->pws_pre_buf = pws_pre_buf;
+
+    pw_pre_t *pws_base_buf = (pw_pre_t *) hcmalloc (size_pws_base);
+
+    device_param->pws_base_buf = pws_base_buf;
+
+    /**
+     * kernel args
+     */
+
+    if (device_param->is_cuda == true)
+    {
+      device_param->kernel_params[ 0] = &device_param->cuda_d_pws_buf;
+      device_param->kernel_params[ 4] = &device_param->cuda_d_tmps;
+      device_param->kernel_params[ 5] = &device_param->cuda_d_hooks;
+    }
+
+    if (device_param->is_opencl == true)
+    {
+      device_param->kernel_params[ 0] = &device_param->opencl_d_pws_buf;
+      device_param->kernel_params[ 4] = &device_param->opencl_d_tmps;
+      device_param->kernel_params[ 5] = &device_param->opencl_d_hooks;
+    }
+
+    if (user_options->slow_candidates == true)
+    {
+    }
+    else
+    {
+      if (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL)
+      {
+        // nothing to do
+      }
+      else
+      {
+        if (user_options->attack_mode == ATTACK_MODE_HYBRID2)
+        {
+          if (device_param->is_cuda == true)
+          {
+            device_param->kernel_params_mp[0] = (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
+                                              ? &device_param->cuda_d_pws_buf
+                                              : &device_param->cuda_d_pws_amp_buf;
+
+            //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_mp, 0, sizeof (cl_mem), device_param->kernel_params_mp[0]); if (CL_rc == -1) return -1;
+          }
+
+          if (device_param->is_opencl == true)
+          {
+            device_param->kernel_params_mp[0] = (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
+                                              ? &device_param->opencl_d_pws_buf
+                                              : &device_param->opencl_d_pws_amp_buf;
+
+            CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_mp, 0, sizeof (cl_mem), device_param->kernel_params_mp[0]); if (CL_rc == -1) return -1;
+          }
+        }
+      }
+
+      if (user_options->attack_mode == ATTACK_MODE_BF)
+      {
+        if (device_param->is_cuda == true)
+        {
+          device_param->kernel_params_mp_l[0] = (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
+                                              ? &device_param->cuda_d_pws_buf
+                                              : &device_param->cuda_d_pws_amp_buf;
+
+          //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_mp_l, 0, sizeof (cl_mem), device_param->kernel_params_mp_l[0]); if (CL_rc == -1) return -1;
+        }
+
+        if (device_param->is_opencl == true)
+        {
+          device_param->kernel_params_mp_l[0] = (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
+                                              ? &device_param->opencl_d_pws_buf
+                                              : &device_param->opencl_d_pws_amp_buf;
+
+          CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_mp_l, 0, sizeof (cl_mem), device_param->kernel_params_mp_l[0]); if (CL_rc == -1) return -1;
+        }
+      }
+
+      if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
+      {
+        // nothing to do
+      }
+      else
+      {
+        if (device_param->is_cuda == true)
+        {
+          device_param->kernel_params_amp[0] = &device_param->cuda_d_pws_buf;
+          device_param->kernel_params_amp[1] = &device_param->cuda_d_pws_amp_buf;
+
+          //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_amp, 0, sizeof (cl_mem), device_param->kernel_params_amp[0]); if (CL_rc == -1) return -1;
+          //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_amp, 1, sizeof (cl_mem), device_param->kernel_params_amp[1]); if (CL_rc == -1) return -1;
+        }
+
+        if (device_param->is_opencl == true)
+        {
+          device_param->kernel_params_amp[0] = &device_param->opencl_d_pws_buf;
+          device_param->kernel_params_amp[1] = &device_param->opencl_d_pws_amp_buf;
+
+          CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_amp, 0, sizeof (cl_mem), device_param->kernel_params_amp[0]); if (CL_rc == -1) return -1;
+          CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_amp, 1, sizeof (cl_mem), device_param->kernel_params_amp[1]); if (CL_rc == -1) return -1;
+        }
+      }
+    }
+
+    if (device_param->is_cuda == true)
+    {
+      device_param->kernel_params_decompress[0] = &device_param->cuda_d_pws_idx;
+      device_param->kernel_params_decompress[1] = &device_param->cuda_d_pws_comp_buf;
+      device_param->kernel_params_decompress[2] = (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
+                                                ? &device_param->cuda_d_pws_buf
+                                                : &device_param->cuda_d_pws_amp_buf;
+
+      //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_decompress, 0, sizeof (cl_mem), device_param->kernel_params_decompress[0]); if (CL_rc == -1) return -1;
+      //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_decompress, 1, sizeof (cl_mem), device_param->kernel_params_decompress[1]); if (CL_rc == -1) return -1;
+      //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_decompress, 2, sizeof (cl_mem), device_param->kernel_params_decompress[2]); if (CL_rc == -1) return -1;
+    }
+
+    if (device_param->is_opencl == true)
+    {
+      device_param->kernel_params_decompress[0] = &device_param->opencl_d_pws_idx;
+      device_param->kernel_params_decompress[1] = &device_param->opencl_d_pws_comp_buf;
+      device_param->kernel_params_decompress[2] = (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
+                                                ? &device_param->opencl_d_pws_buf
+                                                : &device_param->opencl_d_pws_amp_buf;
+
+      CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_decompress, 0, sizeof (cl_mem), device_param->kernel_params_decompress[0]); if (CL_rc == -1) return -1;
+      CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_decompress, 1, sizeof (cl_mem), device_param->kernel_params_decompress[1]); if (CL_rc == -1) return -1;
+      CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_decompress, 2, sizeof (cl_mem), device_param->kernel_params_decompress[2]); if (CL_rc == -1) return -1;
+    }
+
+    hardware_power_all += device_param->hardware_power;
+
+    EVENT_DATA (EVENT_BACKEND_DEVICE_INIT_POST, &backend_devices_idx, sizeof (int));
+  }
+
+  if (user_options->benchmark == false)
+  {
+    if (hardware_power_all == 0) return -1;
+  }
+
+  backend_ctx->hardware_power_all = hardware_power_all;
+
+  EVENT_DATA (EVENT_BACKEND_SESSION_HOSTMEM, &size_total_host_all, sizeof (u64));
+
+  return 0;
+}
+
+void backend_session_destroy (hashcat_ctx_t *hashcat_ctx)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  if (backend_ctx->enabled == false) return;
+
+  for (int backend_devices_idx = 0; backend_devices_idx < backend_ctx->backend_devices_cnt; backend_devices_idx++)
+  {
+    hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
+
+    if (device_param->skipped == true) continue;
+
+    hcfree (device_param->pws_comp);
+    hcfree (device_param->pws_idx);
+    hcfree (device_param->pws_pre_buf);
+    hcfree (device_param->pws_base_buf);
+    hcfree (device_param->combs_buf);
+    hcfree (device_param->hooks_buf);
+    hcfree (device_param->scratch_buf);
+    #ifdef WITH_BRAIN
+    hcfree (device_param->brain_link_in_buf);
+    hcfree (device_param->brain_link_out_buf);
+    #endif
+
+    if (device_param->is_cuda == true)
+    {
+      if (device_param->cuda_d_pws_buf)        hc_cuMemFree (hashcat_ctx, device_param->cuda_d_pws_buf);
+      if (device_param->cuda_d_pws_amp_buf)    hc_cuMemFree (hashcat_ctx, device_param->cuda_d_pws_amp_buf);
+      if (device_param->cuda_d_pws_comp_buf)   hc_cuMemFree (hashcat_ctx, device_param->cuda_d_pws_comp_buf);
+      if (device_param->cuda_d_pws_idx)        hc_cuMemFree (hashcat_ctx, device_param->cuda_d_pws_idx);
+      if (device_param->cuda_d_rules)          hc_cuMemFree (hashcat_ctx, device_param->cuda_d_rules);
+      //if (device_param->cuda_d_rules_c)        hc_cuMemFree (hashcat_ctx, device_param->cuda_d_rules_c);
+      if (device_param->cuda_d_combs)          hc_cuMemFree (hashcat_ctx, device_param->cuda_d_combs);
+      if (device_param->cuda_d_combs_c)        hc_cuMemFree (hashcat_ctx, device_param->cuda_d_combs_c);
+      if (device_param->cuda_d_bfs)            hc_cuMemFree (hashcat_ctx, device_param->cuda_d_bfs);
+      //if (device_param->cuda_d_bfs_c)          hc_cuMemFree (hashcat_ctx, device_param->cuda_d_bfs_c);
+      if (device_param->cuda_d_bitmap_s1_a)    hc_cuMemFree (hashcat_ctx, device_param->cuda_d_bitmap_s1_a);
+      if (device_param->cuda_d_bitmap_s1_b)    hc_cuMemFree (hashcat_ctx, device_param->cuda_d_bitmap_s1_b);
+      if (device_param->cuda_d_bitmap_s1_c)    hc_cuMemFree (hashcat_ctx, device_param->cuda_d_bitmap_s1_c);
+      if (device_param->cuda_d_bitmap_s1_d)    hc_cuMemFree (hashcat_ctx, device_param->cuda_d_bitmap_s1_d);
+      if (device_param->cuda_d_bitmap_s2_a)    hc_cuMemFree (hashcat_ctx, device_param->cuda_d_bitmap_s2_a);
+      if (device_param->cuda_d_bitmap_s2_b)    hc_cuMemFree (hashcat_ctx, device_param->cuda_d_bitmap_s2_b);
+      if (device_param->cuda_d_bitmap_s2_c)    hc_cuMemFree (hashcat_ctx, device_param->cuda_d_bitmap_s2_c);
+      if (device_param->cuda_d_bitmap_s2_d)    hc_cuMemFree (hashcat_ctx, device_param->cuda_d_bitmap_s2_d);
+      if (device_param->cuda_d_plain_bufs)     hc_cuMemFree (hashcat_ctx, device_param->cuda_d_plain_bufs);
+      if (device_param->cuda_d_digests_buf)    hc_cuMemFree (hashcat_ctx, device_param->cuda_d_digests_buf);
+      if (device_param->cuda_d_digests_shown)  hc_cuMemFree (hashcat_ctx, device_param->cuda_d_digests_shown);
+      if (device_param->cuda_d_salt_bufs)      hc_cuMemFree (hashcat_ctx, device_param->cuda_d_salt_bufs);
+      if (device_param->cuda_d_esalt_bufs)     hc_cuMemFree (hashcat_ctx, device_param->cuda_d_esalt_bufs);
+      if (device_param->cuda_d_tmps)           hc_cuMemFree (hashcat_ctx, device_param->cuda_d_tmps);
+      if (device_param->cuda_d_hooks)          hc_cuMemFree (hashcat_ctx, device_param->cuda_d_hooks);
+      if (device_param->cuda_d_result)         hc_cuMemFree (hashcat_ctx, device_param->cuda_d_result);
+      if (device_param->cuda_d_extra0_buf)     hc_cuMemFree (hashcat_ctx, device_param->cuda_d_extra0_buf);
+      if (device_param->cuda_d_extra1_buf)     hc_cuMemFree (hashcat_ctx, device_param->cuda_d_extra1_buf);
+      if (device_param->cuda_d_extra2_buf)     hc_cuMemFree (hashcat_ctx, device_param->cuda_d_extra2_buf);
+      if (device_param->cuda_d_extra3_buf)     hc_cuMemFree (hashcat_ctx, device_param->cuda_d_extra3_buf);
+      if (device_param->cuda_d_root_css_buf)   hc_cuMemFree (hashcat_ctx, device_param->cuda_d_root_css_buf);
+      if (device_param->cuda_d_markov_css_buf) hc_cuMemFree (hashcat_ctx, device_param->cuda_d_markov_css_buf);
+      //if (device_param->cuda_d_tm_c)           hc_cuMemFree (hashcat_ctx, device_param->cuda_d_tm_c);
+      if (device_param->cuda_d_st_digests_buf) hc_cuMemFree (hashcat_ctx, device_param->cuda_d_st_digests_buf);
+      if (device_param->cuda_d_st_salts_buf)   hc_cuMemFree (hashcat_ctx, device_param->cuda_d_st_salts_buf);
+      if (device_param->cuda_d_st_esalts_buf)  hc_cuMemFree (hashcat_ctx, device_param->cuda_d_st_esalts_buf);
+
+      if (device_param->cuda_event1)           hc_cuEventDestroy (hashcat_ctx, device_param->cuda_event1);
+      if (device_param->cuda_event2)           hc_cuEventDestroy (hashcat_ctx, device_param->cuda_event2);
+
+      if (device_param->cuda_stream)           hc_cuStreamDestroy (hashcat_ctx, device_param->cuda_stream);
+
+      if (device_param->cuda_module)           hc_cuModuleUnload (hashcat_ctx, device_param->cuda_module);
+      if (device_param->cuda_module_mp)        hc_cuModuleUnload (hashcat_ctx, device_param->cuda_module_mp);
+      if (device_param->cuda_module_amp)       hc_cuModuleUnload (hashcat_ctx, device_param->cuda_module_amp);
+
+      if (device_param->cuda_context)          hc_cuCtxDestroy (hashcat_ctx, device_param->cuda_context);
+
+      device_param->cuda_d_pws_buf            = 0;
+      device_param->cuda_d_pws_amp_buf        = 0;
+      device_param->cuda_d_pws_comp_buf       = 0;
+      device_param->cuda_d_pws_idx            = 0;
+      device_param->cuda_d_rules              = 0;
+      device_param->cuda_d_rules_c            = 0;
+      device_param->cuda_d_combs              = 0;
+      device_param->cuda_d_combs_c            = 0;
+      device_param->cuda_d_bfs                = 0;
+      device_param->cuda_d_bfs_c              = 0;
+      device_param->cuda_d_bitmap_s1_a        = 0;
+      device_param->cuda_d_bitmap_s1_b        = 0;
+      device_param->cuda_d_bitmap_s1_c        = 0;
+      device_param->cuda_d_bitmap_s1_d        = 0;
+      device_param->cuda_d_bitmap_s2_a        = 0;
+      device_param->cuda_d_bitmap_s2_b        = 0;
+      device_param->cuda_d_bitmap_s2_c        = 0;
+      device_param->cuda_d_bitmap_s2_d        = 0;
+      device_param->cuda_d_plain_bufs         = 0;
+      device_param->cuda_d_digests_buf        = 0;
+      device_param->cuda_d_digests_shown      = 0;
+      device_param->cuda_d_salt_bufs          = 0;
+      device_param->cuda_d_esalt_bufs         = 0;
+      device_param->cuda_d_tmps               = 0;
+      device_param->cuda_d_hooks              = 0;
+      device_param->cuda_d_result             = 0;
+      device_param->cuda_d_extra0_buf         = 0;
+      device_param->cuda_d_extra1_buf         = 0;
+      device_param->cuda_d_extra2_buf         = 0;
+      device_param->cuda_d_extra3_buf         = 0;
+      device_param->cuda_d_root_css_buf       = 0;
+      device_param->cuda_d_markov_css_buf     = 0;
+      device_param->cuda_d_tm_c               = 0;
+      device_param->cuda_d_st_digests_buf     = 0;
+      device_param->cuda_d_st_salts_buf       = 0;
+      device_param->cuda_d_st_esalts_buf      = 0;
+
+      device_param->cuda_function1            = NULL;
+      device_param->cuda_function12           = NULL;
+      device_param->cuda_function2            = NULL;
+      device_param->cuda_function23           = NULL;
+      device_param->cuda_function3            = NULL;
+      device_param->cuda_function4            = NULL;
+      device_param->cuda_function_init2       = NULL;
+      device_param->cuda_function_loop2       = NULL;
+      device_param->cuda_function_mp          = NULL;
+      device_param->cuda_function_mp_l        = NULL;
+      device_param->cuda_function_mp_r        = NULL;
+      device_param->cuda_function_tm          = NULL;
+      device_param->cuda_function_amp         = NULL;
+      device_param->cuda_function_memset      = NULL;
+      device_param->cuda_function_atinit      = NULL;
+      device_param->cuda_function_decompress  = NULL;
+      device_param->cuda_function_aux1        = NULL;
+      device_param->cuda_function_aux2        = NULL;
+      device_param->cuda_function_aux3        = NULL;
+      device_param->cuda_function_aux4        = NULL;
+
+      device_param->cuda_module               = NULL;
+      device_param->cuda_module_mp            = NULL;
+      device_param->cuda_module_amp           = NULL;
+
+      device_param->cuda_context              = NULL;
+    }
+
+    if (device_param->is_opencl == true)
+    {
+      if (device_param->opencl_d_pws_buf)        hc_clReleaseMemObject (hashcat_ctx, device_param->opencl_d_pws_buf);
+      if (device_param->opencl_d_pws_amp_buf)    hc_clReleaseMemObject (hashcat_ctx, device_param->opencl_d_pws_amp_buf);
+      if (device_param->opencl_d_pws_comp_buf)   hc_clReleaseMemObject (hashcat_ctx, device_param->opencl_d_pws_comp_buf);
+      if (device_param->opencl_d_pws_idx)        hc_clReleaseMemObject (hashcat_ctx, device_param->opencl_d_pws_idx);
+      if (device_param->opencl_d_rules)          hc_clReleaseMemObject (hashcat_ctx, device_param->opencl_d_rules);
+      if (device_param->opencl_d_rules_c)        hc_clReleaseMemObject (hashcat_ctx, device_param->opencl_d_rules_c);
+      if (device_param->opencl_d_combs)          hc_clReleaseMemObject (hashcat_ctx, device_param->opencl_d_combs);
+      if (device_param->opencl_d_combs_c)        hc_clReleaseMemObject (hashcat_ctx, device_param->opencl_d_combs_c);
+      if (device_param->opencl_d_bfs)            hc_clReleaseMemObject (hashcat_ctx, device_param->opencl_d_bfs);
+      if (device_param->opencl_d_bfs_c)          hc_clReleaseMemObject (hashcat_ctx, device_param->opencl_d_bfs_c);
+      if (device_param->opencl_d_bitmap_s1_a)    hc_clReleaseMemObject (hashcat_ctx, device_param->opencl_d_bitmap_s1_a);
+      if (device_param->opencl_d_bitmap_s1_b)    hc_clReleaseMemObject (hashcat_ctx, device_param->opencl_d_bitmap_s1_b);
+      if (device_param->opencl_d_bitmap_s1_c)    hc_clReleaseMemObject (hashcat_ctx, device_param->opencl_d_bitmap_s1_c);
+      if (device_param->opencl_d_bitmap_s1_d)    hc_clReleaseMemObject (hashcat_ctx, device_param->opencl_d_bitmap_s1_d);
+      if (device_param->opencl_d_bitmap_s2_a)    hc_clReleaseMemObject (hashcat_ctx, device_param->opencl_d_bitmap_s2_a);
+      if (device_param->opencl_d_bitmap_s2_b)    hc_clReleaseMemObject (hashcat_ctx, device_param->opencl_d_bitmap_s2_b);
+      if (device_param->opencl_d_bitmap_s2_c)    hc_clReleaseMemObject (hashcat_ctx, device_param->opencl_d_bitmap_s2_c);
+      if (device_param->opencl_d_bitmap_s2_d)    hc_clReleaseMemObject (hashcat_ctx, device_param->opencl_d_bitmap_s2_d);
+      if (device_param->opencl_d_plain_bufs)     hc_clReleaseMemObject (hashcat_ctx, device_param->opencl_d_plain_bufs);
+      if (device_param->opencl_d_digests_buf)    hc_clReleaseMemObject (hashcat_ctx, device_param->opencl_d_digests_buf);
+      if (device_param->opencl_d_digests_shown)  hc_clReleaseMemObject (hashcat_ctx, device_param->opencl_d_digests_shown);
+      if (device_param->opencl_d_salt_bufs)      hc_clReleaseMemObject (hashcat_ctx, device_param->opencl_d_salt_bufs);
+      if (device_param->opencl_d_esalt_bufs)     hc_clReleaseMemObject (hashcat_ctx, device_param->opencl_d_esalt_bufs);
+      if (device_param->opencl_d_tmps)           hc_clReleaseMemObject (hashcat_ctx, device_param->opencl_d_tmps);
+      if (device_param->opencl_d_hooks)          hc_clReleaseMemObject (hashcat_ctx, device_param->opencl_d_hooks);
+      if (device_param->opencl_d_result)         hc_clReleaseMemObject (hashcat_ctx, device_param->opencl_d_result);
+      if (device_param->opencl_d_extra0_buf)     hc_clReleaseMemObject (hashcat_ctx, device_param->opencl_d_extra0_buf);
+      if (device_param->opencl_d_extra1_buf)     hc_clReleaseMemObject (hashcat_ctx, device_param->opencl_d_extra1_buf);
+      if (device_param->opencl_d_extra2_buf)     hc_clReleaseMemObject (hashcat_ctx, device_param->opencl_d_extra2_buf);
+      if (device_param->opencl_d_extra3_buf)     hc_clReleaseMemObject (hashcat_ctx, device_param->opencl_d_extra3_buf);
+      if (device_param->opencl_d_root_css_buf)   hc_clReleaseMemObject (hashcat_ctx, device_param->opencl_d_root_css_buf);
+      if (device_param->opencl_d_markov_css_buf) hc_clReleaseMemObject (hashcat_ctx, device_param->opencl_d_markov_css_buf);
+      if (device_param->opencl_d_tm_c)           hc_clReleaseMemObject (hashcat_ctx, device_param->opencl_d_tm_c);
+      if (device_param->opencl_d_st_digests_buf) hc_clReleaseMemObject (hashcat_ctx, device_param->opencl_d_st_digests_buf);
+      if (device_param->opencl_d_st_salts_buf)   hc_clReleaseMemObject (hashcat_ctx, device_param->opencl_d_st_salts_buf);
+      if (device_param->opencl_d_st_esalts_buf)  hc_clReleaseMemObject (hashcat_ctx, device_param->opencl_d_st_esalts_buf);
+
+      if (device_param->opencl_kernel1)          hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel1);
+      if (device_param->opencl_kernel12)         hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel12);
+      if (device_param->opencl_kernel2)          hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel2);
+      if (device_param->opencl_kernel23)         hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel23);
+      if (device_param->opencl_kernel3)          hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel3);
+      if (device_param->opencl_kernel4)          hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel4);
+      if (device_param->opencl_kernel_init2)     hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel_init2);
+      if (device_param->opencl_kernel_loop2)     hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel_loop2);
+      if (device_param->opencl_kernel_mp)        hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel_mp);
+      if (device_param->opencl_kernel_mp_l)      hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel_mp_l);
+      if (device_param->opencl_kernel_mp_r)      hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel_mp_r);
+      if (device_param->opencl_kernel_tm)        hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel_tm);
+      if (device_param->opencl_kernel_amp)       hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel_amp);
+      if (device_param->opencl_kernel_memset)    hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel_memset);
+      if (device_param->opencl_kernel_atinit)    hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel_atinit);
+      if (device_param->opencl_kernel_decompress)hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel_decompress);
+      if (device_param->opencl_kernel_aux1)      hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel_aux1);
+      if (device_param->opencl_kernel_aux2)      hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel_aux2);
+      if (device_param->opencl_kernel_aux3)      hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel_aux3);
+      if (device_param->opencl_kernel_aux4)      hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel_aux4);
+
+      if (device_param->opencl_program)          hc_clReleaseProgram (hashcat_ctx, device_param->opencl_program);
+      if (device_param->opencl_program_mp)       hc_clReleaseProgram (hashcat_ctx, device_param->opencl_program_mp);
+      if (device_param->opencl_program_amp)      hc_clReleaseProgram (hashcat_ctx, device_param->opencl_program_amp);
+
+      if (device_param->opencl_command_queue)    hc_clReleaseCommandQueue (hashcat_ctx, device_param->opencl_command_queue);
+
+      if (device_param->opencl_context)          hc_clReleaseContext (hashcat_ctx, device_param->opencl_context);
+
+      device_param->opencl_d_pws_buf           = NULL;
+      device_param->opencl_d_pws_amp_buf       = NULL;
+      device_param->opencl_d_pws_comp_buf      = NULL;
+      device_param->opencl_d_pws_idx           = NULL;
+      device_param->opencl_d_rules             = NULL;
+      device_param->opencl_d_rules_c           = NULL;
+      device_param->opencl_d_combs             = NULL;
+      device_param->opencl_d_combs_c           = NULL;
+      device_param->opencl_d_bfs               = NULL;
+      device_param->opencl_d_bfs_c             = NULL;
+      device_param->opencl_d_bitmap_s1_a       = NULL;
+      device_param->opencl_d_bitmap_s1_b       = NULL;
+      device_param->opencl_d_bitmap_s1_c       = NULL;
+      device_param->opencl_d_bitmap_s1_d       = NULL;
+      device_param->opencl_d_bitmap_s2_a       = NULL;
+      device_param->opencl_d_bitmap_s2_b       = NULL;
+      device_param->opencl_d_bitmap_s2_c       = NULL;
+      device_param->opencl_d_bitmap_s2_d       = NULL;
+      device_param->opencl_d_plain_bufs        = NULL;
+      device_param->opencl_d_digests_buf       = NULL;
+      device_param->opencl_d_digests_shown     = NULL;
+      device_param->opencl_d_salt_bufs         = NULL;
+      device_param->opencl_d_esalt_bufs        = NULL;
+      device_param->opencl_d_tmps              = NULL;
+      device_param->opencl_d_hooks             = NULL;
+      device_param->opencl_d_result            = NULL;
+      device_param->opencl_d_extra0_buf        = NULL;
+      device_param->opencl_d_extra1_buf        = NULL;
+      device_param->opencl_d_extra2_buf        = NULL;
+      device_param->opencl_d_extra3_buf        = NULL;
+      device_param->opencl_d_root_css_buf      = NULL;
+      device_param->opencl_d_markov_css_buf    = NULL;
+      device_param->opencl_d_tm_c              = NULL;
+      device_param->opencl_d_st_digests_buf    = NULL;
+      device_param->opencl_d_st_salts_buf      = NULL;
+      device_param->opencl_d_st_esalts_buf     = NULL;
+      device_param->opencl_kernel1             = NULL;
+      device_param->opencl_kernel12            = NULL;
+      device_param->opencl_kernel2             = NULL;
+      device_param->opencl_kernel23            = NULL;
+      device_param->opencl_kernel3             = NULL;
+      device_param->opencl_kernel4             = NULL;
+      device_param->opencl_kernel_init2        = NULL;
+      device_param->opencl_kernel_loop2        = NULL;
+      device_param->opencl_kernel_mp           = NULL;
+      device_param->opencl_kernel_mp_l         = NULL;
+      device_param->opencl_kernel_mp_r         = NULL;
+      device_param->opencl_kernel_tm           = NULL;
+      device_param->opencl_kernel_amp          = NULL;
+      device_param->opencl_kernel_memset       = NULL;
+      device_param->opencl_kernel_atinit       = NULL;
+      device_param->opencl_kernel_decompress   = NULL;
+      device_param->opencl_kernel_aux1         = NULL;
+      device_param->opencl_kernel_aux2         = NULL;
+      device_param->opencl_kernel_aux3         = NULL;
+      device_param->opencl_kernel_aux4         = NULL;
+      device_param->opencl_program             = NULL;
+      device_param->opencl_program_mp          = NULL;
+      device_param->opencl_program_amp         = NULL;
+      device_param->opencl_command_queue       = NULL;
+      device_param->opencl_context             = NULL;
+    }
+
+    device_param->pws_comp            = NULL;
+    device_param->pws_idx             = NULL;
+    device_param->pws_pre_buf         = NULL;
+    device_param->pws_base_buf        = NULL;
+    device_param->combs_buf           = NULL;
+    device_param->hooks_buf           = NULL;
+    device_param->scratch_buf         = NULL;
+    #ifdef WITH_BRAIN
+    device_param->brain_link_in_buf   = NULL;
+    device_param->brain_link_out_buf  = NULL;
+    #endif
+  }
+}
+
+void backend_session_reset (hashcat_ctx_t *hashcat_ctx)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  if (backend_ctx->enabled == false) return;
+
+  for (int backend_devices_idx = 0; backend_devices_idx < backend_ctx->backend_devices_cnt; backend_devices_idx++)
+  {
+    hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
+
+    if (device_param->skipped == true) continue;
+
+    device_param->speed_pos = 0;
+
+    memset (device_param->speed_cnt,  0, SPEED_CACHE * sizeof (u64));
+    memset (device_param->speed_msec, 0, SPEED_CACHE * sizeof (double));
+
+    device_param->speed_only_finish = false;
+
+    device_param->exec_pos = 0;
+
+    memset (device_param->exec_msec, 0, EXEC_CACHE * sizeof (double));
+
+    device_param->outerloop_msec = 0;
+    device_param->outerloop_pos  = 0;
+    device_param->outerloop_left = 0;
+    device_param->innerloop_pos  = 0;
+    device_param->innerloop_left = 0;
+
+    // some more resets:
+
+    if (device_param->pws_comp) memset (device_param->pws_comp, 0, device_param->size_pws_comp);
+    if (device_param->pws_idx)  memset (device_param->pws_idx,  0, device_param->size_pws_idx);
+
+    device_param->pws_cnt = 0;
+
+    device_param->words_off  = 0;
+    device_param->words_done = 0;
+
+    #if defined (_WIN)
+    device_param->timer_speed.QuadPart = 0;
+    #else
+    device_param->timer_speed.tv_sec = 0;
+    #endif
+  }
+
+  backend_ctx->kernel_power_all   = 0;
+  backend_ctx->kernel_power_final = 0;
+}
+
+int backend_session_update_combinator (hashcat_ctx_t *hashcat_ctx)
+{
+  combinator_ctx_t *combinator_ctx = hashcat_ctx->combinator_ctx;
+  hashconfig_t     *hashconfig     = hashcat_ctx->hashconfig;
+  backend_ctx_t    *backend_ctx    = hashcat_ctx->backend_ctx;
+  user_options_t   *user_options   = hashcat_ctx->user_options;
+
+  if (backend_ctx->enabled == false) return 0;
+
+  for (int backend_devices_idx = 0; backend_devices_idx < backend_ctx->backend_devices_cnt; backend_devices_idx++)
+  {
+    hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
+
+    if (device_param->skipped == true) continue;
+
+    if (device_param->skipped_warning == true) continue;
+
+    // kernel_params
+
+    device_param->kernel_params_buf32[33] = combinator_ctx->combs_mode;
+
+    /*
+    if (device_param->is_opencl == true)
+    {
+      CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel1, 33, sizeof (cl_uint), device_param->kernel_params[33]); if (CL_rc == -1) return -1;
+      CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel2, 33, sizeof (cl_uint), device_param->kernel_params[33]); if (CL_rc == -1) return -1;
+      CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel3, 33, sizeof (cl_uint), device_param->kernel_params[33]); if (CL_rc == -1) return -1;
+      CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel4, 33, sizeof (cl_uint), device_param->kernel_params[33]); if (CL_rc == -1) return -1;
+
+      if (hashconfig->opts_type & OPTS_TYPE_HOOK12) { CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel12,     33, sizeof (cl_uint), device_param->kernel_params[33]); if (CL_rc == -1) return -1; }
+      if (hashconfig->opts_type & OPTS_TYPE_HOOK23) { CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel23,     33, sizeof (cl_uint), device_param->kernel_params[33]); if (CL_rc == -1) return -1; }
+      if (hashconfig->opts_type & OPTS_TYPE_INIT2)  { CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_init2, 33, sizeof (cl_uint), device_param->kernel_params[33]); if (CL_rc == -1) return -1; }
+      if (hashconfig->opts_type & OPTS_TYPE_LOOP2)  { CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_loop2, 33, sizeof (cl_uint), device_param->kernel_params[33]); if (CL_rc == -1) return -1; }
+    }
+    */
+
+    // kernel_params_amp
+
+    if (user_options->slow_candidates == true)
+    {
+    }
+    else
+    {
+      device_param->kernel_params_amp_buf32[5] = combinator_ctx->combs_mode;
+
+      if (hashconfig->attack_exec == ATTACK_EXEC_OUTSIDE_KERNEL)
+      {
+        if (device_param->is_opencl == true)
+        {
+          const int rc_clSetKernelArg = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_amp, 5, sizeof (cl_uint), device_param->kernel_params_amp[5]);
+
+          if (rc_clSetKernelArg == -1) return -1;
+        }
+      }
+    }
+  }
+
+  return 0;
+}
+
+int backend_session_update_mp (hashcat_ctx_t *hashcat_ctx)
+{
+  mask_ctx_t     *mask_ctx     = hashcat_ctx->mask_ctx;
+  backend_ctx_t  *backend_ctx  = hashcat_ctx->backend_ctx;
+  user_options_t *user_options = hashcat_ctx->user_options;
+
+  if (backend_ctx->enabled == false) return 0;
+
+  if (user_options->slow_candidates == true) return 0;
+
+  for (int backend_devices_idx = 0; backend_devices_idx < backend_ctx->backend_devices_cnt; backend_devices_idx++)
+  {
+    hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
+
+    if (device_param->skipped == true) continue;
+
+    if (device_param->skipped_warning == true) continue;
+
+    device_param->kernel_params_mp_buf64[3] = 0;
+    device_param->kernel_params_mp_buf32[4] = mask_ctx->css_cnt;
+
+    if (device_param->is_cuda == true)
+    {
+      int CU_rc;
+
+      //for (u32 i = 3; i < 4; i++) { CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_mp, i, sizeof (cl_ulong), device_param->kernel_params_mp[i]); if (CL_rc == -1) return -1; }
+      //for (u32 i = 4; i < 8; i++) { CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_mp, i, sizeof (cl_uint),  device_param->kernel_params_mp[i]); if (CL_rc == -1) return -1; }
+
+      CU_rc = hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_root_css_buf,   mask_ctx->root_css_buf,   device_param->size_root_css);   if (CU_rc == -1) return -1;
+      CU_rc = hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_markov_css_buf, mask_ctx->markov_css_buf, device_param->size_markov_css); if (CU_rc == -1) return -1;
+    }
+
+    if (device_param->is_opencl == true)
+    {
+      int CL_rc;
+
+      for (u32 i = 3; i < 4; i++) { CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_mp, i, sizeof (cl_ulong), device_param->kernel_params_mp[i]); if (CL_rc == -1) return -1; }
+      for (u32 i = 4; i < 8; i++) { CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_mp, i, sizeof (cl_uint),  device_param->kernel_params_mp[i]); if (CL_rc == -1) return -1; }
+
+      CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_root_css_buf,   CL_TRUE, 0, device_param->size_root_css,   mask_ctx->root_css_buf,   0, NULL, NULL); if (CL_rc == -1) return -1;
+      CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_markov_css_buf, CL_TRUE, 0, device_param->size_markov_css, mask_ctx->markov_css_buf, 0, NULL, NULL); if (CL_rc == -1) return -1;
+    }
+  }
+
+  return 0;
+}
+
+int backend_session_update_mp_rl (hashcat_ctx_t *hashcat_ctx, const u32 css_cnt_l, const u32 css_cnt_r)
+{
+  mask_ctx_t     *mask_ctx     = hashcat_ctx->mask_ctx;
+  backend_ctx_t   *backend_ctx   = hashcat_ctx->backend_ctx;
+  user_options_t *user_options = hashcat_ctx->user_options;
+
+  if (backend_ctx->enabled == false) return 0;
+
+  if (user_options->slow_candidates == true) return 0;
+
+  for (int backend_devices_idx = 0; backend_devices_idx < backend_ctx->backend_devices_cnt; backend_devices_idx++)
+  {
+    hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
+
+    if (device_param->skipped == true) continue;
+
+    if (device_param->skipped_warning == true) continue;
+
+    device_param->kernel_params_mp_l_buf64[3] = 0;
+    device_param->kernel_params_mp_l_buf32[4] = css_cnt_l;
+    device_param->kernel_params_mp_l_buf32[5] = css_cnt_r;
+
+    device_param->kernel_params_mp_r_buf64[3] = 0;
+    device_param->kernel_params_mp_r_buf32[4] = css_cnt_r;
+
+    if (device_param->is_cuda == true)
+    {
+      int CU_rc;
+
+      //for (u32 i = 3; i < 4; i++) { CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_mp_l, i, sizeof (cl_ulong), device_param->kernel_params_mp_l[i]); if (CL_rc == -1) return -1; }
+      //for (u32 i = 4; i < 8; i++) { CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_mp_l, i, sizeof (cl_uint),  device_param->kernel_params_mp_l[i]); if (CL_rc == -1) return -1; }
+      //for (u32 i = 9; i < 9; i++) { CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_mp_l, i, sizeof (cl_ulong), device_param->kernel_params_mp_l[i]); if (CL_rc == -1) return -1; }
+
+      //for (u32 i = 3; i < 4; i++) { CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_mp_r, i, sizeof (cl_ulong), device_param->kernel_params_mp_r[i]); if (CL_rc == -1) return -1; }
+      //for (u32 i = 4; i < 7; i++) { CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_mp_r, i, sizeof (cl_uint),  device_param->kernel_params_mp_r[i]); if (CL_rc == -1) return -1; }
+      //for (u32 i = 8; i < 8; i++) { CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_mp_r, i, sizeof (cl_ulong), device_param->kernel_params_mp_r[i]); if (CL_rc == -1) return -1; }
+
+      CU_rc = hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_root_css_buf,   mask_ctx->root_css_buf,   device_param->size_root_css);   if (CU_rc == -1) return -1;
+      CU_rc = hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_markov_css_buf, mask_ctx->markov_css_buf, device_param->size_markov_css); if (CU_rc == -1) return -1;
+    }
+
+    if (device_param->is_opencl == true)
+    {
+      int CL_rc = CL_SUCCESS;
+
+      for (u32 i = 3; i < 4; i++) { CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_mp_l, i, sizeof (cl_ulong), device_param->kernel_params_mp_l[i]); if (CL_rc == -1) return -1; }
+      for (u32 i = 4; i < 8; i++) { CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_mp_l, i, sizeof (cl_uint),  device_param->kernel_params_mp_l[i]); if (CL_rc == -1) return -1; }
+      for (u32 i = 9; i < 9; i++) { CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_mp_l, i, sizeof (cl_ulong), device_param->kernel_params_mp_l[i]); if (CL_rc == -1) return -1; }
+
+      for (u32 i = 3; i < 4; i++) { CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_mp_r, i, sizeof (cl_ulong), device_param->kernel_params_mp_r[i]); if (CL_rc == -1) return -1; }
+      for (u32 i = 4; i < 7; i++) { CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_mp_r, i, sizeof (cl_uint),  device_param->kernel_params_mp_r[i]); if (CL_rc == -1) return -1; }
+      for (u32 i = 8; i < 8; i++) { CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_mp_r, i, sizeof (cl_ulong), device_param->kernel_params_mp_r[i]); if (CL_rc == -1) return -1; }
+
+      CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_root_css_buf,   CL_TRUE, 0, device_param->size_root_css,   mask_ctx->root_css_buf,   0, NULL, NULL); if (CL_rc == -1) return -1;
+      CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_markov_css_buf, CL_TRUE, 0, device_param->size_markov_css, mask_ctx->markov_css_buf, 0, NULL, NULL); if (CL_rc == -1) return -1;
+    }
+  }
+
+  return 0;
+}
diff --git a/src/bitmap.c b/src/bitmap.c
index 66f9ebd0f..6b1da1362 100644
--- a/src/bitmap.c
+++ b/src/bitmap.c
@@ -82,7 +82,7 @@ int bitmap_ctx_init (hashcat_ctx_t *hashcat_ctx)
   if (user_options->example_hashes == true) return 0;
   if (user_options->keyspace       == true) return 0;
   if (user_options->left           == true) return 0;
-  if (user_options->opencl_info    == true) return 0;
+  if (user_options->backend_info   == true) return 0;
   if (user_options->show           == true) return 0;
   if (user_options->usage          == true) return 0;
   if (user_options->version        == true) return 0;
diff --git a/src/combinator.c b/src/combinator.c
index ccf7b4792..a0c3f16eb 100644
--- a/src/combinator.c
+++ b/src/combinator.c
@@ -21,7 +21,7 @@ int combinator_ctx_init (hashcat_ctx_t *hashcat_ctx)
 
   if (user_options->example_hashes == true) return 0;
   if (user_options->left           == true) return 0;
-  if (user_options->opencl_info    == true) return 0;
+  if (user_options->backend_info   == true) return 0;
   if (user_options->show           == true) return 0;
   if (user_options->usage          == true) return 0;
   if (user_options->version        == true) return 0;
diff --git a/src/convert.c b/src/convert.c
index c9426f617..bed989ae4 100644
--- a/src/convert.c
+++ b/src/convert.c
@@ -845,7 +845,7 @@ u8 v8a_from_v32 (const u32 v32)
 
   v.v32 = v32;
 
-  return v.v8a;
+  return v.v8.a;
 }
 
 u8 v8b_from_v32 (const u32 v32)
@@ -854,7 +854,7 @@ u8 v8b_from_v32 (const u32 v32)
 
   v.v32 = v32;
 
-  return v.v8b;
+  return v.v8.b;
 }
 
 u8 v8c_from_v32 (const u32 v32)
@@ -863,7 +863,7 @@ u8 v8c_from_v32 (const u32 v32)
 
   v.v32 = v32;
 
-  return v.v8c;
+  return v.v8.c;
 }
 
 u8 v8d_from_v32 (const u32 v32)
@@ -872,7 +872,7 @@ u8 v8d_from_v32 (const u32 v32)
 
   v.v32 = v32;
 
-  return v.v8d;
+  return v.v8.d;
 }
 
 u16 v16a_from_v32 (const u32 v32)
@@ -881,7 +881,7 @@ u16 v16a_from_v32 (const u32 v32)
 
   v.v32 = v32;
 
-  return v.v16a;
+  return v.v16.a;
 }
 
 u16 v16b_from_v32 (const u32 v32)
@@ -890,15 +890,15 @@ u16 v16b_from_v32 (const u32 v32)
 
   v.v32 = v32;
 
-  return v.v16b;
+  return v.v16.b;
 }
 
 u32 v32_from_v16ab (const u16 v16a, const u16 v16b)
 {
   vconv32_t v;
 
-  v.v16a = v16a;
-  v.v16b = v16b;
+  v.v16.a = v16a;
+  v.v16.b = v16b;
 
   return v.v32;
 }
@@ -909,7 +909,7 @@ u32 v32a_from_v64 (const u64 v64)
 
   v.v64 = v64;
 
-  return v.v32a;
+  return v.v32.a;
 }
 
 u32 v32b_from_v64 (const u64 v64)
@@ -918,15 +918,15 @@ u32 v32b_from_v64 (const u64 v64)
 
   v.v64 = v64;
 
-  return v.v32b;
+  return v.v32.b;
 }
 
 u64 v64_from_v32ab (const u32 v32a, const u32 v32b)
 {
   vconv64_t v;
 
-  v.v32a = v32a;
-  v.v32b = v32b;
+  v.v32.a = v32a;
+  v.v32.b = v32b;
 
   return v.v64;
 }
diff --git a/src/cpt.c b/src/cpt.c
index 90ce8728d..72db45415 100644
--- a/src/cpt.c
+++ b/src/cpt.c
@@ -18,7 +18,7 @@ int cpt_ctx_init (hashcat_ctx_t *hashcat_ctx)
   if (user_options->example_hashes == true) return 0;
   if (user_options->keyspace       == true) return 0;
   if (user_options->left           == true) return 0;
-  if (user_options->opencl_info    == true) return 0;
+  if (user_options->backend_info   == true) return 0;
   if (user_options->show           == true) return 0;
   if (user_options->usage          == true) return 0;
   if (user_options->version        == true) return 0;
diff --git a/src/debugfile.c b/src/debugfile.c
index f6519534e..5e0a62bbe 100644
--- a/src/debugfile.c
+++ b/src/debugfile.c
@@ -90,7 +90,7 @@ int debugfile_init (hashcat_ctx_t *hashcat_ctx)
   if (user_options->example_hashes == true) return 0;
   if (user_options->keyspace       == true) return 0;
   if (user_options->left           == true) return 0;
-  if (user_options->opencl_info    == true) return 0;
+  if (user_options->backend_info   == true) return 0;
   if (user_options->show           == true) return 0;
   if (user_options->stdout_flag    == true) return 0;
   if (user_options->speed_only     == true) return 0;
diff --git a/src/dictstat.c b/src/dictstat.c
index 628c15187..7ee0e4297 100644
--- a/src/dictstat.c
+++ b/src/dictstat.c
@@ -56,7 +56,7 @@ int dictstat_init (hashcat_ctx_t *hashcat_ctx)
   if (user_options->example_hashes == true) return 0;
   if (user_options->keyspace       == true) return 0;
   if (user_options->left           == true) return 0;
-  if (user_options->opencl_info    == true) return 0;
+  if (user_options->backend_info   == true) return 0;
   if (user_options->show           == true) return 0;
   if (user_options->usage          == true) return 0;
   if (user_options->version        == true) return 0;
diff --git a/src/dispatch.c b/src/dispatch.c
index e43a23ad0..6f841124a 100644
--- a/src/dispatch.c
+++ b/src/dispatch.c
@@ -7,7 +7,7 @@
 #include "types.h"
 #include "event.h"
 #include "memory.h"
-#include "opencl.h"
+#include "backend.h"
 #include "wordlist.h"
 #include "shared.h"
 #include "thread.h"
@@ -23,13 +23,13 @@
 
 static u64 get_highest_words_done (const hashcat_ctx_t *hashcat_ctx)
 {
-  const opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
+  const backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
 
   u64 words_cur = 0;
 
-  for (u32 device_id = 0; device_id < opencl_ctx->devices_cnt; device_id++)
+  for (int backend_devices_idx = 0; backend_devices_idx < backend_ctx->backend_devices_cnt; backend_devices_idx++)
   {
-    hc_device_param_t *device_param = &opencl_ctx->devices_param[device_id];
+    hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
 
     if (device_param->skipped == true) continue;
 
@@ -45,13 +45,13 @@ static u64 get_highest_words_done (const hashcat_ctx_t *hashcat_ctx)
 
 static u64 get_lowest_words_done (const hashcat_ctx_t *hashcat_ctx)
 {
-  const opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
+  const backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
 
   u64 words_cur = 0xffffffffffffffff;
 
-  for (u32 device_id = 0; device_id < opencl_ctx->devices_cnt; device_id++)
+  for (int backend_devices_idx = 0; backend_devices_idx < backend_ctx->backend_devices_cnt; backend_devices_idx++)
   {
-    hc_device_param_t *device_param = &opencl_ctx->devices_param[device_id];
+    hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
 
     if (device_param->skipped == true) continue;
 
@@ -76,20 +76,20 @@ static int set_kernel_power_final (hashcat_ctx_t *hashcat_ctx, const u64 kernel_
 {
   EVENT (EVENT_SET_KERNEL_POWER_FINAL);
 
-  opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
 
-  opencl_ctx->kernel_power_final = kernel_power_final;
+  backend_ctx->kernel_power_final = kernel_power_final;
 
   return 0;
 }
 
-static u64 get_power (opencl_ctx_t *opencl_ctx, hc_device_param_t *device_param)
+static u64 get_power (backend_ctx_t *backend_ctx, hc_device_param_t *device_param)
 {
-  const u64 kernel_power_final = opencl_ctx->kernel_power_final;
+  const u64 kernel_power_final = backend_ctx->kernel_power_final;
 
   if (kernel_power_final)
   {
-    const double device_factor = (double) device_param->hardware_power / opencl_ctx->hardware_power_all;
+    const double device_factor = (double) device_param->hardware_power / backend_ctx->hardware_power_all;
 
     const u64 words_left_device = (u64) CEIL (kernel_power_final * device_factor);
 
@@ -109,7 +109,7 @@ static u64 get_power (opencl_ctx_t *opencl_ctx, hc_device_param_t *device_param)
 
 static u64 get_work (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u64 max)
 {
-  opencl_ctx_t   *opencl_ctx   = hashcat_ctx->opencl_ctx;
+  backend_ctx_t  *backend_ctx  = hashcat_ctx->backend_ctx;
   status_ctx_t   *status_ctx   = hashcat_ctx->status_ctx;
   user_options_t *user_options = hashcat_ctx->user_options;
 
@@ -120,19 +120,19 @@ static u64 get_work (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
 
   device_param->words_off = words_off;
 
-  const u64 kernel_power_all = opencl_ctx->kernel_power_all;
+  const u64 kernel_power_all = backend_ctx->kernel_power_all;
 
   const u64 words_left = words_base - words_off;
 
   if (words_left < kernel_power_all)
   {
-    if (opencl_ctx->kernel_power_final == 0)
+    if (backend_ctx->kernel_power_final == 0)
     {
       set_kernel_power_final (hashcat_ctx, words_left);
     }
   }
 
-  const u64 kernel_power = get_power (opencl_ctx, device_param);
+  const u64 kernel_power = get_power (backend_ctx, device_param);
 
   u64 work = MIN (words_left, kernel_power);
 
@@ -339,16 +339,23 @@ HC_API_CALL void *thread_calc_stdin (void *p)
 
   hashcat_ctx_t *hashcat_ctx = thread_param->hashcat_ctx;
 
-  opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
 
-  if (opencl_ctx->enabled == false) return NULL;
+  if (backend_ctx->enabled == false) return NULL;
 
-  hc_device_param_t *device_param = opencl_ctx->devices_param + thread_param->tid;
+  hc_device_param_t *device_param = backend_ctx->devices_param + thread_param->tid;
 
   if (device_param->skipped) return NULL;
 
   if (device_param->skipped_warning == true) return NULL;
 
+  if (device_param->is_cuda == true)
+  {
+    const int rc_cuCtxSetCurrent = hc_cuCtxSetCurrent (hashcat_ctx, device_param->cuda_context);
+
+    if (rc_cuCtxSetCurrent == -1) return NULL;
+  }
+
   const int rc_calc = calc_stdin (hashcat_ctx, device_param);
 
   if (rc_calc == -1)
@@ -370,7 +377,7 @@ static int calc (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param)
   mask_ctx_t           *mask_ctx           = hashcat_ctx->mask_ctx;
   straight_ctx_t       *straight_ctx       = hashcat_ctx->straight_ctx;
   combinator_ctx_t     *combinator_ctx     = hashcat_ctx->combinator_ctx;
-  opencl_ctx_t         *opencl_ctx         = hashcat_ctx->opencl_ctx;
+  backend_ctx_t        *backend_ctx        = hashcat_ctx->backend_ctx;
   status_ctx_t         *status_ctx         = hashcat_ctx->status_ctx;
 
   const u32 attack_mode = user_options->attack_mode;
@@ -468,7 +475,7 @@ static int calc (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param)
 
         // this greatly reduces spam on hashcat console
 
-        const u64 pre_rejects_ignore = get_power (opencl_ctx, device_param) / 2;
+        const u64 pre_rejects_ignore = get_power (backend_ctx, device_param) / 2;
 
         while (pre_rejects > pre_rejects_ignore)
         {
@@ -801,7 +808,7 @@ static int calc (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param)
 
         // this greatly reduces spam on hashcat console
 
-        const u64 pre_rejects_ignore = get_power (opencl_ctx, device_param) / 2;
+        const u64 pre_rejects_ignore = get_power (backend_ctx, device_param) / 2;
 
         while (pre_rejects > pre_rejects_ignore)
         {
@@ -1082,7 +1089,7 @@ static int calc (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param)
 
         // this greatly reduces spam on hashcat console
 
-        const u64 pre_rejects_ignore = get_power (opencl_ctx, device_param) / 2;
+        const u64 pre_rejects_ignore = get_power (backend_ctx, device_param) / 2;
 
         while (pre_rejects > pre_rejects_ignore)
         {
@@ -1658,16 +1665,23 @@ HC_API_CALL void *thread_calc (void *p)
 
   hashcat_ctx_t *hashcat_ctx = thread_param->hashcat_ctx;
 
-  opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
 
-  if (opencl_ctx->enabled == false) return NULL;
+  if (backend_ctx->enabled == false) return NULL;
 
-  hc_device_param_t *device_param = opencl_ctx->devices_param + thread_param->tid;
+  hc_device_param_t *device_param = backend_ctx->devices_param + thread_param->tid;
 
   if (device_param->skipped) return NULL;
 
   if (device_param->skipped_warning == true) return NULL;
 
+  if (device_param->is_cuda == true)
+  {
+    const int rc_cuCtxSetCurrent = hc_cuCtxSetCurrent (hashcat_ctx, device_param->cuda_context);
+
+    if (rc_cuCtxSetCurrent == -1) return NULL;
+  }
+
   const int rc_calc = calc (hashcat_ctx, device_param);
 
   if (rc_calc == -1)
diff --git a/src/emu_inc_platform.c b/src/emu_inc_platform.c
new file mode 100644
index 000000000..f390abf0a
--- /dev/null
+++ b/src/emu_inc_platform.c
@@ -0,0 +1,11 @@
+/**
+ * Author......: See docs/credits.txt
+ * License.....: MIT
+ */
+
+#include "common.h"
+#include "types.h"
+#include "bitops.h"
+#include "emu_general.h"
+
+#include "inc_platform.cl"
diff --git a/src/ext_cuda.c b/src/ext_cuda.c
new file mode 100644
index 000000000..dc43e1b61
--- /dev/null
+++ b/src/ext_cuda.c
@@ -0,0 +1,8 @@
+/**
+ * Author......: See docs/credits.txt
+ * License.....: MIT
+ */
+
+#include "common.h"
+#include "types.h"
+#include "ext_cuda.h"
diff --git a/src/ext_nvrtc.c b/src/ext_nvrtc.c
new file mode 100644
index 000000000..634caac5d
--- /dev/null
+++ b/src/ext_nvrtc.c
@@ -0,0 +1,27 @@
+/**
+ * Author......: See docs/credits.txt
+ * License.....: MIT
+ */
+
+#include "common.h"
+#include "types.h"
+#include "ext_nvrtc.h"
+
+int nvrtc_make_options_array_from_string (char *string, char **options)
+{
+  char *saveptr = NULL;
+
+  char *next = strtok_r (string, " ", &saveptr);
+
+  int cnt = 0;
+
+  do
+  {
+    options[cnt] = next;
+
+    cnt++;
+
+  } while ((next = strtok_r ((char *) NULL, " ", &saveptr)) != NULL);
+
+  return cnt;
+}
\ No newline at end of file
diff --git a/src/hashcat.c b/src/hashcat.c
index b45319520..667b7eecc 100644
--- a/src/hashcat.c
+++ b/src/hashcat.c
@@ -34,7 +34,7 @@
 #include "loopback.h"
 #include "monitor.h"
 #include "mpsp.h"
-#include "opencl.h"
+#include "backend.h"
 #include "outfile_check.h"
 #include "outfile.h"
 #include "pidfile.h"
@@ -59,7 +59,7 @@ static int inner2_loop (hashcat_ctx_t *hashcat_ctx)
   hashes_t             *hashes              = hashcat_ctx->hashes;
   induct_ctx_t         *induct_ctx          = hashcat_ctx->induct_ctx;
   logfile_ctx_t        *logfile_ctx         = hashcat_ctx->logfile_ctx;
-  opencl_ctx_t         *opencl_ctx          = hashcat_ctx->opencl_ctx;
+  backend_ctx_t        *backend_ctx         = hashcat_ctx->backend_ctx;
   restore_ctx_t        *restore_ctx         = hashcat_ctx->restore_ctx;
   status_ctx_t         *status_ctx          = hashcat_ctx->status_ctx;
   user_options_extra_t *user_options_extra  = hashcat_ctx->user_options_extra;
@@ -109,7 +109,7 @@ static int inner2_loop (hashcat_ctx_t *hashcat_ctx)
     user_options->skip = 0;
   }
 
-  opencl_session_reset (hashcat_ctx);
+  backend_session_reset (hashcat_ctx);
 
   cpt_ctx_reset (hashcat_ctx);
 
@@ -174,15 +174,15 @@ static int inner2_loop (hashcat_ctx_t *hashcat_ctx)
    * this is required for autotune
    */
 
-  opencl_ctx_devices_kernel_loops (hashcat_ctx);
+  backend_ctx_devices_kernel_loops (hashcat_ctx);
 
   /**
    * prepare thread buffers
    */
 
-  thread_param_t *threads_param = (thread_param_t *) hccalloc (opencl_ctx->devices_cnt, sizeof (thread_param_t));
+  thread_param_t *threads_param = (thread_param_t *) hccalloc (backend_ctx->backend_devices_cnt, sizeof (thread_param_t));
 
-  hc_thread_t *c_threads = (hc_thread_t *) hccalloc (opencl_ctx->devices_cnt, sizeof (hc_thread_t));
+  hc_thread_t *c_threads = (hc_thread_t *) hccalloc (backend_ctx->backend_devices_cnt, sizeof (hc_thread_t));
 
   /**
    * create autotune threads
@@ -192,31 +192,31 @@ static int inner2_loop (hashcat_ctx_t *hashcat_ctx)
 
   status_ctx->devices_status = STATUS_AUTOTUNE;
 
-  for (u32 device_id = 0; device_id < opencl_ctx->devices_cnt; device_id++)
+  for (int backend_devices_idx = 0; backend_devices_idx < backend_ctx->backend_devices_cnt; backend_devices_idx++)
   {
-    thread_param_t *thread_param = threads_param + device_id;
+    thread_param_t *thread_param = threads_param + backend_devices_idx;
 
     thread_param->hashcat_ctx = hashcat_ctx;
-    thread_param->tid         = device_id;
+    thread_param->tid         = backend_devices_idx;
 
-    hc_thread_create (c_threads[device_id], thread_autotune, thread_param);
+    hc_thread_create (c_threads[backend_devices_idx], thread_autotune, thread_param);
   }
 
-  hc_thread_wait (opencl_ctx->devices_cnt, c_threads);
+  hc_thread_wait (backend_ctx->backend_devices_cnt, c_threads);
 
   EVENT (EVENT_AUTOTUNE_FINISHED);
 
   /**
-   * find same opencl devices and equal results
+   * find same backend devices and equal results
    */
 
-  opencl_ctx_devices_sync_tuning (hashcat_ctx);
+  backend_ctx_devices_sync_tuning (hashcat_ctx);
 
   /**
-   * autotune modified kernel_accel, which modifies opencl_ctx->kernel_power_all
+   * autotune modified kernel_accel, which modifies backend_ctx->kernel_power_all
    */
 
-  opencl_ctx_devices_update_power (hashcat_ctx);
+  backend_ctx_devices_update_power (hashcat_ctx);
 
   /**
    * Begin loopback recording
@@ -249,24 +249,24 @@ static int inner2_loop (hashcat_ctx_t *hashcat_ctx)
 
   status_ctx->accessible = true;
 
-  for (u32 device_id = 0; device_id < opencl_ctx->devices_cnt; device_id++)
+  for (int backend_devices_idx = 0; backend_devices_idx < backend_ctx->backend_devices_cnt; backend_devices_idx++)
   {
-    thread_param_t *thread_param = threads_param + device_id;
+    thread_param_t *thread_param = threads_param + backend_devices_idx;
 
     thread_param->hashcat_ctx = hashcat_ctx;
-    thread_param->tid         = device_id;
+    thread_param->tid         = backend_devices_idx;
 
     if (user_options_extra->wordlist_mode == WL_MODE_STDIN)
     {
-      hc_thread_create (c_threads[device_id], thread_calc_stdin, thread_param);
+      hc_thread_create (c_threads[backend_devices_idx], thread_calc_stdin, thread_param);
     }
     else
     {
-      hc_thread_create (c_threads[device_id], thread_calc, thread_param);
+      hc_thread_create (c_threads[backend_devices_idx], thread_calc, thread_param);
     }
   }
 
-  hc_thread_wait (opencl_ctx->devices_cnt, c_threads);
+  hc_thread_wait (backend_ctx->backend_devices_cnt, c_threads);
 
   hcfree (c_threads);
 
@@ -295,7 +295,7 @@ static int inner2_loop (hashcat_ctx_t *hashcat_ctx)
     // however, that can create confusion in hashcats RC, because exhausted translates to RC = 1.
     // but then having RC = 1 does not match our expection if we use for speed-only and progress-only.
     // to get hashcat to return RC = 0 we have to set it to CRACKED or BYPASS
-    // note: other options like --show, --left, --benchmark, --keyspace, --opencl-info, etc.
+    // note: other options like --show, --left, --benchmark, --keyspace, --backend-info, etc.
     // not not reach this section of the code, they've returned already with rc 0.
 
     if ((user_options->speed_only == true) || (user_options->progress_only == true))
@@ -438,7 +438,7 @@ static int outer_loop (hashcat_ctx_t *hashcat_ctx)
   hashconfig_t   *hashconfig    = hashcat_ctx->hashconfig;
   hashes_t       *hashes        = hashcat_ctx->hashes;
   mask_ctx_t     *mask_ctx      = hashcat_ctx->mask_ctx;
-  opencl_ctx_t   *opencl_ctx    = hashcat_ctx->opencl_ctx;
+  backend_ctx_t  *backend_ctx   = hashcat_ctx->backend_ctx;
   outcheck_ctx_t *outcheck_ctx  = hashcat_ctx->outcheck_ctx;
   restore_ctx_t  *restore_ctx   = hashcat_ctx->restore_ctx;
   status_ctx_t   *status_ctx    = hashcat_ctx->status_ctx;
@@ -720,13 +720,13 @@ static int outer_loop (hashcat_ctx_t *hashcat_ctx)
    * inform the user
    */
 
-  EVENT (EVENT_OPENCL_SESSION_PRE);
+  EVENT (EVENT_BACKEND_SESSION_PRE);
 
-  const int rc_session_begin = opencl_session_begin (hashcat_ctx);
+  const int rc_session_begin = backend_session_begin (hashcat_ctx);
 
   if (rc_session_begin == -1) return -1;
 
-  EVENT (EVENT_OPENCL_SESSION_POST);
+  EVENT (EVENT_BACKEND_SESSION_POST);
 
   /**
    * create self-test threads
@@ -736,23 +736,23 @@ static int outer_loop (hashcat_ctx_t *hashcat_ctx)
   {
     EVENT (EVENT_SELFTEST_STARTING);
 
-    thread_param_t *threads_param = (thread_param_t *) hccalloc (opencl_ctx->devices_cnt, sizeof (thread_param_t));
+    thread_param_t *threads_param = (thread_param_t *) hccalloc (backend_ctx->backend_devices_cnt, sizeof (thread_param_t));
 
-    hc_thread_t *selftest_threads = (hc_thread_t *) hccalloc (opencl_ctx->devices_cnt, sizeof (hc_thread_t));
+    hc_thread_t *selftest_threads = (hc_thread_t *) hccalloc (backend_ctx->backend_devices_cnt, sizeof (hc_thread_t));
 
     status_ctx->devices_status = STATUS_SELFTEST;
 
-    for (u32 device_id = 0; device_id < opencl_ctx->devices_cnt; device_id++)
+    for (int backend_devices_idx = 0; backend_devices_idx < backend_ctx->backend_devices_cnt; backend_devices_idx++)
     {
-      thread_param_t *thread_param = threads_param + device_id;
+      thread_param_t *thread_param = threads_param + backend_devices_idx;
 
       thread_param->hashcat_ctx = hashcat_ctx;
-      thread_param->tid         = device_id;
+      thread_param->tid         = backend_devices_idx;
 
-      hc_thread_create (selftest_threads[device_id], thread_selftest, thread_param);
+      hc_thread_create (selftest_threads[backend_devices_idx], thread_selftest, thread_param);
     }
 
-    hc_thread_wait (opencl_ctx->devices_cnt, selftest_threads);
+    hc_thread_wait (backend_ctx->backend_devices_cnt, selftest_threads);
 
     hcfree (threads_param);
 
@@ -760,11 +760,11 @@ static int outer_loop (hashcat_ctx_t *hashcat_ctx)
 
     // check for any selftest failures
 
-    for (u32 device_id = 0; device_id < opencl_ctx->devices_cnt; device_id++)
+    for (int backend_devices_idx = 0; backend_devices_idx < backend_ctx->backend_devices_cnt; backend_devices_idx++)
     {
-      if (opencl_ctx->enabled == false) continue;
+      if (backend_ctx->enabled == false) continue;
 
-      hc_device_param_t *device_param = opencl_ctx->devices_param + device_id;
+      hc_device_param_t *device_param = backend_ctx->devices_param + backend_devices_idx;
 
       if (device_param->skipped == true) continue;
 
@@ -879,9 +879,9 @@ static int outer_loop (hashcat_ctx_t *hashcat_ctx)
 
   potfile_write_close (hashcat_ctx);
 
-  // finalize opencl session
+  // finalize backend session
 
-  opencl_session_destroy (hashcat_ctx);
+  backend_session_destroy (hashcat_ctx);
 
   // clean up
 
@@ -930,7 +930,7 @@ int hashcat_init (hashcat_ctx_t *hashcat_ctx, void (*event) (const u32, struct h
   hashcat_ctx->loopback_ctx       = (loopback_ctx_t *)        hcmalloc (sizeof (loopback_ctx_t));
   hashcat_ctx->mask_ctx           = (mask_ctx_t *)            hcmalloc (sizeof (mask_ctx_t));
   hashcat_ctx->module_ctx         = (module_ctx_t *)          hcmalloc (sizeof (module_ctx_t));
-  hashcat_ctx->opencl_ctx         = (opencl_ctx_t *)          hcmalloc (sizeof (opencl_ctx_t));
+  hashcat_ctx->backend_ctx        = (backend_ctx_t *)         hcmalloc (sizeof (backend_ctx_t));
   hashcat_ctx->outcheck_ctx       = (outcheck_ctx_t *)        hcmalloc (sizeof (outcheck_ctx_t));
   hashcat_ctx->outfile_ctx        = (outfile_ctx_t *)         hcmalloc (sizeof (outfile_ctx_t));
   hashcat_ctx->pidfile_ctx        = (pidfile_ctx_t *)         hcmalloc (sizeof (pidfile_ctx_t));
@@ -964,7 +964,7 @@ void hashcat_destroy (hashcat_ctx_t *hashcat_ctx)
   hcfree (hashcat_ctx->loopback_ctx);
   hcfree (hashcat_ctx->mask_ctx);
   hcfree (hashcat_ctx->module_ctx);
-  hcfree (hashcat_ctx->opencl_ctx);
+  hcfree (hashcat_ctx->backend_ctx);
   hcfree (hashcat_ctx->outcheck_ctx);
   hcfree (hashcat_ctx->outfile_ctx);
   hcfree (hashcat_ctx->pidfile_ctx);
@@ -1169,18 +1169,18 @@ int hashcat_session_init (hashcat_ctx_t *hashcat_ctx, const char *install_folder
   if (rc_user_options_check_files == -1) return -1;
 
   /**
-   * Init OpenCL library loader
+   * Init backend library loader
    */
 
-  const int rc_opencl_init = opencl_ctx_init (hashcat_ctx);
+  const int rc_backend_init = backend_ctx_init (hashcat_ctx);
 
-  if (rc_opencl_init == -1) return -1;
+  if (rc_backend_init == -1) return -1;
 
   /**
-   * Init OpenCL devices
+   * Init backend devices
    */
 
-  const int rc_devices_init = opencl_ctx_devices_init (hashcat_ctx, comptime);
+  const int rc_devices_init = backend_ctx_devices_init (hashcat_ctx, comptime);
 
   if (rc_devices_init == -1) return -1;
 
@@ -1341,25 +1341,25 @@ int hashcat_session_destroy (hashcat_ctx_t *hashcat_ctx)
   #endif
   #endif
 
-  debugfile_destroy          (hashcat_ctx);
-  dictstat_destroy           (hashcat_ctx);
-  folder_config_destroy      (hashcat_ctx);
-  hwmon_ctx_destroy          (hashcat_ctx);
-  induct_ctx_destroy         (hashcat_ctx);
-  logfile_destroy            (hashcat_ctx);
-  loopback_destroy           (hashcat_ctx);
-  opencl_ctx_devices_destroy (hashcat_ctx);
-  opencl_ctx_destroy         (hashcat_ctx);
-  outcheck_ctx_destroy       (hashcat_ctx);
-  outfile_destroy            (hashcat_ctx);
-  pidfile_ctx_destroy        (hashcat_ctx);
-  potfile_destroy            (hashcat_ctx);
-  restore_ctx_destroy        (hashcat_ctx);
-  tuning_db_destroy          (hashcat_ctx);
-  user_options_destroy       (hashcat_ctx);
-  user_options_extra_destroy (hashcat_ctx);
-  status_ctx_destroy         (hashcat_ctx);
-  event_ctx_destroy          (hashcat_ctx);
+  debugfile_destroy           (hashcat_ctx);
+  dictstat_destroy            (hashcat_ctx);
+  folder_config_destroy       (hashcat_ctx);
+  hwmon_ctx_destroy           (hashcat_ctx);
+  induct_ctx_destroy          (hashcat_ctx);
+  logfile_destroy             (hashcat_ctx);
+  loopback_destroy            (hashcat_ctx);
+  backend_ctx_devices_destroy (hashcat_ctx);
+  backend_ctx_destroy         (hashcat_ctx);
+  outcheck_ctx_destroy        (hashcat_ctx);
+  outfile_destroy             (hashcat_ctx);
+  pidfile_ctx_destroy         (hashcat_ctx);
+  potfile_destroy             (hashcat_ctx);
+  restore_ctx_destroy         (hashcat_ctx);
+  tuning_db_destroy           (hashcat_ctx);
+  user_options_destroy        (hashcat_ctx);
+  user_options_extra_destroy  (hashcat_ctx);
+  status_ctx_destroy          (hashcat_ctx);
+  event_ctx_destroy           (hashcat_ctx);
 
   return 0;
 }
diff --git a/src/hashes.c b/src/hashes.c
index 3b1aeb50a..72b9aad69 100644
--- a/src/hashes.c
+++ b/src/hashes.c
@@ -14,7 +14,7 @@
 #include "terminal.h"
 #include "logfile.h"
 #include "loopback.h"
-#include "opencl.h"
+#include "backend.h"
 #include "outfile.h"
 #include "potfile.h"
 #include "rp.h"
@@ -309,7 +309,15 @@ void check_hash (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, pl
   {
     tmps = hcmalloc (hashconfig->tmp_size);
 
-    hc_clEnqueueReadBuffer (hashcat_ctx, device_param->command_queue, device_param->d_tmps, CL_TRUE, plain->gidvid * hashconfig->tmp_size, hashconfig->tmp_size, tmps, 0, NULL, NULL);
+    if (device_param->is_cuda == true)
+    {
+      hc_cuMemcpyDtoH (hashcat_ctx, tmps, device_param->cuda_d_tmps + (plain->gidvid * hashconfig->tmp_size), hashconfig->tmp_size);
+    }
+
+    if (device_param->is_opencl == true)
+    {
+      hc_clEnqueueReadBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_tmps, CL_TRUE, plain->gidvid * hashconfig->tmp_size, hashconfig->tmp_size, tmps, 0, NULL, NULL);
+    }
   }
 
   // hash
@@ -460,15 +468,21 @@ int check_cracked (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param,
 
   u32 num_cracked;
 
-  cl_int CL_err;
+  int CU_rc;
+  int CL_rc;
 
-  CL_err = hc_clEnqueueReadBuffer (hashcat_ctx, device_param->command_queue, device_param->d_result, CL_TRUE, 0, sizeof (u32), &num_cracked, 0, NULL, NULL);
-
-  if (CL_err != CL_SUCCESS)
+  if (device_param->is_cuda == true)
   {
-    event_log_error (hashcat_ctx, "clEnqueueReadBuffer(): %s", val2cstr_cl (CL_err));
+    CU_rc = hc_cuMemcpyDtoH (hashcat_ctx, &num_cracked, device_param->cuda_d_result, sizeof (u32));
 
-    return -1;
+    if (CU_rc == -1) return -1;
+  }
+
+  if (device_param->is_opencl == true)
+  {
+    CL_rc = hc_clEnqueueReadBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_result, CL_TRUE, 0, sizeof (u32), &num_cracked, 0, NULL, NULL);
+
+    if (CL_rc == -1) return -1;
   }
 
   if (user_options->speed_only == true)
@@ -483,13 +497,18 @@ int check_cracked (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param,
   {
     plain_t *cracked = (plain_t *) hccalloc (num_cracked, sizeof (plain_t));
 
-    CL_err = hc_clEnqueueReadBuffer (hashcat_ctx, device_param->command_queue, device_param->d_plain_bufs, CL_TRUE, 0, num_cracked * sizeof (plain_t), cracked, 0, NULL, NULL);
-
-    if (CL_err != CL_SUCCESS)
+    if (device_param->is_cuda == true)
     {
-      event_log_error (hashcat_ctx, "clEnqueueReadBuffer(): %s", val2cstr_cl (CL_err));
+      CU_rc = hc_cuMemcpyDtoH (hashcat_ctx, cracked, device_param->cuda_d_plain_bufs, num_cracked * sizeof (plain_t));
 
-      return -1;
+      if (CU_rc == -1) return -1;
+    }
+
+    if (device_param->is_opencl == true)
+    {
+      CL_rc = hc_clEnqueueReadBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_plain_bufs, CL_TRUE, 0, num_cracked * sizeof (plain_t), cracked, 0, NULL, NULL);
+
+      if (CL_rc == -1) return -1;
     }
 
     u32 cpt_cracked = 0;
@@ -553,25 +572,35 @@ int check_cracked (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param,
 
       memset (hashes->digests_shown_tmp, 0, salt_buf->digests_cnt * sizeof (u32));
 
-      CL_err = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->command_queue, device_param->d_digests_shown, CL_TRUE, salt_buf->digests_offset * sizeof (u32), salt_buf->digests_cnt * sizeof (u32), &hashes->digests_shown_tmp[salt_buf->digests_offset], 0, NULL, NULL);
-
-      if (CL_err != CL_SUCCESS)
+      if (device_param->is_cuda == true)
       {
-        event_log_error (hashcat_ctx, "clEnqueueWriteBuffer(): %s", val2cstr_cl (CL_err));
+        CU_rc = hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_digests_shown + (salt_buf->digests_offset * sizeof (u32)), &hashes->digests_shown_tmp[salt_buf->digests_offset], salt_buf->digests_cnt * sizeof (u32));
 
-        return -1;
+        if (CU_rc == -1) return -1;
+      }
+
+      if (device_param->is_opencl == true)
+      {
+        CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_digests_shown, CL_TRUE, salt_buf->digests_offset * sizeof (u32), salt_buf->digests_cnt * sizeof (u32), &hashes->digests_shown_tmp[salt_buf->digests_offset], 0, NULL, NULL);
+
+        if (CL_rc == -1) return -1;
       }
     }
 
     num_cracked = 0;
 
-    CL_err = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->command_queue, device_param->d_result, CL_TRUE, 0, sizeof (u32), &num_cracked, 0, NULL, NULL);
-
-    if (CL_err != CL_SUCCESS)
+    if (device_param->is_cuda == true)
     {
-      event_log_error (hashcat_ctx, "clEnqueueWriteBuffer(): %s", val2cstr_cl (CL_err));
+      CU_rc = hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_result, &num_cracked, sizeof (u32));
 
-      return -1;
+      if (CU_rc == -1) return -1;
+    }
+
+    if (device_param->is_opencl == true)
+    {
+      CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_result, CL_TRUE, 0, sizeof (u32), &num_cracked, 0, NULL, NULL);
+
+      if (CL_rc == -1) return -1;
     }
   }
 
@@ -840,7 +869,7 @@ int hashes_init_stage1 (hashcat_ctx_t *hashcat_ctx)
   else if (user_options->stdout_flag == true)
   {
   }
-  else if (user_options->opencl_info == true)
+  else if (user_options->backend_info == true)
   {
   }
   else
diff --git a/src/hwmon.c b/src/hwmon.c
index c8a568c49..2bce94a3d 100644
--- a/src/hwmon.c
+++ b/src/hwmon.c
@@ -45,11 +45,11 @@ static void sysfs_close (hashcat_ctx_t *hashcat_ctx)
   }
 }
 
-static char *hm_SYSFS_get_syspath_device (hashcat_ctx_t *hashcat_ctx, const int device_id)
+static char *hm_SYSFS_get_syspath_device (hashcat_ctx_t *hashcat_ctx, const int backend_device_idx)
 {
-  opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
 
-  hc_device_param_t *device_param = &opencl_ctx->devices_param[device_id];
+  hc_device_param_t *device_param = &backend_ctx->devices_param[backend_device_idx];
 
   char *syspath;
 
@@ -58,9 +58,9 @@ static char *hm_SYSFS_get_syspath_device (hashcat_ctx_t *hashcat_ctx, const int
   return syspath;
 }
 
-static char *hm_SYSFS_get_syspath_hwmon (hashcat_ctx_t *hashcat_ctx, const int device_id)
+static char *hm_SYSFS_get_syspath_hwmon (hashcat_ctx_t *hashcat_ctx, const int backend_device_idx)
 {
-  char *syspath = hm_SYSFS_get_syspath_device (hashcat_ctx, device_id);
+  char *syspath = hm_SYSFS_get_syspath_device (hashcat_ctx, backend_device_idx);
 
   if (syspath == NULL)
   {
@@ -96,9 +96,9 @@ static char *hm_SYSFS_get_syspath_hwmon (hashcat_ctx_t *hashcat_ctx, const int d
   return hwmon;
 }
 
-static int hm_SYSFS_get_fan_speed_current (hashcat_ctx_t *hashcat_ctx, const int device_id, int *val)
+static int hm_SYSFS_get_fan_speed_current (hashcat_ctx_t *hashcat_ctx, const int backend_device_idx, int *val)
 {
-  char *syspath = hm_SYSFS_get_syspath_hwmon (hashcat_ctx, device_id);
+  char *syspath = hm_SYSFS_get_syspath_hwmon (hashcat_ctx, backend_device_idx);
 
   if (syspath == NULL) return -1;
 
@@ -188,9 +188,9 @@ static int hm_SYSFS_get_fan_speed_current (hashcat_ctx_t *hashcat_ctx, const int
   return 0;
 }
 
-static int hm_SYSFS_get_temperature_current (hashcat_ctx_t *hashcat_ctx, const int device_id, int *val)
+static int hm_SYSFS_get_temperature_current (hashcat_ctx_t *hashcat_ctx, const int backend_device_idx, int *val)
 {
-  char *syspath = hm_SYSFS_get_syspath_hwmon (hashcat_ctx, device_id);
+  char *syspath = hm_SYSFS_get_syspath_hwmon (hashcat_ctx, backend_device_idx);
 
   if (syspath == NULL) return -1;
 
@@ -233,9 +233,9 @@ static int hm_SYSFS_get_temperature_current (hashcat_ctx_t *hashcat_ctx, const i
   return 0;
 }
 
-static int hm_SYSFS_get_pp_dpm_sclk (hashcat_ctx_t *hashcat_ctx, const int device_id, int *val)
+static int hm_SYSFS_get_pp_dpm_sclk (hashcat_ctx_t *hashcat_ctx, const int backend_device_idx, int *val)
 {
-  char *syspath = hm_SYSFS_get_syspath_device (hashcat_ctx, device_id);
+  char *syspath = hm_SYSFS_get_syspath_device (hashcat_ctx, backend_device_idx);
 
   if (syspath == NULL) return -1;
 
@@ -288,9 +288,9 @@ static int hm_SYSFS_get_pp_dpm_sclk (hashcat_ctx_t *hashcat_ctx, const int devic
   return 0;
 }
 
-static int hm_SYSFS_get_pp_dpm_mclk (hashcat_ctx_t *hashcat_ctx, const int device_id, int *val)
+static int hm_SYSFS_get_pp_dpm_mclk (hashcat_ctx_t *hashcat_ctx, const int backend_device_idx, int *val)
 {
-  char *syspath = hm_SYSFS_get_syspath_device (hashcat_ctx, device_id);
+  char *syspath = hm_SYSFS_get_syspath_device (hashcat_ctx, backend_device_idx);
 
   if (syspath == NULL) return -1;
 
@@ -343,9 +343,9 @@ static int hm_SYSFS_get_pp_dpm_mclk (hashcat_ctx_t *hashcat_ctx, const int devic
   return 0;
 }
 
-static int hm_SYSFS_get_pp_dpm_pcie (hashcat_ctx_t *hashcat_ctx, const int device_id, int *val)
+static int hm_SYSFS_get_pp_dpm_pcie (hashcat_ctx_t *hashcat_ctx, const int backend_device_idx, int *val)
 {
-  char *syspath = hm_SYSFS_get_syspath_device (hashcat_ctx, device_id);
+  char *syspath = hm_SYSFS_get_syspath_device (hashcat_ctx, backend_device_idx);
 
   if (syspath == NULL) return -1;
 
@@ -1342,53 +1342,24 @@ static int hm_get_adapter_index_nvml (hashcat_ctx_t *hashcat_ctx, HM_ADAPTER_NVM
   return (deviceCount);
 }
 
-int hm_get_threshold_slowdown_with_device_id (hashcat_ctx_t *hashcat_ctx, const u32 device_id)
+int hm_get_threshold_slowdown_with_devices_idx (hashcat_ctx_t *hashcat_ctx, const int backend_device_idx)
 {
-  hwmon_ctx_t  *hwmon_ctx  = hashcat_ctx->hwmon_ctx;
-  opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
+  hwmon_ctx_t   *hwmon_ctx   = hashcat_ctx->hwmon_ctx;
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
 
   if (hwmon_ctx->enabled == false) return -1;
 
-  if (hwmon_ctx->hm_device[device_id].threshold_slowdown_get_supported == false) return -1;
+  if (hwmon_ctx->hm_device[backend_device_idx].threshold_slowdown_get_supported == false) return -1;
 
-  if ((opencl_ctx->devices_param[device_id].device_type & CL_DEVICE_TYPE_GPU) == 0) return -1;
-
-  if (opencl_ctx->devices_param[device_id].device_vendor_id == VENDOR_ID_AMD)
-  {
-    if (hwmon_ctx->hm_adl)
-    {
-      if (hwmon_ctx->hm_device[device_id].od_version == 5)
-      {
-
-      }
-      else if (hwmon_ctx->hm_device[device_id].od_version == 6)
-      {
-        int CurrentValue = 0;
-        int DefaultValue = 0;
-
-        if (hm_ADL_Overdrive6_TargetTemperatureData_Get (hashcat_ctx, hwmon_ctx->hm_device[device_id].adl, &CurrentValue, &DefaultValue) == -1)
-        {
-          hwmon_ctx->hm_device[device_id].threshold_slowdown_get_supported = false;
-
-          return -1;
-        }
-
-        // the return value has never been tested since hm_ADL_Overdrive6_TargetTemperatureData_Get() never worked on any system. expect problems.
-
-        return DefaultValue;
-      }
-    }
-  }
-
-  if (opencl_ctx->devices_param[device_id].device_vendor_id == VENDOR_ID_NV)
+  if (backend_ctx->devices_param[backend_device_idx].is_cuda == true)
   {
     if (hwmon_ctx->hm_nvml)
     {
       int target = 0;
 
-      if (hm_NVML_nvmlDeviceGetTemperatureThreshold (hashcat_ctx, hwmon_ctx->hm_device[device_id].nvml, NVML_TEMPERATURE_THRESHOLD_SLOWDOWN, (unsigned int *) &target) == -1)
+      if (hm_NVML_nvmlDeviceGetTemperatureThreshold (hashcat_ctx, hwmon_ctx->hm_device[backend_device_idx].nvml, NVML_TEMPERATURE_THRESHOLD_SLOWDOWN, (unsigned int *) &target) == -1)
       {
-        hwmon_ctx->hm_device[device_id].threshold_slowdown_get_supported = false;
+        hwmon_ctx->hm_device[backend_device_idx].threshold_slowdown_get_supported = false;
 
         return -1;
       }
@@ -1397,46 +1368,78 @@ int hm_get_threshold_slowdown_with_device_id (hashcat_ctx_t *hashcat_ctx, const
     }
   }
 
-  hwmon_ctx->hm_device[device_id].threshold_slowdown_get_supported = false;
-
-  return -1;
-}
-
-int hm_get_threshold_shutdown_with_device_id (hashcat_ctx_t *hashcat_ctx, const u32 device_id)
-{
-  hwmon_ctx_t  *hwmon_ctx  = hashcat_ctx->hwmon_ctx;
-  opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
-
-  if (hwmon_ctx->enabled == false) return -1;
-
-  if (hwmon_ctx->hm_device[device_id].threshold_shutdown_get_supported == false) return -1;
-
-  if ((opencl_ctx->devices_param[device_id].device_type & CL_DEVICE_TYPE_GPU) == 0) return -1;
-
-  if (opencl_ctx->devices_param[device_id].device_vendor_id == VENDOR_ID_AMD)
+  if (backend_ctx->devices_param[backend_device_idx].is_opencl == true)
   {
-    if (hwmon_ctx->hm_adl)
+    if ((backend_ctx->devices_param[backend_device_idx].opencl_device_type & CL_DEVICE_TYPE_GPU) == 0) return -1;
+
+    if (backend_ctx->devices_param[backend_device_idx].opencl_device_vendor_id == VENDOR_ID_AMD)
     {
-      if (hwmon_ctx->hm_device[device_id].od_version == 5)
+      if (hwmon_ctx->hm_adl)
       {
+        if (hwmon_ctx->hm_device[backend_device_idx].od_version == 5)
+        {
 
+        }
+        else if (hwmon_ctx->hm_device[backend_device_idx].od_version == 6)
+        {
+          int CurrentValue = 0;
+          int DefaultValue = 0;
+
+          if (hm_ADL_Overdrive6_TargetTemperatureData_Get (hashcat_ctx, hwmon_ctx->hm_device[backend_device_idx].adl, &CurrentValue, &DefaultValue) == -1)
+          {
+            hwmon_ctx->hm_device[backend_device_idx].threshold_slowdown_get_supported = false;
+
+            return -1;
+          }
+
+          // the return value has never been tested since hm_ADL_Overdrive6_TargetTemperatureData_Get() never worked on any system. expect problems.
+
+          return DefaultValue;
+        }
       }
-      else if (hwmon_ctx->hm_device[device_id].od_version == 6)
-      {
+    }
 
+    if (backend_ctx->devices_param[backend_device_idx].opencl_device_vendor_id == VENDOR_ID_NV)
+    {
+      if (hwmon_ctx->hm_nvml)
+      {
+        int target = 0;
+
+        if (hm_NVML_nvmlDeviceGetTemperatureThreshold (hashcat_ctx, hwmon_ctx->hm_device[backend_device_idx].nvml, NVML_TEMPERATURE_THRESHOLD_SLOWDOWN, (unsigned int *) &target) == -1)
+        {
+          hwmon_ctx->hm_device[backend_device_idx].threshold_slowdown_get_supported = false;
+
+          return -1;
+        }
+
+        return target;
       }
     }
   }
 
-  if (opencl_ctx->devices_param[device_id].device_vendor_id == VENDOR_ID_NV)
+  hwmon_ctx->hm_device[backend_device_idx].threshold_slowdown_get_supported = false;
+
+  return -1;
+}
+
+int hm_get_threshold_shutdown_with_devices_idx (hashcat_ctx_t *hashcat_ctx, const int backend_device_idx)
+{
+  hwmon_ctx_t   *hwmon_ctx   = hashcat_ctx->hwmon_ctx;
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  if (hwmon_ctx->enabled == false) return -1;
+
+  if (hwmon_ctx->hm_device[backend_device_idx].threshold_shutdown_get_supported == false) return -1;
+
+  if (backend_ctx->devices_param[backend_device_idx].is_cuda == true)
   {
     if (hwmon_ctx->hm_nvml)
     {
       int target = 0;
 
-      if (hm_NVML_nvmlDeviceGetTemperatureThreshold (hashcat_ctx, hwmon_ctx->hm_device[device_id].nvml, NVML_TEMPERATURE_THRESHOLD_SHUTDOWN, (unsigned int *) &target) == -1)
+      if (hm_NVML_nvmlDeviceGetTemperatureThreshold (hashcat_ctx, hwmon_ctx->hm_device[backend_device_idx].nvml, NVML_TEMPERATURE_THRESHOLD_SHUTDOWN, (unsigned int *) &target) == -1)
       {
-        hwmon_ctx->hm_device[device_id].threshold_shutdown_get_supported = false;
+        hwmon_ctx->hm_device[backend_device_idx].threshold_shutdown_get_supported = false;
 
         return -1;
       }
@@ -1445,81 +1448,66 @@ int hm_get_threshold_shutdown_with_device_id (hashcat_ctx_t *hashcat_ctx, const
     }
   }
 
-  hwmon_ctx->hm_device[device_id].threshold_shutdown_get_supported = false;
+  if (backend_ctx->devices_param[backend_device_idx].is_opencl == true)
+  {
+    if ((backend_ctx->devices_param[backend_device_idx].opencl_device_type & CL_DEVICE_TYPE_GPU) == 0) return -1;
+
+    if (backend_ctx->devices_param[backend_device_idx].opencl_device_vendor_id == VENDOR_ID_AMD)
+    {
+      if (hwmon_ctx->hm_adl)
+      {
+        if (hwmon_ctx->hm_device[backend_device_idx].od_version == 5)
+        {
+
+        }
+        else if (hwmon_ctx->hm_device[backend_device_idx].od_version == 6)
+        {
+
+        }
+      }
+    }
+
+    if (backend_ctx->devices_param[backend_device_idx].opencl_device_vendor_id == VENDOR_ID_NV)
+    {
+      if (hwmon_ctx->hm_nvml)
+      {
+        int target = 0;
+
+        if (hm_NVML_nvmlDeviceGetTemperatureThreshold (hashcat_ctx, hwmon_ctx->hm_device[backend_device_idx].nvml, NVML_TEMPERATURE_THRESHOLD_SHUTDOWN, (unsigned int *) &target) == -1)
+        {
+          hwmon_ctx->hm_device[backend_device_idx].threshold_shutdown_get_supported = false;
+
+          return -1;
+        }
+
+        return target;
+      }
+    }
+  }
+
+  hwmon_ctx->hm_device[backend_device_idx].threshold_shutdown_get_supported = false;
 
   return -1;
 }
 
-int hm_get_temperature_with_device_id (hashcat_ctx_t *hashcat_ctx, const u32 device_id)
+int hm_get_temperature_with_devices_idx (hashcat_ctx_t *hashcat_ctx, const int backend_device_idx)
 {
-  hwmon_ctx_t  *hwmon_ctx  = hashcat_ctx->hwmon_ctx;
-  opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
+  hwmon_ctx_t   *hwmon_ctx   = hashcat_ctx->hwmon_ctx;
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
 
   if (hwmon_ctx->enabled == false) return -1;
 
-  if (hwmon_ctx->hm_device[device_id].temperature_get_supported == false) return -1;
+  if (hwmon_ctx->hm_device[backend_device_idx].temperature_get_supported == false) return -1;
 
-  if ((opencl_ctx->devices_param[device_id].device_type & CL_DEVICE_TYPE_GPU) == 0) return -1;
-
-  if (opencl_ctx->devices_param[device_id].device_vendor_id == VENDOR_ID_AMD)
-  {
-    if (hwmon_ctx->hm_adl)
-    {
-      if (hwmon_ctx->hm_device[device_id].od_version == 5)
-      {
-        ADLTemperature Temperature;
-
-        Temperature.iSize = sizeof (ADLTemperature);
-
-        if (hm_ADL_Overdrive5_Temperature_Get (hashcat_ctx, hwmon_ctx->hm_device[device_id].adl, 0, &Temperature) == -1)
-        {
-          hwmon_ctx->hm_device[device_id].temperature_get_supported = false;
-
-          return -1;
-        }
-
-        return Temperature.iTemperature / 1000;
-      }
-
-      if (hwmon_ctx->hm_device[device_id].od_version == 6)
-      {
-        int Temperature = 0;
-
-        if (hm_ADL_Overdrive6_Temperature_Get (hashcat_ctx, hwmon_ctx->hm_device[device_id].adl, &Temperature) == -1)
-        {
-          hwmon_ctx->hm_device[device_id].temperature_get_supported = false;
-
-          return -1;
-        }
-
-        return Temperature / 1000;
-      }
-    }
-
-    if (hwmon_ctx->hm_sysfs)
-    {
-      int temperature = 0;
-
-      if (hm_SYSFS_get_temperature_current (hashcat_ctx, device_id, &temperature) == -1)
-      {
-        hwmon_ctx->hm_device[device_id].temperature_get_supported = false;
-
-        return -1;
-      }
-
-      return temperature;
-    }
-  }
-
-  if (opencl_ctx->devices_param[device_id].device_vendor_id == VENDOR_ID_NV)
+  if (backend_ctx->devices_param[backend_device_idx].is_cuda == true)
   {
     if (hwmon_ctx->hm_nvml)
     {
       int temperature = 0;
 
-      if (hm_NVML_nvmlDeviceGetTemperature (hashcat_ctx, hwmon_ctx->hm_device[device_id].nvml, NVML_TEMPERATURE_GPU, (u32 *) &temperature) == -1)
+      if (hm_NVML_nvmlDeviceGetTemperature (hashcat_ctx, hwmon_ctx->hm_device[backend_device_idx].nvml, NVML_TEMPERATURE_GPU, (u32 *) &temperature) == -1)
       {
-        hwmon_ctx->hm_device[device_id].temperature_get_supported = false;
+        hwmon_ctx->hm_device[backend_device_idx].temperature_get_supported = false;
 
         return -1;
       }
@@ -1528,145 +1516,167 @@ int hm_get_temperature_with_device_id (hashcat_ctx_t *hashcat_ctx, const u32 dev
     }
   }
 
-  hwmon_ctx->hm_device[device_id].temperature_get_supported = false;
-
-  return -1;
-}
-
-int hm_get_fanpolicy_with_device_id (hashcat_ctx_t *hashcat_ctx, const u32 device_id)
-{
-  hwmon_ctx_t  *hwmon_ctx  = hashcat_ctx->hwmon_ctx;
-  opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
-
-  if (hwmon_ctx->enabled == false) return -1;
-
-  if (hwmon_ctx->hm_device[device_id].fanpolicy_get_supported == false) return -1;
-
-  if ((opencl_ctx->devices_param[device_id].device_type & CL_DEVICE_TYPE_GPU) == 0) return -1;
-
-  if (opencl_ctx->devices_param[device_id].device_vendor_id == VENDOR_ID_AMD)
+  if (backend_ctx->devices_param[backend_device_idx].is_opencl == true)
   {
-    if (hwmon_ctx->hm_adl)
+    if ((backend_ctx->devices_param[backend_device_idx].opencl_device_type & CL_DEVICE_TYPE_GPU) == 0) return -1;
+
+    if (backend_ctx->devices_param[backend_device_idx].opencl_device_vendor_id == VENDOR_ID_AMD)
     {
-      if (hwmon_ctx->hm_device[device_id].od_version == 5)
+      if (hwmon_ctx->hm_adl)
       {
-        ADLFanSpeedValue lpFanSpeedValue;
-
-        memset (&lpFanSpeedValue, 0, sizeof (lpFanSpeedValue));
-
-        lpFanSpeedValue.iSize      = sizeof (lpFanSpeedValue);
-        lpFanSpeedValue.iSpeedType = ADL_DL_FANCTRL_SPEED_TYPE_PERCENT;
-
-        if (hm_ADL_Overdrive5_FanSpeed_Get (hashcat_ctx, hwmon_ctx->hm_device[device_id].adl, 0, &lpFanSpeedValue) == -1)
+        if (hwmon_ctx->hm_device[backend_device_idx].od_version == 5)
         {
-          hwmon_ctx->hm_device[device_id].fanpolicy_get_supported = false;
-          hwmon_ctx->hm_device[device_id].fanspeed_get_supported  = false;
+          ADLTemperature Temperature;
+
+          Temperature.iSize = sizeof (ADLTemperature);
+
+          if (hm_ADL_Overdrive5_Temperature_Get (hashcat_ctx, hwmon_ctx->hm_device[backend_device_idx].adl, 0, &Temperature) == -1)
+          {
+            hwmon_ctx->hm_device[backend_device_idx].temperature_get_supported = false;
+
+            return -1;
+          }
+
+          return Temperature.iTemperature / 1000;
+        }
+
+        if (hwmon_ctx->hm_device[backend_device_idx].od_version == 6)
+        {
+          int Temperature = 0;
+
+          if (hm_ADL_Overdrive6_Temperature_Get (hashcat_ctx, hwmon_ctx->hm_device[backend_device_idx].adl, &Temperature) == -1)
+          {
+            hwmon_ctx->hm_device[backend_device_idx].temperature_get_supported = false;
+
+            return -1;
+          }
+
+          return Temperature / 1000;
+        }
+      }
+
+      if (hwmon_ctx->hm_sysfs)
+      {
+        int temperature = 0;
+
+        if (hm_SYSFS_get_temperature_current (hashcat_ctx, backend_device_idx, &temperature) == -1)
+        {
+          hwmon_ctx->hm_device[backend_device_idx].temperature_get_supported = false;
 
           return -1;
         }
 
-        return (lpFanSpeedValue.iFanSpeed & ADL_DL_FANCTRL_FLAG_USER_DEFINED_SPEED) ? 0 : 1;
+        return temperature;
+      }
+    }
+
+    if (backend_ctx->devices_param[backend_device_idx].opencl_device_vendor_id == VENDOR_ID_NV)
+    {
+      if (hwmon_ctx->hm_nvml)
+      {
+        int temperature = 0;
+
+        if (hm_NVML_nvmlDeviceGetTemperature (hashcat_ctx, hwmon_ctx->hm_device[backend_device_idx].nvml, NVML_TEMPERATURE_GPU, (u32 *) &temperature) == -1)
+        {
+          hwmon_ctx->hm_device[backend_device_idx].temperature_get_supported = false;
+
+          return -1;
+        }
+
+        return temperature;
+      }
+    }
+  }
+
+  hwmon_ctx->hm_device[backend_device_idx].temperature_get_supported = false;
+
+  return -1;
+}
+
+int hm_get_fanpolicy_with_devices_idx (hashcat_ctx_t *hashcat_ctx, const int backend_device_idx)
+{
+  hwmon_ctx_t   *hwmon_ctx   = hashcat_ctx->hwmon_ctx;
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  if (hwmon_ctx->enabled == false) return -1;
+
+  if (hwmon_ctx->hm_device[backend_device_idx].fanpolicy_get_supported == false) return -1;
+
+  if (backend_ctx->devices_param[backend_device_idx].is_cuda == true)
+  {
+    return 1;
+  }
+
+  if (backend_ctx->devices_param[backend_device_idx].is_opencl == true)
+  {
+    if ((backend_ctx->devices_param[backend_device_idx].opencl_device_type & CL_DEVICE_TYPE_GPU) == 0) return -1;
+
+    if (backend_ctx->devices_param[backend_device_idx].opencl_device_vendor_id == VENDOR_ID_AMD)
+    {
+      if (hwmon_ctx->hm_adl)
+      {
+        if (hwmon_ctx->hm_device[backend_device_idx].od_version == 5)
+        {
+          ADLFanSpeedValue lpFanSpeedValue;
+
+          memset (&lpFanSpeedValue, 0, sizeof (lpFanSpeedValue));
+
+          lpFanSpeedValue.iSize      = sizeof (lpFanSpeedValue);
+          lpFanSpeedValue.iSpeedType = ADL_DL_FANCTRL_SPEED_TYPE_PERCENT;
+
+          if (hm_ADL_Overdrive5_FanSpeed_Get (hashcat_ctx, hwmon_ctx->hm_device[backend_device_idx].adl, 0, &lpFanSpeedValue) == -1)
+          {
+            hwmon_ctx->hm_device[backend_device_idx].fanpolicy_get_supported = false;
+            hwmon_ctx->hm_device[backend_device_idx].fanspeed_get_supported  = false;
+
+            return -1;
+          }
+
+          return (lpFanSpeedValue.iFanSpeed & ADL_DL_FANCTRL_FLAG_USER_DEFINED_SPEED) ? 0 : 1;
+        }
+
+        if (hwmon_ctx->hm_device[backend_device_idx].od_version == 6)
+        {
+          return 1;
+        }
       }
 
-      if (hwmon_ctx->hm_device[device_id].od_version == 6)
+      if (hwmon_ctx->hm_sysfs)
       {
         return 1;
       }
     }
 
-    if (hwmon_ctx->hm_sysfs)
+    if (backend_ctx->devices_param[backend_device_idx].opencl_device_vendor_id == VENDOR_ID_NV)
     {
       return 1;
     }
   }
 
-  if (opencl_ctx->devices_param[device_id].device_vendor_id == VENDOR_ID_NV)
-  {
-    return 1;
-  }
-
-  hwmon_ctx->hm_device[device_id].fanpolicy_get_supported = false;
-  hwmon_ctx->hm_device[device_id].fanspeed_get_supported  = false;
+  hwmon_ctx->hm_device[backend_device_idx].fanpolicy_get_supported = false;
+  hwmon_ctx->hm_device[backend_device_idx].fanspeed_get_supported  = false;
 
   return -1;
 }
 
-int hm_get_fanspeed_with_device_id (hashcat_ctx_t *hashcat_ctx, const u32 device_id)
+int hm_get_fanspeed_with_devices_idx (hashcat_ctx_t *hashcat_ctx, const int backend_device_idx)
 {
-  hwmon_ctx_t  *hwmon_ctx  = hashcat_ctx->hwmon_ctx;
-  opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
+  hwmon_ctx_t   *hwmon_ctx   = hashcat_ctx->hwmon_ctx;
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
 
   if (hwmon_ctx->enabled == false) return -1;
 
-  if (hwmon_ctx->hm_device[device_id].fanspeed_get_supported == false) return -1;
+  if (hwmon_ctx->hm_device[backend_device_idx].fanspeed_get_supported == false) return -1;
 
-  if ((opencl_ctx->devices_param[device_id].device_type & CL_DEVICE_TYPE_GPU) == 0) return -1;
-
-  if (opencl_ctx->devices_param[device_id].device_vendor_id == VENDOR_ID_AMD)
-  {
-    if (hwmon_ctx->hm_adl)
-    {
-      if (hwmon_ctx->hm_device[device_id].od_version == 5)
-      {
-        ADLFanSpeedValue lpFanSpeedValue;
-
-        memset (&lpFanSpeedValue, 0, sizeof (lpFanSpeedValue));
-
-        lpFanSpeedValue.iSize      = sizeof (lpFanSpeedValue);
-        lpFanSpeedValue.iSpeedType = ADL_DL_FANCTRL_SPEED_TYPE_PERCENT;
-        lpFanSpeedValue.iFlags     = ADL_DL_FANCTRL_FLAG_USER_DEFINED_SPEED;
-
-        if (hm_ADL_Overdrive5_FanSpeed_Get (hashcat_ctx, hwmon_ctx->hm_device[device_id].adl, 0, &lpFanSpeedValue) == -1)
-        {
-          hwmon_ctx->hm_device[device_id].fanspeed_get_supported = false;
-
-          return -1;
-        }
-
-        return lpFanSpeedValue.iFanSpeed;
-      }
-
-      if (hwmon_ctx->hm_device[device_id].od_version == 6)
-      {
-        ADLOD6FanSpeedInfo faninfo;
-
-        memset (&faninfo, 0, sizeof (faninfo));
-
-        if (hm_ADL_Overdrive6_FanSpeed_Get (hashcat_ctx, hwmon_ctx->hm_device[device_id].adl, &faninfo) == -1)
-        {
-          hwmon_ctx->hm_device[device_id].fanspeed_get_supported = false;
-
-          return -1;
-        }
-
-        return faninfo.iFanSpeedPercent;
-      }
-    }
-
-    if (hwmon_ctx->hm_sysfs)
-    {
-      int speed = 0;
-
-      if (hm_SYSFS_get_fan_speed_current (hashcat_ctx, device_id, &speed) == -1)
-      {
-        hwmon_ctx->hm_device[device_id].fanspeed_get_supported = false;
-
-        return -1;
-      }
-
-      return speed;
-    }
-  }
-
-  if (opencl_ctx->devices_param[device_id].device_vendor_id == VENDOR_ID_NV)
+  if (backend_ctx->devices_param[backend_device_idx].is_cuda == true)
   {
     if (hwmon_ctx->hm_nvml)
     {
       int speed = 0;
 
-      if (hm_NVML_nvmlDeviceGetFanSpeed (hashcat_ctx, hwmon_ctx->hm_device[device_id].nvml, (u32 *) &speed) == -1)
+      if (hm_NVML_nvmlDeviceGetFanSpeed (hashcat_ctx, hwmon_ctx->hm_device[backend_device_idx].nvml, (u32 *) &speed) == -1)
       {
-        hwmon_ctx->hm_device[device_id].fanspeed_get_supported = false;
+        hwmon_ctx->hm_device[backend_device_idx].fanspeed_get_supported = false;
 
         return -1;
       }
@@ -1675,64 +1685,107 @@ int hm_get_fanspeed_with_device_id (hashcat_ctx_t *hashcat_ctx, const u32 device
     }
   }
 
-  hwmon_ctx->hm_device[device_id].fanspeed_get_supported = false;
+  if (backend_ctx->devices_param[backend_device_idx].is_opencl == true)
+  {
+    if ((backend_ctx->devices_param[backend_device_idx].opencl_device_type & CL_DEVICE_TYPE_GPU) == 0) return -1;
+
+    if (backend_ctx->devices_param[backend_device_idx].opencl_device_vendor_id == VENDOR_ID_AMD)
+    {
+      if (hwmon_ctx->hm_adl)
+      {
+        if (hwmon_ctx->hm_device[backend_device_idx].od_version == 5)
+        {
+          ADLFanSpeedValue lpFanSpeedValue;
+
+          memset (&lpFanSpeedValue, 0, sizeof (lpFanSpeedValue));
+
+          lpFanSpeedValue.iSize      = sizeof (lpFanSpeedValue);
+          lpFanSpeedValue.iSpeedType = ADL_DL_FANCTRL_SPEED_TYPE_PERCENT;
+          lpFanSpeedValue.iFlags     = ADL_DL_FANCTRL_FLAG_USER_DEFINED_SPEED;
+
+          if (hm_ADL_Overdrive5_FanSpeed_Get (hashcat_ctx, hwmon_ctx->hm_device[backend_device_idx].adl, 0, &lpFanSpeedValue) == -1)
+          {
+            hwmon_ctx->hm_device[backend_device_idx].fanspeed_get_supported = false;
+
+            return -1;
+          }
+
+          return lpFanSpeedValue.iFanSpeed;
+        }
+
+        if (hwmon_ctx->hm_device[backend_device_idx].od_version == 6)
+        {
+          ADLOD6FanSpeedInfo faninfo;
+
+          memset (&faninfo, 0, sizeof (faninfo));
+
+          if (hm_ADL_Overdrive6_FanSpeed_Get (hashcat_ctx, hwmon_ctx->hm_device[backend_device_idx].adl, &faninfo) == -1)
+          {
+            hwmon_ctx->hm_device[backend_device_idx].fanspeed_get_supported = false;
+
+            return -1;
+          }
+
+          return faninfo.iFanSpeedPercent;
+        }
+      }
+
+      if (hwmon_ctx->hm_sysfs)
+      {
+        int speed = 0;
+
+        if (hm_SYSFS_get_fan_speed_current (hashcat_ctx, backend_device_idx, &speed) == -1)
+        {
+          hwmon_ctx->hm_device[backend_device_idx].fanspeed_get_supported = false;
+
+          return -1;
+        }
+
+        return speed;
+      }
+    }
+
+    if (backend_ctx->devices_param[backend_device_idx].opencl_device_vendor_id == VENDOR_ID_NV)
+    {
+      if (hwmon_ctx->hm_nvml)
+      {
+        int speed = 0;
+
+        if (hm_NVML_nvmlDeviceGetFanSpeed (hashcat_ctx, hwmon_ctx->hm_device[backend_device_idx].nvml, (u32 *) &speed) == -1)
+        {
+          hwmon_ctx->hm_device[backend_device_idx].fanspeed_get_supported = false;
+
+          return -1;
+        }
+
+        return speed;
+      }
+    }
+  }
+
+  hwmon_ctx->hm_device[backend_device_idx].fanspeed_get_supported = false;
 
   return -1;
 }
 
-int hm_get_buslanes_with_device_id (hashcat_ctx_t *hashcat_ctx, const u32 device_id)
+int hm_get_buslanes_with_devices_idx (hashcat_ctx_t *hashcat_ctx, const int backend_device_idx)
 {
-  hwmon_ctx_t  *hwmon_ctx  = hashcat_ctx->hwmon_ctx;
-  opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
+  hwmon_ctx_t   *hwmon_ctx   = hashcat_ctx->hwmon_ctx;
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
 
   if (hwmon_ctx->enabled == false) return -1;
 
-  if (hwmon_ctx->hm_device[device_id].buslanes_get_supported == false) return -1;
+  if (hwmon_ctx->hm_device[backend_device_idx].buslanes_get_supported == false) return -1;
 
-  if ((opencl_ctx->devices_param[device_id].device_type & CL_DEVICE_TYPE_GPU) == 0) return -1;
-
-  if (opencl_ctx->devices_param[device_id].device_vendor_id == VENDOR_ID_AMD)
-  {
-    if (hwmon_ctx->hm_adl)
-    {
-      ADLPMActivity PMActivity;
-
-      PMActivity.iSize = sizeof (ADLPMActivity);
-
-      if (hm_ADL_Overdrive_CurrentActivity_Get (hashcat_ctx, hwmon_ctx->hm_device[device_id].adl, &PMActivity) == -1)
-      {
-        hwmon_ctx->hm_device[device_id].buslanes_get_supported = false;
-
-        return -1;
-      }
-
-      return PMActivity.iCurrentBusLanes;
-    }
-
-    if (hwmon_ctx->hm_sysfs)
-    {
-      int lanes;
-
-      if (hm_SYSFS_get_pp_dpm_pcie (hashcat_ctx, device_id, &lanes) == -1)
-      {
-        hwmon_ctx->hm_device[device_id].buslanes_get_supported = false;
-
-        return -1;
-      }
-
-      return lanes;
-    }
-  }
-
-  if (opencl_ctx->devices_param[device_id].device_vendor_id == VENDOR_ID_NV)
+  if (backend_ctx->devices_param[backend_device_idx].is_cuda == true)
   {
     if (hwmon_ctx->hm_nvml)
     {
       unsigned int currLinkWidth;
 
-      if (hm_NVML_nvmlDeviceGetCurrPcieLinkWidth (hashcat_ctx, hwmon_ctx->hm_device[device_id].nvml, &currLinkWidth) == -1)
+      if (hm_NVML_nvmlDeviceGetCurrPcieLinkWidth (hashcat_ctx, hwmon_ctx->hm_device[backend_device_idx].nvml, &currLinkWidth) == -1)
       {
-        hwmon_ctx->hm_device[device_id].buslanes_get_supported = false;
+        hwmon_ctx->hm_device[backend_device_idx].buslanes_get_supported = false;
 
         return -1;
       }
@@ -1741,50 +1794,84 @@ int hm_get_buslanes_with_device_id (hashcat_ctx_t *hashcat_ctx, const u32 device
     }
   }
 
-  hwmon_ctx->hm_device[device_id].buslanes_get_supported = false;
+  if (backend_ctx->devices_param[backend_device_idx].is_opencl == true)
+  {
+    if ((backend_ctx->devices_param[backend_device_idx].opencl_device_type & CL_DEVICE_TYPE_GPU) == 0) return -1;
+
+    if (backend_ctx->devices_param[backend_device_idx].opencl_device_vendor_id == VENDOR_ID_AMD)
+    {
+      if (hwmon_ctx->hm_adl)
+      {
+        ADLPMActivity PMActivity;
+
+        PMActivity.iSize = sizeof (ADLPMActivity);
+
+        if (hm_ADL_Overdrive_CurrentActivity_Get (hashcat_ctx, hwmon_ctx->hm_device[backend_device_idx].adl, &PMActivity) == -1)
+        {
+          hwmon_ctx->hm_device[backend_device_idx].buslanes_get_supported = false;
+
+          return -1;
+        }
+
+        return PMActivity.iCurrentBusLanes;
+      }
+
+      if (hwmon_ctx->hm_sysfs)
+      {
+        int lanes;
+
+        if (hm_SYSFS_get_pp_dpm_pcie (hashcat_ctx, backend_device_idx, &lanes) == -1)
+        {
+          hwmon_ctx->hm_device[backend_device_idx].buslanes_get_supported = false;
+
+          return -1;
+        }
+
+        return lanes;
+      }
+    }
+
+    if (backend_ctx->devices_param[backend_device_idx].opencl_device_vendor_id == VENDOR_ID_NV)
+    {
+      if (hwmon_ctx->hm_nvml)
+      {
+        unsigned int currLinkWidth;
+
+        if (hm_NVML_nvmlDeviceGetCurrPcieLinkWidth (hashcat_ctx, hwmon_ctx->hm_device[backend_device_idx].nvml, &currLinkWidth) == -1)
+        {
+          hwmon_ctx->hm_device[backend_device_idx].buslanes_get_supported = false;
+
+          return -1;
+        }
+
+        return currLinkWidth;
+      }
+    }
+  }
+
+  hwmon_ctx->hm_device[backend_device_idx].buslanes_get_supported = false;
 
   return -1;
 }
 
-int hm_get_utilization_with_device_id (hashcat_ctx_t *hashcat_ctx, const u32 device_id)
+int hm_get_utilization_with_devices_idx (hashcat_ctx_t *hashcat_ctx, const int backend_device_idx)
 {
-  hwmon_ctx_t  *hwmon_ctx  = hashcat_ctx->hwmon_ctx;
-  opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
+  hwmon_ctx_t   *hwmon_ctx   = hashcat_ctx->hwmon_ctx;
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
 
   if (hwmon_ctx->enabled == false) return -1;
 
-  if (hwmon_ctx->hm_device[device_id].utilization_get_supported == false) return -1;
+  if (hwmon_ctx->hm_device[backend_device_idx].utilization_get_supported == false) return -1;
 
-  if ((opencl_ctx->devices_param[device_id].device_type & CL_DEVICE_TYPE_GPU) == 0) return -1;
-
-  if (opencl_ctx->devices_param[device_id].device_vendor_id == VENDOR_ID_AMD)
-  {
-    if (hwmon_ctx->hm_adl)
-    {
-      ADLPMActivity PMActivity;
-
-      PMActivity.iSize = sizeof (ADLPMActivity);
-
-      if (hm_ADL_Overdrive_CurrentActivity_Get (hashcat_ctx, hwmon_ctx->hm_device[device_id].adl, &PMActivity) == -1)
-      {
-        hwmon_ctx->hm_device[device_id].utilization_get_supported = false;
-
-        return -1;
-      }
-
-      return PMActivity.iActivityPercent;
-    }
-  }
-
-  if (opencl_ctx->devices_param[device_id].device_vendor_id == VENDOR_ID_NV)
+  if (backend_ctx->devices_param[backend_device_idx].is_cuda == true)
   {
     if (hwmon_ctx->hm_nvml)
     {
       nvmlUtilization_t utilization;
 
-      if (hm_NVML_nvmlDeviceGetUtilizationRates (hashcat_ctx, hwmon_ctx->hm_device[device_id].nvml, &utilization) == -1)
+      if (hm_NVML_nvmlDeviceGetUtilizationRates (hashcat_ctx, hwmon_ctx->hm_device[backend_device_idx].nvml, &utilization) == -1)
       {
-        hwmon_ctx->hm_device[device_id].utilization_get_supported = false;
+        hwmon_ctx->hm_device[backend_device_idx].utilization_get_supported = false;
 
         return -1;
       }
@@ -1793,64 +1880,70 @@ int hm_get_utilization_with_device_id (hashcat_ctx_t *hashcat_ctx, const u32 dev
     }
   }
 
-  hwmon_ctx->hm_device[device_id].utilization_get_supported = false;
+  if (backend_ctx->devices_param[backend_device_idx].is_opencl == true)
+  {
+    if ((backend_ctx->devices_param[backend_device_idx].opencl_device_type & CL_DEVICE_TYPE_GPU) == 0) return -1;
+
+    if (backend_ctx->devices_param[backend_device_idx].opencl_device_vendor_id == VENDOR_ID_AMD)
+    {
+      if (hwmon_ctx->hm_adl)
+      {
+        ADLPMActivity PMActivity;
+
+        PMActivity.iSize = sizeof (ADLPMActivity);
+
+        if (hm_ADL_Overdrive_CurrentActivity_Get (hashcat_ctx, hwmon_ctx->hm_device[backend_device_idx].adl, &PMActivity) == -1)
+        {
+          hwmon_ctx->hm_device[backend_device_idx].utilization_get_supported = false;
+
+          return -1;
+        }
+
+        return PMActivity.iActivityPercent;
+      }
+    }
+
+    if (backend_ctx->devices_param[backend_device_idx].opencl_device_vendor_id == VENDOR_ID_NV)
+    {
+      if (hwmon_ctx->hm_nvml)
+      {
+        nvmlUtilization_t utilization;
+
+        if (hm_NVML_nvmlDeviceGetUtilizationRates (hashcat_ctx, hwmon_ctx->hm_device[backend_device_idx].nvml, &utilization) == -1)
+        {
+          hwmon_ctx->hm_device[backend_device_idx].utilization_get_supported = false;
+
+          return -1;
+        }
+
+        return utilization.gpu;
+      }
+    }
+  }
+
+  hwmon_ctx->hm_device[backend_device_idx].utilization_get_supported = false;
 
   return -1;
 }
 
-int hm_get_memoryspeed_with_device_id (hashcat_ctx_t *hashcat_ctx, const u32 device_id)
+int hm_get_memoryspeed_with_devices_idx (hashcat_ctx_t *hashcat_ctx, const int backend_device_idx)
 {
-  hwmon_ctx_t  *hwmon_ctx  = hashcat_ctx->hwmon_ctx;
-  opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
+  hwmon_ctx_t   *hwmon_ctx   = hashcat_ctx->hwmon_ctx;
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
 
   if (hwmon_ctx->enabled == false) return -1;
 
-  if (hwmon_ctx->hm_device[device_id].memoryspeed_get_supported == false) return -1;
+  if (hwmon_ctx->hm_device[backend_device_idx].memoryspeed_get_supported == false) return -1;
 
-  if ((opencl_ctx->devices_param[device_id].device_type & CL_DEVICE_TYPE_GPU) == 0) return -1;
-
-  if (opencl_ctx->devices_param[device_id].device_vendor_id == VENDOR_ID_AMD)
-  {
-    if (hwmon_ctx->hm_adl)
-    {
-      ADLPMActivity PMActivity;
-
-      PMActivity.iSize = sizeof (ADLPMActivity);
-
-      if (hm_ADL_Overdrive_CurrentActivity_Get (hashcat_ctx, hwmon_ctx->hm_device[device_id].adl, &PMActivity) == -1)
-      {
-        hwmon_ctx->hm_device[device_id].memoryspeed_get_supported = false;
-
-        return -1;
-      }
-
-      return PMActivity.iMemoryClock / 100;
-    }
-
-    if (hwmon_ctx->hm_sysfs)
-    {
-      int clockfreq;
-
-      if (hm_SYSFS_get_pp_dpm_mclk (hashcat_ctx, device_id, &clockfreq) == -1)
-      {
-        hwmon_ctx->hm_device[device_id].memoryspeed_get_supported = false;
-
-        return -1;
-      }
-
-      return clockfreq;
-    }
-  }
-
-  if (opencl_ctx->devices_param[device_id].device_vendor_id == VENDOR_ID_NV)
+  if (backend_ctx->devices_param[backend_device_idx].is_cuda == true)
   {
     if (hwmon_ctx->hm_nvml)
     {
       unsigned int clockfreq;
 
-      if (hm_NVML_nvmlDeviceGetClockInfo (hashcat_ctx, hwmon_ctx->hm_device[device_id].nvml, NVML_CLOCK_MEM, &clockfreq) == -1)
+      if (hm_NVML_nvmlDeviceGetClockInfo (hashcat_ctx, hwmon_ctx->hm_device[backend_device_idx].nvml, NVML_CLOCK_MEM, &clockfreq) == -1)
       {
-        hwmon_ctx->hm_device[device_id].memoryspeed_get_supported = false;
+        hwmon_ctx->hm_device[backend_device_idx].memoryspeed_get_supported = false;
 
         return -1;
       }
@@ -1859,64 +1952,84 @@ int hm_get_memoryspeed_with_device_id (hashcat_ctx_t *hashcat_ctx, const u32 dev
     }
   }
 
-  hwmon_ctx->hm_device[device_id].memoryspeed_get_supported = false;
+  if (backend_ctx->devices_param[backend_device_idx].is_opencl == true)
+  {
+    if ((backend_ctx->devices_param[backend_device_idx].opencl_device_type & CL_DEVICE_TYPE_GPU) == 0) return -1;
+
+    if (backend_ctx->devices_param[backend_device_idx].opencl_device_vendor_id == VENDOR_ID_AMD)
+    {
+      if (hwmon_ctx->hm_adl)
+      {
+        ADLPMActivity PMActivity;
+
+        PMActivity.iSize = sizeof (ADLPMActivity);
+
+        if (hm_ADL_Overdrive_CurrentActivity_Get (hashcat_ctx, hwmon_ctx->hm_device[backend_device_idx].adl, &PMActivity) == -1)
+        {
+          hwmon_ctx->hm_device[backend_device_idx].memoryspeed_get_supported = false;
+
+          return -1;
+        }
+
+        return PMActivity.iMemoryClock / 100;
+      }
+
+      if (hwmon_ctx->hm_sysfs)
+      {
+        int clockfreq;
+
+        if (hm_SYSFS_get_pp_dpm_mclk (hashcat_ctx, backend_device_idx, &clockfreq) == -1)
+        {
+          hwmon_ctx->hm_device[backend_device_idx].memoryspeed_get_supported = false;
+
+          return -1;
+        }
+
+        return clockfreq;
+      }
+    }
+
+    if (backend_ctx->devices_param[backend_device_idx].opencl_device_vendor_id == VENDOR_ID_NV)
+    {
+      if (hwmon_ctx->hm_nvml)
+      {
+        unsigned int clockfreq;
+
+        if (hm_NVML_nvmlDeviceGetClockInfo (hashcat_ctx, hwmon_ctx->hm_device[backend_device_idx].nvml, NVML_CLOCK_MEM, &clockfreq) == -1)
+        {
+          hwmon_ctx->hm_device[backend_device_idx].memoryspeed_get_supported = false;
+
+          return -1;
+        }
+
+        return clockfreq;
+      }
+    }
+  }
+
+  hwmon_ctx->hm_device[backend_device_idx].memoryspeed_get_supported = false;
 
   return -1;
 }
 
-int hm_get_corespeed_with_device_id (hashcat_ctx_t *hashcat_ctx, const u32 device_id)
+int hm_get_corespeed_with_devices_idx (hashcat_ctx_t *hashcat_ctx, const int backend_device_idx)
 {
-  hwmon_ctx_t  *hwmon_ctx  = hashcat_ctx->hwmon_ctx;
-  opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
+  hwmon_ctx_t   *hwmon_ctx   = hashcat_ctx->hwmon_ctx;
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
 
   if (hwmon_ctx->enabled == false) return -1;
 
-  if (hwmon_ctx->hm_device[device_id].corespeed_get_supported == false) return -1;
+  if (hwmon_ctx->hm_device[backend_device_idx].corespeed_get_supported == false) return -1;
 
-  if ((opencl_ctx->devices_param[device_id].device_type & CL_DEVICE_TYPE_GPU) == 0) return -1;
-
-  if (opencl_ctx->devices_param[device_id].device_vendor_id == VENDOR_ID_AMD)
-  {
-    if (hwmon_ctx->hm_adl)
-    {
-      ADLPMActivity PMActivity;
-
-      PMActivity.iSize = sizeof (ADLPMActivity);
-
-      if (hm_ADL_Overdrive_CurrentActivity_Get (hashcat_ctx, hwmon_ctx->hm_device[device_id].adl, &PMActivity) == -1)
-      {
-        hwmon_ctx->hm_device[device_id].corespeed_get_supported = false;
-
-        return -1;
-      }
-
-      return PMActivity.iEngineClock / 100;
-    }
-
-    if (hwmon_ctx->hm_sysfs)
-    {
-      int clockfreq;
-
-      if (hm_SYSFS_get_pp_dpm_sclk (hashcat_ctx, device_id, &clockfreq) == -1)
-      {
-        hwmon_ctx->hm_device[device_id].corespeed_get_supported = false;
-
-        return -1;
-      }
-
-      return clockfreq;
-    }
-  }
-
-  if (opencl_ctx->devices_param[device_id].device_vendor_id == VENDOR_ID_NV)
+  if (backend_ctx->devices_param[backend_device_idx].is_cuda == true)
   {
     if (hwmon_ctx->hm_nvml)
     {
       unsigned int clockfreq;
 
-      if (hm_NVML_nvmlDeviceGetClockInfo (hashcat_ctx, hwmon_ctx->hm_device[device_id].nvml, NVML_CLOCK_SM, &clockfreq) == -1)
+      if (hm_NVML_nvmlDeviceGetClockInfo (hashcat_ctx, hwmon_ctx->hm_device[backend_device_idx].nvml, NVML_CLOCK_SM, &clockfreq) == -1)
       {
-        hwmon_ctx->hm_device[device_id].corespeed_get_supported = false;
+        hwmon_ctx->hm_device[backend_device_idx].corespeed_get_supported = false;
 
         return -1;
       }
@@ -1925,27 +2038,76 @@ int hm_get_corespeed_with_device_id (hashcat_ctx_t *hashcat_ctx, const u32 devic
     }
   }
 
-  hwmon_ctx->hm_device[device_id].corespeed_get_supported = false;
+  if (backend_ctx->devices_param[backend_device_idx].is_opencl == true)
+  {
+    if ((backend_ctx->devices_param[backend_device_idx].opencl_device_type & CL_DEVICE_TYPE_GPU) == 0) return -1;
+
+    if (backend_ctx->devices_param[backend_device_idx].opencl_device_vendor_id == VENDOR_ID_AMD)
+    {
+      if (hwmon_ctx->hm_adl)
+      {
+        ADLPMActivity PMActivity;
+
+        PMActivity.iSize = sizeof (ADLPMActivity);
+
+        if (hm_ADL_Overdrive_CurrentActivity_Get (hashcat_ctx, hwmon_ctx->hm_device[backend_device_idx].adl, &PMActivity) == -1)
+        {
+          hwmon_ctx->hm_device[backend_device_idx].corespeed_get_supported = false;
+
+          return -1;
+        }
+
+        return PMActivity.iEngineClock / 100;
+      }
+
+      if (hwmon_ctx->hm_sysfs)
+      {
+        int clockfreq;
+
+        if (hm_SYSFS_get_pp_dpm_sclk (hashcat_ctx, backend_device_idx, &clockfreq) == -1)
+        {
+          hwmon_ctx->hm_device[backend_device_idx].corespeed_get_supported = false;
+
+          return -1;
+        }
+
+        return clockfreq;
+      }
+    }
+
+    if (backend_ctx->devices_param[backend_device_idx].opencl_device_vendor_id == VENDOR_ID_NV)
+    {
+      if (hwmon_ctx->hm_nvml)
+      {
+        unsigned int clockfreq;
+
+        if (hm_NVML_nvmlDeviceGetClockInfo (hashcat_ctx, hwmon_ctx->hm_device[backend_device_idx].nvml, NVML_CLOCK_SM, &clockfreq) == -1)
+        {
+          hwmon_ctx->hm_device[backend_device_idx].corespeed_get_supported = false;
+
+          return -1;
+        }
+
+        return clockfreq;
+      }
+    }
+  }
+
+  hwmon_ctx->hm_device[backend_device_idx].corespeed_get_supported = false;
 
   return -1;
 }
 
-int hm_get_throttle_with_device_id (hashcat_ctx_t *hashcat_ctx, const u32 device_id)
+int hm_get_throttle_with_devices_idx (hashcat_ctx_t *hashcat_ctx, const int backend_device_idx)
 {
-  hwmon_ctx_t  *hwmon_ctx  = hashcat_ctx->hwmon_ctx;
-  opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
+  hwmon_ctx_t   *hwmon_ctx   = hashcat_ctx->hwmon_ctx;
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
 
   if (hwmon_ctx->enabled == false) return -1;
 
-  if (hwmon_ctx->hm_device[device_id].throttle_get_supported == false) return -1;
+  if (hwmon_ctx->hm_device[backend_device_idx].throttle_get_supported == false) return -1;
 
-  if ((opencl_ctx->devices_param[device_id].device_type & CL_DEVICE_TYPE_GPU) == 0) return -1;
-
-  if (opencl_ctx->devices_param[device_id].device_vendor_id == VENDOR_ID_AMD)
-  {
-  }
-
-  if (opencl_ctx->devices_param[device_id].device_vendor_id == VENDOR_ID_NV)
+  if (backend_ctx->devices_param[backend_device_idx].is_cuda == true)
   {
     if (hwmon_ctx->hm_nvml)
     {
@@ -1953,15 +2115,15 @@ int hm_get_throttle_with_device_id (hashcat_ctx_t *hashcat_ctx, const u32 device
       unsigned long long clocksThrottleReasons = 0;
       unsigned long long supportedThrottleReasons = 0;
 
-      if (hm_NVML_nvmlDeviceGetCurrentClocksThrottleReasons   (hashcat_ctx, hwmon_ctx->hm_device[device_id].nvml, &clocksThrottleReasons)    == -1) return -1;
-      if (hm_NVML_nvmlDeviceGetSupportedClocksThrottleReasons (hashcat_ctx, hwmon_ctx->hm_device[device_id].nvml, &supportedThrottleReasons) == -1) return -1;
+      if (hm_NVML_nvmlDeviceGetCurrentClocksThrottleReasons   (hashcat_ctx, hwmon_ctx->hm_device[backend_device_idx].nvml, &clocksThrottleReasons)    == -1) return -1;
+      if (hm_NVML_nvmlDeviceGetSupportedClocksThrottleReasons (hashcat_ctx, hwmon_ctx->hm_device[backend_device_idx].nvml, &supportedThrottleReasons) == -1) return -1;
 
       clocksThrottleReasons &=  supportedThrottleReasons;
       clocksThrottleReasons &= ~nvmlClocksThrottleReasonGpuIdle;
       clocksThrottleReasons &= ~nvmlClocksThrottleReasonApplicationsClocksSetting;
       clocksThrottleReasons &= ~nvmlClocksThrottleReasonUnknown;
 
-      if (opencl_ctx->kernel_power_final)
+      if (backend_ctx->kernel_power_final)
       {
         clocksThrottleReasons &= ~nvmlClocksThrottleReasonHwSlowdown;
       }
@@ -1981,17 +2143,72 @@ int hm_get_throttle_with_device_id (hashcat_ctx_t *hashcat_ctx, const u32 device
       perfPolicies_info.version   = MAKE_NVAPI_VERSION (NV_GPU_PERF_POLICIES_INFO_PARAMS_V1, 1);
       perfPolicies_status.version = MAKE_NVAPI_VERSION (NV_GPU_PERF_POLICIES_STATUS_PARAMS_V1, 1);
 
-      hm_NvAPI_GPU_GetPerfPoliciesInfo (hashcat_ctx, hwmon_ctx->hm_device[device_id].nvapi, &perfPolicies_info);
+      hm_NvAPI_GPU_GetPerfPoliciesInfo (hashcat_ctx, hwmon_ctx->hm_device[backend_device_idx].nvapi, &perfPolicies_info);
 
       perfPolicies_status.info_value = perfPolicies_info.info_value;
 
-      hm_NvAPI_GPU_GetPerfPoliciesStatus (hashcat_ctx, hwmon_ctx->hm_device[device_id].nvapi, &perfPolicies_status);
+      hm_NvAPI_GPU_GetPerfPoliciesStatus (hashcat_ctx, hwmon_ctx->hm_device[backend_device_idx].nvapi, &perfPolicies_status);
 
       return perfPolicies_status.throttle & 2;
     }
   }
 
-  hwmon_ctx->hm_device[device_id].throttle_get_supported = false;
+  if (backend_ctx->devices_param[backend_device_idx].is_opencl == true)
+  {
+    if ((backend_ctx->devices_param[backend_device_idx].opencl_device_type & CL_DEVICE_TYPE_GPU) == 0) return -1;
+
+    if (backend_ctx->devices_param[backend_device_idx].opencl_device_vendor_id == VENDOR_ID_AMD)
+    {
+    }
+
+    if (backend_ctx->devices_param[backend_device_idx].opencl_device_vendor_id == VENDOR_ID_NV)
+    {
+      if (hwmon_ctx->hm_nvml)
+      {
+        /* this is triggered by mask generator, too. therefore useless
+        unsigned long long clocksThrottleReasons = 0;
+        unsigned long long supportedThrottleReasons = 0;
+
+        if (hm_NVML_nvmlDeviceGetCurrentClocksThrottleReasons   (hashcat_ctx, hwmon_ctx->hm_device[backend_device_idx].nvml, &clocksThrottleReasons)    == -1) return -1;
+        if (hm_NVML_nvmlDeviceGetSupportedClocksThrottleReasons (hashcat_ctx, hwmon_ctx->hm_device[backend_device_idx].nvml, &supportedThrottleReasons) == -1) return -1;
+
+        clocksThrottleReasons &=  supportedThrottleReasons;
+        clocksThrottleReasons &= ~nvmlClocksThrottleReasonGpuIdle;
+        clocksThrottleReasons &= ~nvmlClocksThrottleReasonApplicationsClocksSetting;
+        clocksThrottleReasons &= ~nvmlClocksThrottleReasonUnknown;
+
+        if (backend_ctx->kernel_power_final)
+        {
+          clocksThrottleReasons &= ~nvmlClocksThrottleReasonHwSlowdown;
+        }
+
+        return (clocksThrottleReasons != nvmlClocksThrottleReasonNone);
+        */
+      }
+
+      if (hwmon_ctx->hm_nvapi)
+      {
+        NV_GPU_PERF_POLICIES_INFO_PARAMS_V1   perfPolicies_info;
+        NV_GPU_PERF_POLICIES_STATUS_PARAMS_V1 perfPolicies_status;
+
+        memset (&perfPolicies_info,   0, sizeof (NV_GPU_PERF_POLICIES_INFO_PARAMS_V1));
+        memset (&perfPolicies_status, 0, sizeof (NV_GPU_PERF_POLICIES_STATUS_PARAMS_V1));
+
+        perfPolicies_info.version   = MAKE_NVAPI_VERSION (NV_GPU_PERF_POLICIES_INFO_PARAMS_V1, 1);
+        perfPolicies_status.version = MAKE_NVAPI_VERSION (NV_GPU_PERF_POLICIES_STATUS_PARAMS_V1, 1);
+
+        hm_NvAPI_GPU_GetPerfPoliciesInfo (hashcat_ctx, hwmon_ctx->hm_device[backend_device_idx].nvapi, &perfPolicies_info);
+
+        perfPolicies_status.info_value = perfPolicies_info.info_value;
+
+        hm_NvAPI_GPU_GetPerfPoliciesStatus (hashcat_ctx, hwmon_ctx->hm_device[backend_device_idx].nvapi, &perfPolicies_status);
+
+        return perfPolicies_status.throttle & 2;
+      }
+    }
+  }
+
+  hwmon_ctx->hm_device[backend_device_idx].throttle_get_supported = false;
 
   return -1;
 }
@@ -1999,7 +2216,7 @@ int hm_get_throttle_with_device_id (hashcat_ctx_t *hashcat_ctx, const u32 device
 int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx)
 {
   hwmon_ctx_t    *hwmon_ctx    = hashcat_ctx->hwmon_ctx;
-  opencl_ctx_t   *opencl_ctx   = hashcat_ctx->opencl_ctx;
+  backend_ctx_t  *backend_ctx  = hashcat_ctx->backend_ctx;
   user_options_t *user_options = hashcat_ctx->user_options;
 
   hwmon_ctx->enabled = false;
@@ -2008,15 +2225,15 @@ int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx)
   return 0;
   #endif // WITH_HWMON
 
-  if (user_options->example_hashes    == true) return 0;
-  if (user_options->keyspace          == true) return 0;
-  if (user_options->left              == true) return 0;
-  if (user_options->opencl_info       == true) return 0;
-  if (user_options->show              == true) return 0;
-  if (user_options->stdout_flag       == true) return 0;
-  if (user_options->usage             == true) return 0;
-  if (user_options->version           == true) return 0;
-  if (user_options->hwmon_disable     == true) return 0;
+  if (user_options->example_hashes  == true) return 0;
+  if (user_options->keyspace        == true) return 0;
+  if (user_options->left            == true) return 0;
+  if (user_options->backend_info    == true) return 0;
+  if (user_options->show            == true) return 0;
+  if (user_options->stdout_flag     == true) return 0;
+  if (user_options->usage           == true) return 0;
+  if (user_options->version         == true) return 0;
+  if (user_options->hwmon_disable   == true) return 0;
 
   hwmon_ctx->hm_device = (hm_attrs_t *) hccalloc (DEVICES_MAX, sizeof (hm_attrs_t));
 
@@ -2037,7 +2254,7 @@ int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx)
     hcfree (hm_adapters_sysfs); \
   }
 
-  if (opencl_ctx->need_nvml == true)
+  if (backend_ctx->need_nvml == true)
   {
     hwmon_ctx->hm_nvml = (NVML_PTR *) hcmalloc (sizeof (NVML_PTR));
 
@@ -2049,7 +2266,7 @@ int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx)
     }
   }
 
-  if ((opencl_ctx->need_nvapi == true) && (hwmon_ctx->hm_nvml)) // nvapi can't work alone, we need nvml, too
+  if ((backend_ctx->need_nvapi == true) && (hwmon_ctx->hm_nvml)) // nvapi can't work alone, we need nvml, too
   {
     hwmon_ctx->hm_nvapi = (NVAPI_PTR *) hcmalloc (sizeof (NVAPI_PTR));
 
@@ -2061,7 +2278,7 @@ int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx)
     }
   }
 
-  if (opencl_ctx->need_adl == true)
+  if (backend_ctx->need_adl == true)
   {
     hwmon_ctx->hm_adl = (ADL_PTR *) hcmalloc (sizeof (ADL_PTR));
 
@@ -2073,7 +2290,7 @@ int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx)
     }
   }
 
-  if (opencl_ctx->need_sysfs == true)
+  if (backend_ctx->need_sysfs == true)
   {
     hwmon_ctx->hm_sysfs = (SYSFS_PTR *) hcmalloc (sizeof (SYSFS_PTR));
 
@@ -2102,40 +2319,73 @@ int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx)
 
       int tmp_in = hm_get_adapter_index_nvml (hashcat_ctx, nvmlGPUHandle);
 
-      for (u32 device_id = 0; device_id < opencl_ctx->devices_cnt; device_id++)
+      for (int backend_devices_idx = 0; backend_devices_idx < backend_ctx->backend_devices_cnt; backend_devices_idx++)
       {
-        hc_device_param_t *device_param = &opencl_ctx->devices_param[device_id];
+        hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
 
         if (device_param->skipped == true) continue;
 
-        if ((device_param->device_type & CL_DEVICE_TYPE_GPU) == 0) continue;
-
-        if (device_param->device_vendor_id != VENDOR_ID_NV) continue;
-
-        for (int i = 0; i < tmp_in; i++)
+        if (device_param->is_cuda == true)
         {
-          nvmlPciInfo_t pci;
-
-          int rc = hm_NVML_nvmlDeviceGetPciInfo (hashcat_ctx, nvmlGPUHandle[i], &pci);
-
-          if (rc == -1) continue;
-
-          if ((device_param->pcie_bus      == pci.bus)
-           && (device_param->pcie_device   == (pci.device >> 3))
-           && (device_param->pcie_function == (pci.device & 7)))
+          for (int i = 0; i < tmp_in; i++)
           {
-            const u32 platform_devices_id = device_param->platform_devices_id;
+            nvmlPciInfo_t pci;
 
-            hm_adapters_nvml[platform_devices_id].nvml = nvmlGPUHandle[i];
+            int rc = hm_NVML_nvmlDeviceGetPciInfo (hashcat_ctx, nvmlGPUHandle[i], &pci);
 
-            hm_adapters_nvml[platform_devices_id].buslanes_get_supported            = true;
-            hm_adapters_nvml[platform_devices_id].corespeed_get_supported           = true;
-            hm_adapters_nvml[platform_devices_id].fanspeed_get_supported            = true;
-            hm_adapters_nvml[platform_devices_id].memoryspeed_get_supported         = true;
-            hm_adapters_nvml[platform_devices_id].temperature_get_supported         = true;
-            hm_adapters_nvml[platform_devices_id].threshold_shutdown_get_supported  = true;
-            hm_adapters_nvml[platform_devices_id].threshold_slowdown_get_supported  = true;
-            hm_adapters_nvml[platform_devices_id].utilization_get_supported         = true;
+            if (rc == -1) continue;
+
+            if ((device_param->pcie_bus      == pci.bus)
+             && (device_param->pcie_device   == (pci.device >> 3))
+             && (device_param->pcie_function == (pci.device & 7)))
+            {
+              const u32 device_id = device_param->device_id;
+
+              hm_adapters_nvml[device_id].nvml = nvmlGPUHandle[i];
+
+              hm_adapters_nvml[device_id].buslanes_get_supported            = true;
+              hm_adapters_nvml[device_id].corespeed_get_supported           = true;
+              hm_adapters_nvml[device_id].fanspeed_get_supported            = true;
+              hm_adapters_nvml[device_id].memoryspeed_get_supported         = true;
+              hm_adapters_nvml[device_id].temperature_get_supported         = true;
+              hm_adapters_nvml[device_id].threshold_shutdown_get_supported  = true;
+              hm_adapters_nvml[device_id].threshold_slowdown_get_supported  = true;
+              hm_adapters_nvml[device_id].utilization_get_supported         = true;
+            }
+          }
+        }
+
+        if (device_param->is_opencl == true)
+        {
+          if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) == 0) continue;
+
+          if (device_param->opencl_device_vendor_id != VENDOR_ID_NV) continue;
+
+          for (int i = 0; i < tmp_in; i++)
+          {
+            nvmlPciInfo_t pci;
+
+            int rc = hm_NVML_nvmlDeviceGetPciInfo (hashcat_ctx, nvmlGPUHandle[i], &pci);
+
+            if (rc == -1) continue;
+
+            if ((device_param->pcie_bus      == pci.bus)
+             && (device_param->pcie_device   == (pci.device >> 3))
+             && (device_param->pcie_function == (pci.device & 7)))
+            {
+              const u32 device_id = device_param->device_id;
+
+              hm_adapters_nvml[device_id].nvml = nvmlGPUHandle[i];
+
+              hm_adapters_nvml[device_id].buslanes_get_supported            = true;
+              hm_adapters_nvml[device_id].corespeed_get_supported           = true;
+              hm_adapters_nvml[device_id].fanspeed_get_supported            = true;
+              hm_adapters_nvml[device_id].memoryspeed_get_supported         = true;
+              hm_adapters_nvml[device_id].temperature_get_supported         = true;
+              hm_adapters_nvml[device_id].threshold_shutdown_get_supported  = true;
+              hm_adapters_nvml[device_id].threshold_slowdown_get_supported  = true;
+              hm_adapters_nvml[device_id].utilization_get_supported         = true;
+            }
           }
         }
       }
@@ -2152,39 +2402,71 @@ int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx)
 
       int tmp_in = hm_get_adapter_index_nvapi (hashcat_ctx, nvGPUHandle);
 
-      for (u32 device_id = 0; device_id < opencl_ctx->devices_cnt; device_id++)
+      for (int backend_devices_idx = 0; backend_devices_idx < backend_ctx->backend_devices_cnt; backend_devices_idx++)
       {
-        hc_device_param_t *device_param = &opencl_ctx->devices_param[device_id];
+        hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
 
         if (device_param->skipped == true) continue;
 
-        if ((device_param->device_type & CL_DEVICE_TYPE_GPU) == 0) continue;
-
-        if (device_param->device_vendor_id != VENDOR_ID_NV) continue;
-
-        for (int i = 0; i < tmp_in; i++)
+        if (device_param->is_cuda == true)
         {
-          NvU32 BusId     = 0;
-          NvU32 BusSlotId = 0;
-
-          int rc1 = hm_NvAPI_GPU_GetBusId (hashcat_ctx, nvGPUHandle[i], &BusId);
-
-          if (rc1 == -1) continue;
-
-          int rc2 = hm_NvAPI_GPU_GetBusSlotId (hashcat_ctx, nvGPUHandle[i], &BusSlotId);
-
-          if (rc2 == -1) continue;
-
-          if ((device_param->pcie_bus      == BusId)
-           && (device_param->pcie_device   == (BusSlotId >> 3))
-           && (device_param->pcie_function == (BusSlotId & 7)))
+          for (int i = 0; i < tmp_in; i++)
           {
-            const u32 platform_devices_id = device_param->platform_devices_id;
+            NvU32 BusId     = 0;
+            NvU32 BusSlotId = 0;
 
-            hm_adapters_nvapi[platform_devices_id].nvapi = nvGPUHandle[i];
+            int rc1 = hm_NvAPI_GPU_GetBusId (hashcat_ctx, nvGPUHandle[i], &BusId);
 
-            hm_adapters_nvapi[platform_devices_id].fanpolicy_get_supported  = true;
-            hm_adapters_nvapi[platform_devices_id].throttle_get_supported   = true;
+            if (rc1 == -1) continue;
+
+            int rc2 = hm_NvAPI_GPU_GetBusSlotId (hashcat_ctx, nvGPUHandle[i], &BusSlotId);
+
+            if (rc2 == -1) continue;
+
+            if ((device_param->pcie_bus      == BusId)
+             && (device_param->pcie_device   == (BusSlotId >> 3))
+             && (device_param->pcie_function == (BusSlotId & 7)))
+            {
+              const u32 device_id = device_param->device_id;
+
+              hm_adapters_nvapi[device_id].nvapi = nvGPUHandle[i];
+
+              hm_adapters_nvapi[device_id].fanpolicy_get_supported  = true;
+              hm_adapters_nvapi[device_id].throttle_get_supported   = true;
+            }
+          }
+        }
+
+        if (device_param->is_opencl == true)
+        {
+          if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) == 0) continue;
+
+          if (device_param->opencl_device_vendor_id != VENDOR_ID_NV) continue;
+
+          for (int i = 0; i < tmp_in; i++)
+          {
+            NvU32 BusId     = 0;
+            NvU32 BusSlotId = 0;
+
+            int rc1 = hm_NvAPI_GPU_GetBusId (hashcat_ctx, nvGPUHandle[i], &BusId);
+
+            if (rc1 == -1) continue;
+
+            int rc2 = hm_NvAPI_GPU_GetBusSlotId (hashcat_ctx, nvGPUHandle[i], &BusSlotId);
+
+            if (rc2 == -1) continue;
+
+            if ((device_param->pcie_bus      == BusId)
+             && (device_param->pcie_device   == (BusSlotId >> 3))
+             && (device_param->pcie_function == (BusSlotId & 7)))
+            {
+              const u32 device_id = device_param->device_id;
+
+              hm_adapters_nvapi[device_id].nvapi = nvGPUHandle[i];
+
+              hm_adapters_nvapi[device_id].fanpolicy_get_supported  = true;
+              hm_adapters_nvapi[device_id].throttle_get_supported   = true;
+            }
           }
         }
       }
@@ -2221,42 +2503,50 @@ int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx)
         return -1;
       }
 
-      for (u32 device_id = 0; device_id < opencl_ctx->devices_cnt; device_id++)
+      for (int backend_devices_idx = 0; backend_devices_idx < backend_ctx->backend_devices_cnt; backend_devices_idx++)
       {
-        hc_device_param_t *device_param = &opencl_ctx->devices_param[device_id];
+        hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
 
         if (device_param->skipped == true) continue;
 
-        if ((device_param->device_type & CL_DEVICE_TYPE_GPU) == 0) continue;
-
-        if (device_param->device_vendor_id != VENDOR_ID_AMD) continue;
-
-        for (int i = 0; i < tmp_in; i++)
+        if (device_param->is_cuda == true)
         {
-          if ((device_param->pcie_bus      == lpAdapterInfo[i].iBusNumber)
-           && (device_param->pcie_device   == (lpAdapterInfo[i].iDeviceNumber >> 3))
-           && (device_param->pcie_function == (lpAdapterInfo[i].iDeviceNumber & 7)))
+          // nothing to do
+        }
+
+        if (device_param->is_opencl == true)
+        {
+          if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) == 0) continue;
+
+          if (device_param->opencl_device_vendor_id != VENDOR_ID_AMD) continue;
+
+          for (int i = 0; i < tmp_in; i++)
           {
-            const u32 platform_devices_id = device_param->platform_devices_id;
+            if ((device_param->pcie_bus      == lpAdapterInfo[i].iBusNumber)
+             && (device_param->pcie_device   == (lpAdapterInfo[i].iDeviceNumber >> 3))
+             && (device_param->pcie_function == (lpAdapterInfo[i].iDeviceNumber & 7)))
+            {
+              const u32 device_id = device_param->device_id;
 
-            int od_supported = 0;
-            int od_enabled   = 0;
-            int od_version   = 0;
+              int od_supported = 0;
+              int od_enabled   = 0;
+              int od_version   = 0;
 
-            hm_ADL_Overdrive_Caps (hashcat_ctx, lpAdapterInfo[i].iAdapterIndex, &od_supported, &od_enabled, &od_version);
+              hm_ADL_Overdrive_Caps (hashcat_ctx, lpAdapterInfo[i].iAdapterIndex, &od_supported, &od_enabled, &od_version);
 
-            hm_adapters_adl[platform_devices_id].od_version = od_version;
+              hm_adapters_adl[device_id].od_version = od_version;
 
-            hm_adapters_adl[platform_devices_id].adl = lpAdapterInfo[i].iAdapterIndex;
+              hm_adapters_adl[device_id].adl = lpAdapterInfo[i].iAdapterIndex;
 
-            hm_adapters_adl[platform_devices_id].buslanes_get_supported            = true;
-            hm_adapters_adl[platform_devices_id].corespeed_get_supported           = true;
-            hm_adapters_adl[platform_devices_id].fanspeed_get_supported            = true;
-            hm_adapters_adl[platform_devices_id].fanpolicy_get_supported           = true;
-            hm_adapters_adl[platform_devices_id].memoryspeed_get_supported         = true;
-            hm_adapters_adl[platform_devices_id].temperature_get_supported         = true;
-            hm_adapters_adl[platform_devices_id].threshold_slowdown_get_supported  = true;
-            hm_adapters_adl[platform_devices_id].utilization_get_supported         = true;
+              hm_adapters_adl[device_id].buslanes_get_supported            = true;
+              hm_adapters_adl[device_id].corespeed_get_supported           = true;
+              hm_adapters_adl[device_id].fanspeed_get_supported            = true;
+              hm_adapters_adl[device_id].fanpolicy_get_supported           = true;
+              hm_adapters_adl[device_id].memoryspeed_get_supported         = true;
+              hm_adapters_adl[device_id].temperature_get_supported         = true;
+              hm_adapters_adl[device_id].threshold_slowdown_get_supported  = true;
+              hm_adapters_adl[device_id].utilization_get_supported         = true;
+            }
           }
         }
       }
@@ -2271,22 +2561,30 @@ int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx)
     {
       int hm_adapters_id = 0;
 
-      for (u32 device_id = 0; device_id < opencl_ctx->devices_cnt; device_id++)
+      for (int backend_devices_idx = 0; backend_devices_idx < backend_ctx->backend_devices_cnt; backend_devices_idx++)
       {
-        hc_device_param_t *device_param = &opencl_ctx->devices_param[device_id];
+        hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
 
-        if ((device_param->device_type & CL_DEVICE_TYPE_GPU) == 0) continue;
+        if (device_param->is_cuda == true)
+        {
+          // nothing to do
+        }
 
-        hm_adapters_sysfs[hm_adapters_id].sysfs = device_id;
+        if (device_param->is_opencl == true)
+        {
+          if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) == 0) continue;
 
-        hm_adapters_sysfs[hm_adapters_id].buslanes_get_supported    = true;
-        hm_adapters_sysfs[hm_adapters_id].corespeed_get_supported   = true;
-        hm_adapters_sysfs[hm_adapters_id].fanspeed_get_supported    = true;
-        hm_adapters_sysfs[hm_adapters_id].fanpolicy_get_supported   = true;
-        hm_adapters_sysfs[hm_adapters_id].memoryspeed_get_supported = true;
-        hm_adapters_sysfs[hm_adapters_id].temperature_get_supported = true;
+          hm_adapters_sysfs[hm_adapters_id].sysfs = backend_devices_idx; // ????
 
-        hm_adapters_id++;
+          hm_adapters_sysfs[hm_adapters_id].buslanes_get_supported    = true;
+          hm_adapters_sysfs[hm_adapters_id].corespeed_get_supported   = true;
+          hm_adapters_sysfs[hm_adapters_id].fanspeed_get_supported    = true;
+          hm_adapters_sysfs[hm_adapters_id].fanpolicy_get_supported   = true;
+          hm_adapters_sysfs[hm_adapters_id].memoryspeed_get_supported = true;
+          hm_adapters_sysfs[hm_adapters_id].temperature_get_supported = true;
+
+          hm_adapters_id++;
+        }
       }
     }
   }
@@ -2308,111 +2606,151 @@ int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx)
    * save buffer required for later restores
    */
 
-  hwmon_ctx->od_clock_mem_status = (ADLOD6MemClockState *) hccalloc (opencl_ctx->devices_cnt, sizeof (ADLOD6MemClockState));
+  hwmon_ctx->od_clock_mem_status = (ADLOD6MemClockState *) hccalloc (backend_ctx->backend_devices_cnt, sizeof (ADLOD6MemClockState));
 
   /**
    * HM devices: copy
    */
 
-  for (u32 device_id = 0; device_id < opencl_ctx->devices_cnt; device_id++)
+  for (int backend_devices_idx = 0; backend_devices_idx < backend_ctx->backend_devices_cnt; backend_devices_idx++)
   {
-    hc_device_param_t *device_param = &opencl_ctx->devices_param[device_id];
+    hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
 
     if (device_param->skipped == true) continue;
 
-    if ((device_param->device_type & CL_DEVICE_TYPE_GPU) == 0) continue;
+    const u32 device_id = device_param->device_id;
 
-    const u32 platform_devices_id = device_param->platform_devices_id;
-
-    if (device_param->device_vendor_id == VENDOR_ID_AMD)
+    if (device_param->is_cuda == true)
     {
-      hwmon_ctx->hm_device[device_id].adl         = hm_adapters_adl[platform_devices_id].adl;
-      hwmon_ctx->hm_device[device_id].sysfs       = hm_adapters_sysfs[platform_devices_id].sysfs;
-      hwmon_ctx->hm_device[device_id].nvapi       = 0;
-      hwmon_ctx->hm_device[device_id].nvml        = 0;
-      hwmon_ctx->hm_device[device_id].od_version  = 0;
-
-      if (hwmon_ctx->hm_adl)
-      {
-        hwmon_ctx->hm_device[device_id].od_version = hm_adapters_adl[platform_devices_id].od_version;
-
-        hwmon_ctx->hm_device[device_id].buslanes_get_supported            |= hm_adapters_adl[platform_devices_id].buslanes_get_supported;
-        hwmon_ctx->hm_device[device_id].corespeed_get_supported           |= hm_adapters_adl[platform_devices_id].corespeed_get_supported;
-        hwmon_ctx->hm_device[device_id].fanspeed_get_supported            |= hm_adapters_adl[platform_devices_id].fanspeed_get_supported;
-        hwmon_ctx->hm_device[device_id].fanpolicy_get_supported           |= hm_adapters_adl[platform_devices_id].fanpolicy_get_supported;
-        hwmon_ctx->hm_device[device_id].memoryspeed_get_supported         |= hm_adapters_adl[platform_devices_id].memoryspeed_get_supported;
-        hwmon_ctx->hm_device[device_id].temperature_get_supported         |= hm_adapters_adl[platform_devices_id].temperature_get_supported;
-        hwmon_ctx->hm_device[device_id].threshold_shutdown_get_supported  |= hm_adapters_adl[platform_devices_id].threshold_shutdown_get_supported;
-        hwmon_ctx->hm_device[device_id].threshold_slowdown_get_supported  |= hm_adapters_adl[platform_devices_id].threshold_slowdown_get_supported;
-        hwmon_ctx->hm_device[device_id].throttle_get_supported            |= hm_adapters_adl[platform_devices_id].throttle_get_supported;
-        hwmon_ctx->hm_device[device_id].utilization_get_supported         |= hm_adapters_adl[platform_devices_id].utilization_get_supported;
-      }
-
-      if (hwmon_ctx->hm_sysfs)
-      {
-        hwmon_ctx->hm_device[device_id].buslanes_get_supported            |= hm_adapters_sysfs[platform_devices_id].buslanes_get_supported;
-        hwmon_ctx->hm_device[device_id].corespeed_get_supported           |= hm_adapters_sysfs[platform_devices_id].corespeed_get_supported;
-        hwmon_ctx->hm_device[device_id].fanspeed_get_supported            |= hm_adapters_sysfs[platform_devices_id].fanspeed_get_supported;
-        hwmon_ctx->hm_device[device_id].fanpolicy_get_supported           |= hm_adapters_sysfs[platform_devices_id].fanpolicy_get_supported;
-        hwmon_ctx->hm_device[device_id].memoryspeed_get_supported         |= hm_adapters_sysfs[platform_devices_id].memoryspeed_get_supported;
-        hwmon_ctx->hm_device[device_id].temperature_get_supported         |= hm_adapters_sysfs[platform_devices_id].temperature_get_supported;
-        hwmon_ctx->hm_device[device_id].threshold_shutdown_get_supported  |= hm_adapters_sysfs[platform_devices_id].threshold_shutdown_get_supported;
-        hwmon_ctx->hm_device[device_id].threshold_slowdown_get_supported  |= hm_adapters_sysfs[platform_devices_id].threshold_slowdown_get_supported;
-        hwmon_ctx->hm_device[device_id].throttle_get_supported            |= hm_adapters_sysfs[platform_devices_id].throttle_get_supported;
-        hwmon_ctx->hm_device[device_id].utilization_get_supported         |= hm_adapters_sysfs[platform_devices_id].utilization_get_supported;
-      }
-    }
-
-    if (device_param->device_vendor_id == VENDOR_ID_NV)
-    {
-      hwmon_ctx->hm_device[device_id].adl         = 0;
-      hwmon_ctx->hm_device[device_id].sysfs       = 0;
-      hwmon_ctx->hm_device[device_id].nvapi       = hm_adapters_nvapi[platform_devices_id].nvapi;
-      hwmon_ctx->hm_device[device_id].nvml        = hm_adapters_nvml[platform_devices_id].nvml;
-      hwmon_ctx->hm_device[device_id].od_version  = 0;
+      hwmon_ctx->hm_device[backend_devices_idx].adl         = 0;
+      hwmon_ctx->hm_device[backend_devices_idx].sysfs       = 0;
+      hwmon_ctx->hm_device[backend_devices_idx].nvapi       = hm_adapters_nvapi[device_id].nvapi;
+      hwmon_ctx->hm_device[backend_devices_idx].nvml        = hm_adapters_nvml[device_id].nvml;
+      hwmon_ctx->hm_device[backend_devices_idx].od_version  = 0;
 
       if (hwmon_ctx->hm_nvml)
       {
-        hwmon_ctx->hm_device[device_id].buslanes_get_supported            |= hm_adapters_nvml[platform_devices_id].buslanes_get_supported;
-        hwmon_ctx->hm_device[device_id].corespeed_get_supported           |= hm_adapters_nvml[platform_devices_id].corespeed_get_supported;
-        hwmon_ctx->hm_device[device_id].fanspeed_get_supported            |= hm_adapters_nvml[platform_devices_id].fanspeed_get_supported;
-        hwmon_ctx->hm_device[device_id].fanpolicy_get_supported           |= hm_adapters_nvml[platform_devices_id].fanpolicy_get_supported;
-        hwmon_ctx->hm_device[device_id].memoryspeed_get_supported         |= hm_adapters_nvml[platform_devices_id].memoryspeed_get_supported;
-        hwmon_ctx->hm_device[device_id].temperature_get_supported         |= hm_adapters_nvml[platform_devices_id].temperature_get_supported;
-        hwmon_ctx->hm_device[device_id].threshold_shutdown_get_supported  |= hm_adapters_nvml[platform_devices_id].threshold_shutdown_get_supported;
-        hwmon_ctx->hm_device[device_id].threshold_slowdown_get_supported  |= hm_adapters_nvml[platform_devices_id].threshold_slowdown_get_supported;
-        hwmon_ctx->hm_device[device_id].throttle_get_supported            |= hm_adapters_nvml[platform_devices_id].throttle_get_supported;
-        hwmon_ctx->hm_device[device_id].utilization_get_supported         |= hm_adapters_nvml[platform_devices_id].utilization_get_supported;
+        hwmon_ctx->hm_device[backend_devices_idx].buslanes_get_supported            |= hm_adapters_nvml[device_id].buslanes_get_supported;
+        hwmon_ctx->hm_device[backend_devices_idx].corespeed_get_supported           |= hm_adapters_nvml[device_id].corespeed_get_supported;
+        hwmon_ctx->hm_device[backend_devices_idx].fanspeed_get_supported            |= hm_adapters_nvml[device_id].fanspeed_get_supported;
+        hwmon_ctx->hm_device[backend_devices_idx].fanpolicy_get_supported           |= hm_adapters_nvml[device_id].fanpolicy_get_supported;
+        hwmon_ctx->hm_device[backend_devices_idx].memoryspeed_get_supported         |= hm_adapters_nvml[device_id].memoryspeed_get_supported;
+        hwmon_ctx->hm_device[backend_devices_idx].temperature_get_supported         |= hm_adapters_nvml[device_id].temperature_get_supported;
+        hwmon_ctx->hm_device[backend_devices_idx].threshold_shutdown_get_supported  |= hm_adapters_nvml[device_id].threshold_shutdown_get_supported;
+        hwmon_ctx->hm_device[backend_devices_idx].threshold_slowdown_get_supported  |= hm_adapters_nvml[device_id].threshold_slowdown_get_supported;
+        hwmon_ctx->hm_device[backend_devices_idx].throttle_get_supported            |= hm_adapters_nvml[device_id].throttle_get_supported;
+        hwmon_ctx->hm_device[backend_devices_idx].utilization_get_supported         |= hm_adapters_nvml[device_id].utilization_get_supported;
       }
 
       if (hwmon_ctx->hm_nvapi)
       {
-        hwmon_ctx->hm_device[device_id].buslanes_get_supported            |= hm_adapters_nvapi[platform_devices_id].buslanes_get_supported;
-        hwmon_ctx->hm_device[device_id].corespeed_get_supported           |= hm_adapters_nvapi[platform_devices_id].corespeed_get_supported;
-        hwmon_ctx->hm_device[device_id].fanspeed_get_supported            |= hm_adapters_nvapi[platform_devices_id].fanspeed_get_supported;
-        hwmon_ctx->hm_device[device_id].fanpolicy_get_supported           |= hm_adapters_nvapi[platform_devices_id].fanpolicy_get_supported;
-        hwmon_ctx->hm_device[device_id].memoryspeed_get_supported         |= hm_adapters_nvapi[platform_devices_id].memoryspeed_get_supported;
-        hwmon_ctx->hm_device[device_id].temperature_get_supported         |= hm_adapters_nvapi[platform_devices_id].temperature_get_supported;
-        hwmon_ctx->hm_device[device_id].threshold_shutdown_get_supported  |= hm_adapters_nvapi[platform_devices_id].threshold_shutdown_get_supported;
-        hwmon_ctx->hm_device[device_id].threshold_slowdown_get_supported  |= hm_adapters_nvapi[platform_devices_id].threshold_slowdown_get_supported;
-        hwmon_ctx->hm_device[device_id].throttle_get_supported            |= hm_adapters_nvapi[platform_devices_id].throttle_get_supported;
-        hwmon_ctx->hm_device[device_id].utilization_get_supported         |= hm_adapters_nvapi[platform_devices_id].utilization_get_supported;
+        hwmon_ctx->hm_device[backend_devices_idx].buslanes_get_supported            |= hm_adapters_nvapi[device_id].buslanes_get_supported;
+        hwmon_ctx->hm_device[backend_devices_idx].corespeed_get_supported           |= hm_adapters_nvapi[device_id].corespeed_get_supported;
+        hwmon_ctx->hm_device[backend_devices_idx].fanspeed_get_supported            |= hm_adapters_nvapi[device_id].fanspeed_get_supported;
+        hwmon_ctx->hm_device[backend_devices_idx].fanpolicy_get_supported           |= hm_adapters_nvapi[device_id].fanpolicy_get_supported;
+        hwmon_ctx->hm_device[backend_devices_idx].memoryspeed_get_supported         |= hm_adapters_nvapi[device_id].memoryspeed_get_supported;
+        hwmon_ctx->hm_device[backend_devices_idx].temperature_get_supported         |= hm_adapters_nvapi[device_id].temperature_get_supported;
+        hwmon_ctx->hm_device[backend_devices_idx].threshold_shutdown_get_supported  |= hm_adapters_nvapi[device_id].threshold_shutdown_get_supported;
+        hwmon_ctx->hm_device[backend_devices_idx].threshold_slowdown_get_supported  |= hm_adapters_nvapi[device_id].threshold_slowdown_get_supported;
+        hwmon_ctx->hm_device[backend_devices_idx].throttle_get_supported            |= hm_adapters_nvapi[device_id].throttle_get_supported;
+        hwmon_ctx->hm_device[backend_devices_idx].utilization_get_supported         |= hm_adapters_nvapi[device_id].utilization_get_supported;
+      }
+    }
+
+    if (device_param->is_opencl == true)
+    {
+      if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) == 0) continue;
+
+      if (device_param->opencl_device_vendor_id == VENDOR_ID_AMD)
+      {
+        hwmon_ctx->hm_device[backend_devices_idx].adl         = hm_adapters_adl[device_id].adl;
+        hwmon_ctx->hm_device[backend_devices_idx].sysfs       = hm_adapters_sysfs[device_id].sysfs;
+        hwmon_ctx->hm_device[backend_devices_idx].nvapi       = 0;
+        hwmon_ctx->hm_device[backend_devices_idx].nvml        = 0;
+        hwmon_ctx->hm_device[backend_devices_idx].od_version  = 0;
+
+        if (hwmon_ctx->hm_adl)
+        {
+          hwmon_ctx->hm_device[backend_devices_idx].od_version = hm_adapters_adl[device_id].od_version;
+
+          hwmon_ctx->hm_device[backend_devices_idx].buslanes_get_supported            |= hm_adapters_adl[device_id].buslanes_get_supported;
+          hwmon_ctx->hm_device[backend_devices_idx].corespeed_get_supported           |= hm_adapters_adl[device_id].corespeed_get_supported;
+          hwmon_ctx->hm_device[backend_devices_idx].fanspeed_get_supported            |= hm_adapters_adl[device_id].fanspeed_get_supported;
+          hwmon_ctx->hm_device[backend_devices_idx].fanpolicy_get_supported           |= hm_adapters_adl[device_id].fanpolicy_get_supported;
+          hwmon_ctx->hm_device[backend_devices_idx].memoryspeed_get_supported         |= hm_adapters_adl[device_id].memoryspeed_get_supported;
+          hwmon_ctx->hm_device[backend_devices_idx].temperature_get_supported         |= hm_adapters_adl[device_id].temperature_get_supported;
+          hwmon_ctx->hm_device[backend_devices_idx].threshold_shutdown_get_supported  |= hm_adapters_adl[device_id].threshold_shutdown_get_supported;
+          hwmon_ctx->hm_device[backend_devices_idx].threshold_slowdown_get_supported  |= hm_adapters_adl[device_id].threshold_slowdown_get_supported;
+          hwmon_ctx->hm_device[backend_devices_idx].throttle_get_supported            |= hm_adapters_adl[device_id].throttle_get_supported;
+          hwmon_ctx->hm_device[backend_devices_idx].utilization_get_supported         |= hm_adapters_adl[device_id].utilization_get_supported;
+        }
+
+        if (hwmon_ctx->hm_sysfs)
+        {
+          hwmon_ctx->hm_device[backend_devices_idx].buslanes_get_supported            |= hm_adapters_sysfs[device_id].buslanes_get_supported;
+          hwmon_ctx->hm_device[backend_devices_idx].corespeed_get_supported           |= hm_adapters_sysfs[device_id].corespeed_get_supported;
+          hwmon_ctx->hm_device[backend_devices_idx].fanspeed_get_supported            |= hm_adapters_sysfs[device_id].fanspeed_get_supported;
+          hwmon_ctx->hm_device[backend_devices_idx].fanpolicy_get_supported           |= hm_adapters_sysfs[device_id].fanpolicy_get_supported;
+          hwmon_ctx->hm_device[backend_devices_idx].memoryspeed_get_supported         |= hm_adapters_sysfs[device_id].memoryspeed_get_supported;
+          hwmon_ctx->hm_device[backend_devices_idx].temperature_get_supported         |= hm_adapters_sysfs[device_id].temperature_get_supported;
+          hwmon_ctx->hm_device[backend_devices_idx].threshold_shutdown_get_supported  |= hm_adapters_sysfs[device_id].threshold_shutdown_get_supported;
+          hwmon_ctx->hm_device[backend_devices_idx].threshold_slowdown_get_supported  |= hm_adapters_sysfs[device_id].threshold_slowdown_get_supported;
+          hwmon_ctx->hm_device[backend_devices_idx].throttle_get_supported            |= hm_adapters_sysfs[device_id].throttle_get_supported;
+          hwmon_ctx->hm_device[backend_devices_idx].utilization_get_supported         |= hm_adapters_sysfs[device_id].utilization_get_supported;
+        }
+      }
+
+      if (device_param->opencl_device_vendor_id == VENDOR_ID_NV)
+      {
+        hwmon_ctx->hm_device[backend_devices_idx].adl         = 0;
+        hwmon_ctx->hm_device[backend_devices_idx].sysfs       = 0;
+        hwmon_ctx->hm_device[backend_devices_idx].nvapi       = hm_adapters_nvapi[device_id].nvapi;
+        hwmon_ctx->hm_device[backend_devices_idx].nvml        = hm_adapters_nvml[device_id].nvml;
+        hwmon_ctx->hm_device[backend_devices_idx].od_version  = 0;
+
+        if (hwmon_ctx->hm_nvml)
+        {
+          hwmon_ctx->hm_device[backend_devices_idx].buslanes_get_supported            |= hm_adapters_nvml[device_id].buslanes_get_supported;
+          hwmon_ctx->hm_device[backend_devices_idx].corespeed_get_supported           |= hm_adapters_nvml[device_id].corespeed_get_supported;
+          hwmon_ctx->hm_device[backend_devices_idx].fanspeed_get_supported            |= hm_adapters_nvml[device_id].fanspeed_get_supported;
+          hwmon_ctx->hm_device[backend_devices_idx].fanpolicy_get_supported           |= hm_adapters_nvml[device_id].fanpolicy_get_supported;
+          hwmon_ctx->hm_device[backend_devices_idx].memoryspeed_get_supported         |= hm_adapters_nvml[device_id].memoryspeed_get_supported;
+          hwmon_ctx->hm_device[backend_devices_idx].temperature_get_supported         |= hm_adapters_nvml[device_id].temperature_get_supported;
+          hwmon_ctx->hm_device[backend_devices_idx].threshold_shutdown_get_supported  |= hm_adapters_nvml[device_id].threshold_shutdown_get_supported;
+          hwmon_ctx->hm_device[backend_devices_idx].threshold_slowdown_get_supported  |= hm_adapters_nvml[device_id].threshold_slowdown_get_supported;
+          hwmon_ctx->hm_device[backend_devices_idx].throttle_get_supported            |= hm_adapters_nvml[device_id].throttle_get_supported;
+          hwmon_ctx->hm_device[backend_devices_idx].utilization_get_supported         |= hm_adapters_nvml[device_id].utilization_get_supported;
+        }
+
+        if (hwmon_ctx->hm_nvapi)
+        {
+          hwmon_ctx->hm_device[backend_devices_idx].buslanes_get_supported            |= hm_adapters_nvapi[device_id].buslanes_get_supported;
+          hwmon_ctx->hm_device[backend_devices_idx].corespeed_get_supported           |= hm_adapters_nvapi[device_id].corespeed_get_supported;
+          hwmon_ctx->hm_device[backend_devices_idx].fanspeed_get_supported            |= hm_adapters_nvapi[device_id].fanspeed_get_supported;
+          hwmon_ctx->hm_device[backend_devices_idx].fanpolicy_get_supported           |= hm_adapters_nvapi[device_id].fanpolicy_get_supported;
+          hwmon_ctx->hm_device[backend_devices_idx].memoryspeed_get_supported         |= hm_adapters_nvapi[device_id].memoryspeed_get_supported;
+          hwmon_ctx->hm_device[backend_devices_idx].temperature_get_supported         |= hm_adapters_nvapi[device_id].temperature_get_supported;
+          hwmon_ctx->hm_device[backend_devices_idx].threshold_shutdown_get_supported  |= hm_adapters_nvapi[device_id].threshold_shutdown_get_supported;
+          hwmon_ctx->hm_device[backend_devices_idx].threshold_slowdown_get_supported  |= hm_adapters_nvapi[device_id].threshold_slowdown_get_supported;
+          hwmon_ctx->hm_device[backend_devices_idx].throttle_get_supported            |= hm_adapters_nvapi[device_id].throttle_get_supported;
+          hwmon_ctx->hm_device[backend_devices_idx].utilization_get_supported         |= hm_adapters_nvapi[device_id].utilization_get_supported;
+        }
       }
     }
 
     // by calling the different functions here this will disable them in case they will error out
     // this will also reduce the error itself printed to the user to a single print on startup
 
-    hm_get_buslanes_with_device_id            (hashcat_ctx, device_id);
-    hm_get_corespeed_with_device_id           (hashcat_ctx, device_id);
-    hm_get_fanpolicy_with_device_id           (hashcat_ctx, device_id);
-    hm_get_fanspeed_with_device_id            (hashcat_ctx, device_id);
-    hm_get_memoryspeed_with_device_id         (hashcat_ctx, device_id);
-    hm_get_temperature_with_device_id         (hashcat_ctx, device_id);
-    hm_get_threshold_shutdown_with_device_id  (hashcat_ctx, device_id);
-    hm_get_threshold_slowdown_with_device_id  (hashcat_ctx, device_id);
-    hm_get_throttle_with_device_id            (hashcat_ctx, device_id);
-    hm_get_utilization_with_device_id         (hashcat_ctx, device_id);
+    hm_get_buslanes_with_devices_idx           (hashcat_ctx, backend_devices_idx);
+    hm_get_corespeed_with_devices_idx          (hashcat_ctx, backend_devices_idx);
+    hm_get_fanpolicy_with_devices_idx          (hashcat_ctx, backend_devices_idx);
+    hm_get_fanspeed_with_devices_idx           (hashcat_ctx, backend_devices_idx);
+    hm_get_memoryspeed_with_devices_idx        (hashcat_ctx, backend_devices_idx);
+    hm_get_temperature_with_devices_idx        (hashcat_ctx, backend_devices_idx);
+    hm_get_threshold_shutdown_with_devices_idx (hashcat_ctx, backend_devices_idx);
+    hm_get_threshold_slowdown_with_devices_idx (hashcat_ctx, backend_devices_idx);
+    hm_get_throttle_with_devices_idx           (hashcat_ctx, backend_devices_idx);
+    hm_get_utilization_with_devices_idx        (hashcat_ctx, backend_devices_idx);
   }
 
   FREE_ADAPTERS;
diff --git a/src/induct.c b/src/induct.c
index 42e39e11a..46dc26eb2 100644
--- a/src/induct.c
+++ b/src/induct.c
@@ -43,7 +43,7 @@ int induct_ctx_init (hashcat_ctx_t *hashcat_ctx)
   if (user_options->example_hashes == true) return 0;
   if (user_options->keyspace       == true) return 0;
   if (user_options->left           == true) return 0;
-  if (user_options->opencl_info    == true) return 0;
+  if (user_options->backend_info   == true) return 0;
   if (user_options->show           == true) return 0;
   if (user_options->stdout_flag    == true) return 0;
   if (user_options->speed_only     == true) return 0;
diff --git a/src/interface.c b/src/interface.c
index e8d396cd4..6218da192 100644
--- a/src/interface.c
+++ b/src/interface.c
@@ -8,7 +8,7 @@
 #include "memory.h"
 #include "event.h"
 #include "shared.h"
-#include "opencl.h"
+#include "backend.h"
 #include "modules.h"
 #include "dynloader.h"
 #include "interface.h"
diff --git a/src/loopback.c b/src/loopback.c
index db4ab8c1d..057c1e456 100644
--- a/src/loopback.c
+++ b/src/loopback.c
@@ -64,7 +64,7 @@ int loopback_init (hashcat_ctx_t *hashcat_ctx)
   if (user_options->example_hashes == true) return 0;
   if (user_options->keyspace       == true) return 0;
   if (user_options->left           == true) return 0;
-  if (user_options->opencl_info    == true) return 0;
+  if (user_options->backend_info   == true) return 0;
   if (user_options->show           == true) return 0;
   if (user_options->stdout_flag    == true) return 0;
   if (user_options->speed_only     == true) return 0;
diff --git a/src/main.c b/src/main.c
index 28ea88165..c5a684f1e 100644
--- a/src/main.c
+++ b/src/main.c
@@ -190,7 +190,7 @@ static void main_outerloop_starting (MAYBE_UNUSED hashcat_ctx_t *hashcat_ctx, MA
 
   status_ctx->shutdown_outer = false;
 
-  if ((user_options->example_hashes == false) && (user_options->keyspace == false) && (user_options->stdout_flag == false) && (user_options->opencl_info == false) && (user_options->speed_only == false))
+  if ((user_options->example_hashes == false) && (user_options->keyspace == false) && (user_options->stdout_flag == false) && (user_options->backend_info == false) && (user_options->speed_only == false))
   {
     if ((user_options_extra->wordlist_mode == WL_MODE_FILE) || (user_options_extra->wordlist_mode == WL_MODE_MASK))
     {
@@ -263,7 +263,7 @@ static void main_cracker_finished (MAYBE_UNUSED hashcat_ctx_t *hashcat_ctx, MAYB
 
   if (user_options->example_hashes  == true) return;
   if (user_options->keyspace        == true) return;
-  if (user_options->opencl_info     == true) return;
+  if (user_options->backend_info    == true) return;
   if (user_options->stdout_flag     == true) return;
 
   // if we had a prompt, clear it
@@ -512,9 +512,9 @@ static void main_outerloop_mainscreen (MAYBE_UNUSED hashcat_ctx_t *hashcat_ctx,
   {
     if (hashconfig->has_optimized_kernel == true)
     {
-      event_log_advice (hashcat_ctx, "ATTENTION! Pure (unoptimized) OpenCL kernels selected.");
+      event_log_advice (hashcat_ctx, "ATTENTION! Pure (unoptimized) backend kernels selected.");
       event_log_advice (hashcat_ctx, "Using pure kernels enables cracking longer passwords but for the price of drastically reduced performance.");
-      event_log_advice (hashcat_ctx, "If you want to switch to optimized OpenCL kernels, append -O to your commandline.");
+      event_log_advice (hashcat_ctx, "If you want to switch to optimized backend kernels, append -O to your commandline.");
       event_log_advice (hashcat_ctx, "See the above message to find out about the exact limits.");
       event_log_advice (hashcat_ctx, NULL);
     }
@@ -549,7 +549,7 @@ static void main_outerloop_mainscreen (MAYBE_UNUSED hashcat_ctx_t *hashcat_ctx,
   event_log_info (hashcat_ctx, NULL);
 }
 
-static void main_opencl_session_pre (MAYBE_UNUSED hashcat_ctx_t *hashcat_ctx, MAYBE_UNUSED const void *buf, MAYBE_UNUSED const size_t len)
+static void main_backend_session_pre (MAYBE_UNUSED hashcat_ctx_t *hashcat_ctx, MAYBE_UNUSED const void *buf, MAYBE_UNUSED const size_t len)
 {
   const user_options_t *user_options = hashcat_ctx->user_options;
 
@@ -558,7 +558,7 @@ static void main_opencl_session_pre (MAYBE_UNUSED hashcat_ctx_t *hashcat_ctx, MA
   event_log_info_nn (hashcat_ctx, "Initializing device kernels and memory...");
 }
 
-static void main_opencl_session_post (MAYBE_UNUSED hashcat_ctx_t *hashcat_ctx, MAYBE_UNUSED const void *buf, MAYBE_UNUSED const size_t len)
+static void main_backend_session_post (MAYBE_UNUSED hashcat_ctx_t *hashcat_ctx, MAYBE_UNUSED const void *buf, MAYBE_UNUSED const size_t len)
 {
   const user_options_t *user_options = hashcat_ctx->user_options;
 
@@ -567,18 +567,19 @@ static void main_opencl_session_post (MAYBE_UNUSED hashcat_ctx_t *hashcat_ctx, M
   event_log_info_nn (hashcat_ctx, "Initialized device kernels and memory...");
 }
 
-static void main_opencl_device_init_pre (MAYBE_UNUSED hashcat_ctx_t *hashcat_ctx, MAYBE_UNUSED const void *buf, MAYBE_UNUSED const size_t len)
+static void main_backend_session_hostmem (MAYBE_UNUSED hashcat_ctx_t *hashcat_ctx, MAYBE_UNUSED const void *buf, MAYBE_UNUSED const size_t len)
 {
   const user_options_t *user_options = hashcat_ctx->user_options;
 
   if (user_options->quiet == true) return;
 
-  const u32 *device_id = (const u32 *) buf;
+  const u64 *hostmem = (const u64 *) buf;
 
-  event_log_info_nn (hashcat_ctx, "Initializing OpenCL runtime for device #%u...", *device_id + 1);
+  event_log_info (hashcat_ctx, "Host memory required for this attack: %" PRIu64 " MB", *hostmem / (1024 * 1024));
+  event_log_info (hashcat_ctx, NULL);
 }
 
-static void main_opencl_device_init_post (MAYBE_UNUSED hashcat_ctx_t *hashcat_ctx, MAYBE_UNUSED const void *buf, MAYBE_UNUSED const size_t len)
+static void main_backend_device_init_pre (MAYBE_UNUSED hashcat_ctx_t *hashcat_ctx, MAYBE_UNUSED const void *buf, MAYBE_UNUSED const size_t len)
 {
   const user_options_t *user_options = hashcat_ctx->user_options;
 
@@ -586,7 +587,18 @@ static void main_opencl_device_init_post (MAYBE_UNUSED hashcat_ctx_t *hashcat_ct
 
   const u32 *device_id = (const u32 *) buf;
 
-  event_log_info_nn (hashcat_ctx, "Initialized OpenCL runtime for device #%u...", *device_id + 1);
+  event_log_info_nn (hashcat_ctx, "Initializing backend runtime for device #%u...", *device_id + 1);
+}
+
+static void main_backend_device_init_post (MAYBE_UNUSED hashcat_ctx_t *hashcat_ctx, MAYBE_UNUSED const void *buf, MAYBE_UNUSED const size_t len)
+{
+  const user_options_t *user_options = hashcat_ctx->user_options;
+
+  if (user_options->quiet == true) return;
+
+  const u32 *device_id = (const u32 *) buf;
+
+  event_log_info_nn (hashcat_ctx, "Initialized backend runtime for device #%u...", *device_id + 1);
 }
 
 static void main_bitmap_init_pre (MAYBE_UNUSED hashcat_ctx_t *hashcat_ctx, MAYBE_UNUSED const void *buf, MAYBE_UNUSED const size_t len)
@@ -735,7 +747,7 @@ static void main_monitor_performance_hint (MAYBE_UNUSED hashcat_ctx_t *hashcat_c
     event_log_advice (hashcat_ctx, NULL);
   }
 
-  event_log_advice (hashcat_ctx, "* Update your OpenCL runtime / driver the right way:");
+  event_log_advice (hashcat_ctx, "* Update your backend API runtime / driver the right way:");
   event_log_advice (hashcat_ctx, "  https://hashcat.net/faq/wrongdriver");
   event_log_advice (hashcat_ctx, NULL);
   event_log_advice (hashcat_ctx, "* Create more work items to make use of your parallelization power:");
@@ -1022,10 +1034,11 @@ static void event (const u32 id, hashcat_ctx_t *hashcat_ctx, const void *buf, co
     case EVENT_MONITOR_PERFORMANCE_HINT:  main_monitor_performance_hint  (hashcat_ctx, buf, len); break;
     case EVENT_MONITOR_NOINPUT_HINT:      main_monitor_noinput_hint      (hashcat_ctx, buf, len); break;
     case EVENT_MONITOR_NOINPUT_ABORT:     main_monitor_noinput_abort     (hashcat_ctx, buf, len); break;
-    case EVENT_OPENCL_SESSION_POST:       main_opencl_session_post       (hashcat_ctx, buf, len); break;
-    case EVENT_OPENCL_SESSION_PRE:        main_opencl_session_pre        (hashcat_ctx, buf, len); break;
-    case EVENT_OPENCL_DEVICE_INIT_POST:   main_opencl_device_init_post   (hashcat_ctx, buf, len); break;
-    case EVENT_OPENCL_DEVICE_INIT_PRE:    main_opencl_device_init_pre    (hashcat_ctx, buf, len); break;
+    case EVENT_BACKEND_SESSION_POST:      main_backend_session_post      (hashcat_ctx, buf, len); break;
+    case EVENT_BACKEND_SESSION_PRE:       main_backend_session_pre       (hashcat_ctx, buf, len); break;
+    case EVENT_BACKEND_SESSION_HOSTMEM:   main_backend_session_hostmem   (hashcat_ctx, buf, len); break;
+    case EVENT_BACKEND_DEVICE_INIT_POST:  main_backend_device_init_post  (hashcat_ctx, buf, len); break;
+    case EVENT_BACKEND_DEVICE_INIT_PRE:   main_backend_device_init_pre   (hashcat_ctx, buf, len); break;
     case EVENT_OUTERLOOP_FINISHED:        main_outerloop_finished        (hashcat_ctx, buf, len); break;
     case EVENT_OUTERLOOP_MAINSCREEN:      main_outerloop_mainscreen      (hashcat_ctx, buf, len); break;
     case EVENT_OUTERLOOP_STARTING:        main_outerloop_starting        (hashcat_ctx, buf, len); break;
@@ -1106,7 +1119,7 @@ int main (int argc, char **argv)
     return 0;
   }
 
-  // init a hashcat session; this initializes opencl devices, hwmon, etc
+  // init a hashcat session; this initializes backend devices, hwmon, etc
 
   welcome_screen (hashcat_ctx, VERSION_TAG);
 
@@ -1128,11 +1141,11 @@ int main (int argc, char **argv)
 
       rc_final = 0;
     }
-    else if (user_options->opencl_info == true)
+    else if (user_options->backend_info == true)
     {
-      // if this is just opencl_info, no need to execute some real cracking session
+      // if this is just backend_info, no need to execute some real cracking session
 
-      opencl_info (hashcat_ctx);
+      backend_info (hashcat_ctx);
 
       rc_final = 0;
     }
@@ -1140,7 +1153,7 @@ int main (int argc, char **argv)
     {
       // now execute hashcat
 
-      opencl_info_compact (hashcat_ctx);
+      backend_info_compact (hashcat_ctx);
 
       user_options_info (hashcat_ctx);
 
@@ -1148,7 +1161,7 @@ int main (int argc, char **argv)
     }
   }
 
-  // finish the hashcat session, this shuts down opencl devices, hwmon, etc
+  // finish the hashcat session, this shuts down backend devices, hwmon, etc
 
   hashcat_session_destroy (hashcat_ctx);
 
diff --git a/src/modules/module_00500.c b/src/modules/module_00500.c
index 59f67cb83..aef6bd8be 100644
--- a/src/modules/module_00500.c
+++ b/src/modules/module_00500.c
@@ -20,8 +20,7 @@ static const u32   HASH_CATEGORY  = HASH_CATEGORY_OS;
 static const char *HASH_NAME      = "md5crypt, MD5 (Unix), Cisco-IOS $1$ (MD5)";
 static const u64   KERN_TYPE      = 500;
 static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE;
-static const u64   OPTS_TYPE      = OPTS_TYPE_PT_GENERATE_LE
-                                  | OPTS_TYPE_PREFERED_THREAD;
+static const u64   OPTS_TYPE      = OPTS_TYPE_PT_GENERATE_LE;
 static const u32   SALT_TYPE      = SALT_TYPE_EMBEDDED;
 static const char *ST_PASS        = "hashcat";
 static const char *ST_HASH        = "$1$38652870$DUjsu4TTlTsOe/xxZ05uf/";
diff --git a/src/modules/module_00501.c b/src/modules/module_00501.c
index 1e5935523..4eab9da57 100644
--- a/src/modules/module_00501.c
+++ b/src/modules/module_00501.c
@@ -22,7 +22,6 @@ static const char *HASH_NAME      = "Juniper IVE";
 static const u64   KERN_TYPE      = 500;
 static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE;
 static const u64   OPTS_TYPE      = OPTS_TYPE_PT_GENERATE_LE
-                                  | OPTS_TYPE_PREFERED_THREAD
                                   | OPTS_TYPE_HASH_COPY;
 static const u32   SALT_TYPE      = SALT_TYPE_EMBEDDED;
 static const char *ST_PASS        = "hashcat";
diff --git a/src/modules/module_01450.c b/src/modules/module_01450.c
index 59d68bb85..59e2dc5c9 100644
--- a/src/modules/module_01450.c
+++ b/src/modules/module_01450.c
@@ -46,12 +46,12 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY
 {
   char *jit_build_options = NULL;
 
-  if (device_param->device_vendor_id == VENDOR_ID_NV)
+  if (device_param->opencl_device_vendor_id == VENDOR_ID_NV)
   {
     hc_asprintf (&jit_build_options, "-D NO_UNROLL");
   }
 
-  if (device_param->device_vendor_id == VENDOR_ID_AMD)
+  if (device_param->opencl_device_vendor_id == VENDOR_ID_AMD)
   {
     hc_asprintf (&jit_build_options, "-D NO_UNROLL");
   }
diff --git a/src/modules/module_01600.c b/src/modules/module_01600.c
index fd84d3f5e..69c62d0b1 100644
--- a/src/modules/module_01600.c
+++ b/src/modules/module_01600.c
@@ -20,8 +20,7 @@ static const u32   HASH_CATEGORY  = HASH_CATEGORY_NETWORK_SERVER;
 static const char *HASH_NAME      = "Apache $apr1$ MD5, md5apr1, MD5 (APR)";
 static const u64   KERN_TYPE      = 1600;
 static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE;
-static const u64   OPTS_TYPE      = OPTS_TYPE_PT_GENERATE_LE
-                                  | OPTS_TYPE_PREFERED_THREAD;
+static const u64   OPTS_TYPE      = OPTS_TYPE_PT_GENERATE_LE;
 static const u32   SALT_TYPE      = SALT_TYPE_EMBEDDED;
 static const char *ST_PASS        = "hashcat";
 static const char *ST_HASH        = "$apr1$62722340$zGjeAwVP2KwY6MtumUI1N/";
diff --git a/src/modules/module_01720.c b/src/modules/module_01720.c
index 0355b04f5..5ed6b8366 100644
--- a/src/modules/module_01720.c
+++ b/src/modules/module_01720.c
@@ -52,12 +52,12 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY
 {
   char *jit_build_options = NULL;
 
-  if (device_param->device_vendor_id == VENDOR_ID_NV)
+  if (device_param->opencl_device_vendor_id == VENDOR_ID_NV)
   {
     hc_asprintf (&jit_build_options, "-D NO_UNROLL");
   }
 
-  if ((device_param->device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
+  if ((device_param->opencl_device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
   {
     hc_asprintf (&jit_build_options, "-D NO_UNROLL");
   }
diff --git a/src/modules/module_01722.c b/src/modules/module_01722.c
index d944ab14d..7348b6da2 100644
--- a/src/modules/module_01722.c
+++ b/src/modules/module_01722.c
@@ -53,12 +53,12 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY
 {
   char *jit_build_options = NULL;
 
-  if (device_param->device_vendor_id == VENDOR_ID_NV)
+  if (device_param->opencl_device_vendor_id == VENDOR_ID_NV)
   {
     hc_asprintf (&jit_build_options, "-D NO_UNROLL");
   }
 
-  if ((device_param->device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
+  if ((device_param->opencl_device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
   {
     hc_asprintf (&jit_build_options, "-D NO_UNROLL");
   }
diff --git a/src/modules/module_01740.c b/src/modules/module_01740.c
index cc0f134ee..fd57cde23 100644
--- a/src/modules/module_01740.c
+++ b/src/modules/module_01740.c
@@ -53,12 +53,12 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY
 {
   char *jit_build_options = NULL;
 
-  if (device_param->device_vendor_id == VENDOR_ID_NV)
+  if (device_param->opencl_device_vendor_id == VENDOR_ID_NV)
   {
     hc_asprintf (&jit_build_options, "-D NO_UNROLL");
   }
 
-  if ((device_param->device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
+  if ((device_param->opencl_device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
   {
     hc_asprintf (&jit_build_options, "-D NO_UNROLL");
   }
diff --git a/src/modules/module_01750.c b/src/modules/module_01750.c
index 81004ce8f..f3a0762ba 100644
--- a/src/modules/module_01750.c
+++ b/src/modules/module_01750.c
@@ -47,12 +47,12 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY
 {
   char *jit_build_options = NULL;
 
-  if (device_param->device_vendor_id == VENDOR_ID_NV)
+  if (device_param->opencl_device_vendor_id == VENDOR_ID_NV)
   {
     hc_asprintf (&jit_build_options, "-D NO_UNROLL");
   }
 
-  if (device_param->device_vendor_id == VENDOR_ID_AMD)
+  if (device_param->opencl_device_vendor_id == VENDOR_ID_AMD)
   {
     hc_asprintf (&jit_build_options, "-D NO_UNROLL");
   }
diff --git a/src/modules/module_01760.c b/src/modules/module_01760.c
index 2d3a71c93..eac9b387c 100644
--- a/src/modules/module_01760.c
+++ b/src/modules/module_01760.c
@@ -48,12 +48,12 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY
 {
   char *jit_build_options = NULL;
 
-  if (device_param->device_vendor_id == VENDOR_ID_NV)
+  if (device_param->opencl_device_vendor_id == VENDOR_ID_NV)
   {
     hc_asprintf (&jit_build_options, "-D NO_UNROLL");
   }
 
-  if (device_param->device_vendor_id == VENDOR_ID_AMD)
+  if (device_param->opencl_device_vendor_id == VENDOR_ID_AMD)
   {
     hc_asprintf (&jit_build_options, "-D NO_UNROLL");
   }
diff --git a/src/modules/module_01800.c b/src/modules/module_01800.c
index 2dc77c887..e10175604 100644
--- a/src/modules/module_01800.c
+++ b/src/modules/module_01800.c
@@ -21,8 +21,7 @@ static const char *HASH_NAME      = "sha512crypt $6$, SHA512 (Unix)";
 static const u64   KERN_TYPE      = 1800;
 static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE
                                   | OPTI_TYPE_USES_BITS_64;
-static const u64   OPTS_TYPE      = OPTS_TYPE_PT_GENERATE_LE
-                                  | OPTS_TYPE_PREFERED_THREAD;
+static const u64   OPTS_TYPE      = OPTS_TYPE_PT_GENERATE_LE;
 static const u32   SALT_TYPE      = SALT_TYPE_EMBEDDED;
 static const char *ST_PASS        = "hashcat";
 static const char *ST_HASH        = "$6$72820166$U4DVzpcYxgw7MVVDGGvB2/H5lRistD5.Ah4upwENR5UtffLR4X4SxSzfREv8z6wVl0jRFX40/KnYVvK4829kD1";
diff --git a/src/modules/module_03200.c b/src/modules/module_03200.c
index 73a8d3a76..b0b35b627 100644
--- a/src/modules/module_03200.c
+++ b/src/modules/module_03200.c
@@ -88,7 +88,7 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY
 
   u32 fixed_local_size = 0;
 
-  if (device_param->device_type & CL_DEVICE_TYPE_CPU)
+  if (device_param->opencl_device_type & CL_DEVICE_TYPE_CPU)
   {
     fixed_local_size = 1;
   }
@@ -96,7 +96,7 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY
   {
     u32 overhead = 0;
 
-    if (device_param->device_vendor_id == VENDOR_ID_NV)
+    if (device_param->opencl_device_vendor_id == VENDOR_ID_NV)
     {
       // note we need to use device_param->device_local_mem_size - 4 because opencl jit returns with:
       // Entry function '...' uses too much shared data (0xc004 bytes, 0xc000 max)
@@ -104,7 +104,10 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY
       // I did some research on this and it seems to be related with the datatype.
       // For example, if i used u8 instead, there's only 1 byte wasted.
 
-      overhead = 4;
+      if (device_param->is_opencl == true)
+      {
+        overhead = 4;
+      }
     }
 
     if (user_options->kernel_threads_chgd == true)
diff --git a/src/modules/module_06300.c b/src/modules/module_06300.c
index 05bd72b93..2d3fd3b34 100644
--- a/src/modules/module_06300.c
+++ b/src/modules/module_06300.c
@@ -20,8 +20,7 @@ static const u32   HASH_CATEGORY  = HASH_CATEGORY_OS;
 static const char *HASH_NAME      = "AIX {smd5}";
 static const u64   KERN_TYPE      = 6300;
 static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE;
-static const u64   OPTS_TYPE      = OPTS_TYPE_PT_GENERATE_LE
-                                  | OPTS_TYPE_PREFERED_THREAD;
+static const u64   OPTS_TYPE      = OPTS_TYPE_PT_GENERATE_LE;
 static const u32   SALT_TYPE      = SALT_TYPE_EMBEDDED;
 static const char *ST_PASS        = "hashcat";
 static const char *ST_HASH        = "{smd5}17800721$WkGka7tXcrfpUQS6WOQyw/";
diff --git a/src/modules/module_06400.c b/src/modules/module_06400.c
index 266ed1d9a..e1c443b8c 100644
--- a/src/modules/module_06400.c
+++ b/src/modules/module_06400.c
@@ -258,12 +258,12 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY
 {
   char *jit_build_options = NULL;
 
-  if (device_param->device_vendor_id == VENDOR_ID_NV)
+  if (device_param->opencl_device_vendor_id == VENDOR_ID_NV)
   {
     hc_asprintf (&jit_build_options, "-D NO_UNROLL");
   }
 
-  if ((device_param->device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
+  if ((device_param->opencl_device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
   {
     hc_asprintf (&jit_build_options, "-D NO_UNROLL");
   }
diff --git a/src/modules/module_06800.c b/src/modules/module_06800.c
index 3d4e18349..60a6e3b42 100644
--- a/src/modules/module_06800.c
+++ b/src/modules/module_06800.c
@@ -72,12 +72,12 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY
 {
   char *jit_build_options = NULL;
 
-  if (device_param->device_vendor_id == VENDOR_ID_NV)
+  if (device_param->opencl_device_vendor_id == VENDOR_ID_NV)
   {
     hc_asprintf (&jit_build_options, "-D NO_UNROLL");
   }
 
-  if ((device_param->device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
+  if ((device_param->opencl_device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
   {
     hc_asprintf (&jit_build_options, "-D NO_UNROLL");
   }
diff --git a/src/modules/module_07500.c b/src/modules/module_07500.c
index 1026921bc..080520f59 100644
--- a/src/modules/module_07500.c
+++ b/src/modules/module_07500.c
@@ -79,7 +79,7 @@ u32 module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYB
 bool module_unstable_warning (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, MAYBE_UNUSED const hc_device_param_t *device_param)
 {
   // amdgpu-pro-18.50-708488-ubuntu-18.04: Segmentation fault
-  if ((device_param->device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
+  if ((device_param->opencl_device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
   {
     if ((hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL) == 0)
     {
diff --git a/src/modules/module_07800.c b/src/modules/module_07800.c
index 8a711709f..e1285ef13 100644
--- a/src/modules/module_07800.c
+++ b/src/modules/module_07800.c
@@ -54,7 +54,7 @@ u32 module_pw_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED con
 bool module_unstable_warning (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, MAYBE_UNUSED const hc_device_param_t *device_param)
 {
   // amdgpu-pro-18.50-708488-ubuntu-18.04:  password not found
-  if ((device_param->device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
+  if ((device_param->opencl_device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
   {
     return true;
   }
diff --git a/src/modules/module_07801.c b/src/modules/module_07801.c
index d49b320fe..997698939 100644
--- a/src/modules/module_07801.c
+++ b/src/modules/module_07801.c
@@ -54,7 +54,7 @@ u32 module_pw_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED con
 bool module_unstable_warning (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, MAYBE_UNUSED const hc_device_param_t *device_param)
 {
   // amdgpu-pro-18.50-708488-ubuntu-18.04:  password not found
-  if ((device_param->device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
+  if ((device_param->opencl_device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
   {
     return true;
   }
diff --git a/src/modules/module_07900.c b/src/modules/module_07900.c
index 9c2d635ce..61b1be24a 100644
--- a/src/modules/module_07900.c
+++ b/src/modules/module_07900.c
@@ -286,10 +286,10 @@ static void drupal7_encode (const u8 digest[64], u8 buf[43])
 
 bool module_unstable_warning (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, MAYBE_UNUSED const hc_device_param_t *device_param)
 {
-  if (device_param->platform_vendor_id == VENDOR_ID_APPLE)
+  if (device_param->opencl_platform_vendor_id == VENDOR_ID_APPLE)
   {
     // trap 6
-    if ((device_param->device_vendor_id == VENDOR_ID_INTEL_SDK) && (device_param->device_type & CL_DEVICE_TYPE_GPU))
+    if ((device_param->opencl_device_vendor_id == VENDOR_ID_INTEL_SDK) && (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU))
     {
       return true;
     }
diff --git a/src/modules/module_08000.c b/src/modules/module_08000.c
index 1bf634173..a4cfc5ac8 100644
--- a/src/modules/module_08000.c
+++ b/src/modules/module_08000.c
@@ -60,7 +60,7 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY
 {
   char *jit_build_options = NULL;
 
-  if (device_param->device_vendor_id == VENDOR_ID_AMD)
+  if (device_param->opencl_device_vendor_id == VENDOR_ID_AMD)
   {
     hc_asprintf (&jit_build_options, "-D NO_UNROLL");
   }
diff --git a/src/modules/module_08600.c b/src/modules/module_08600.c
index 3b73ab178..e28be3e08 100644
--- a/src/modules/module_08600.c
+++ b/src/modules/module_08600.c
@@ -53,7 +53,7 @@ u32 module_pw_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED con
 bool module_unstable_warning (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, MAYBE_UNUSED const hc_device_param_t *device_param)
 {
   // amdgpu-pro-18.50-708488-ubuntu-18.04: Segmentation fault
-  if ((device_param->device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
+  if ((device_param->opencl_device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
   {
     return true;
   }
diff --git a/src/modules/module_09000.c b/src/modules/module_09000.c
index 7a459baaf..e8cdac075 100644
--- a/src/modules/module_09000.c
+++ b/src/modules/module_09000.c
@@ -74,27 +74,48 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY
 {
   char *jit_build_options = NULL;
 
+  // this uses some nice feedback effect.
+  // based on the device_local_mem_size the reqd_work_group_size in the kernel is set to some value
+  // which is then is read from the opencl host in the kernel_preferred_wgs_multiple1/2/3 result.
+  // therefore we do not need to set module_kernel_threads_min/max except for CPU, where the threads are set to fixed 1.
+
   u32 fixed_local_size = 0;
 
-  if (device_param->device_type & CL_DEVICE_TYPE_CPU)
+  if (device_param->opencl_device_type & CL_DEVICE_TYPE_CPU)
   {
     fixed_local_size = 1;
   }
   else
   {
-    if (user_options->kernel_threads_chgd == true)
-    {
-      fixed_local_size = user_options->kernel_threads;
-    }
-    else
-    {
-      u32 overhead = 0;
+    u32 overhead = 0;
 
-      if (device_param->device_vendor_id == VENDOR_ID_NV)
+    if (device_param->opencl_device_vendor_id == VENDOR_ID_NV)
+    {
+      // note we need to use device_param->device_local_mem_size - 4 because opencl jit returns with:
+      // Entry function '...' uses too much shared data (0xc004 bytes, 0xc000 max)
+      // on my development system. no clue where the 4 bytes are spent.
+      // I did some research on this and it seems to be related with the datatype.
+      // For example, if i used u8 instead, there's only 1 byte wasted.
+
+      if (device_param->is_opencl == true)
       {
         overhead = 4;
       }
+    }
 
+    if (user_options->kernel_threads_chgd == true)
+    {
+      fixed_local_size = user_options->kernel_threads;
+
+      // otherwise out-of-bound reads
+
+      if ((fixed_local_size * 4096) > (device_param->device_local_mem_size - overhead))
+      {
+        fixed_local_size = (device_param->device_local_mem_size - overhead) / 4096;
+      }
+    }
+    else
+    {
       fixed_local_size = (device_param->device_local_mem_size - overhead) / 4096;
     }
   }
@@ -121,7 +142,7 @@ bool module_potfile_disable (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_
 bool module_unstable_warning (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, MAYBE_UNUSED const hc_device_param_t *device_param)
 {
   // OpenCL 1.2 pocl HSTR: pthread-x86_64-pc-linux-gnu-skylake: Segmentation fault
-  if (device_param->platform_vendor_id == VENDOR_ID_POCL)
+  if (device_param->opencl_platform_vendor_id == VENDOR_ID_POCL)
   {
     return true;
   }
diff --git a/src/modules/module_09200.c b/src/modules/module_09200.c
index c56010732..2e103343a 100644
--- a/src/modules/module_09200.c
+++ b/src/modules/module_09200.c
@@ -88,7 +88,7 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY
 {
   char *jit_build_options = NULL;
 
-  if (device_param->device_vendor_id == VENDOR_ID_NV)
+  if (device_param->opencl_device_vendor_id == VENDOR_ID_NV)
   {
     hc_asprintf (&jit_build_options, "-D NO_UNROLL");
   }
diff --git a/src/modules/module_09800.c b/src/modules/module_09800.c
index d9e428808..0a8107648 100644
--- a/src/modules/module_09800.c
+++ b/src/modules/module_09800.c
@@ -88,7 +88,7 @@ u32 module_pw_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED con
 bool module_unstable_warning (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, MAYBE_UNUSED const hc_device_param_t *device_param)
 {
   // OpenCL 1.2 pocl HSTR: pthread-x86_64-pc-linux-gnu-skylake: Segmentation fault
-  if (device_param->platform_vendor_id == VENDOR_ID_POCL)
+  if (device_param->opencl_platform_vendor_id == VENDOR_ID_POCL)
   {
     return true;
   }
diff --git a/src/modules/module_10700.c b/src/modules/module_10700.c
index 3e69fce27..dc5a16da0 100644
--- a/src/modules/module_10700.c
+++ b/src/modules/module_10700.c
@@ -109,13 +109,13 @@ u32 module_pw_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED con
 bool module_unstable_warning (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, MAYBE_UNUSED const hc_device_param_t *device_param)
 {
   // OpenCL 1.2 pocl HSTR: pthread-x86_64-pc-linux-gnu-skylake: Segmentation fault
-  if (device_param->platform_vendor_id == VENDOR_ID_POCL)
+  if (device_param->opencl_platform_vendor_id == VENDOR_ID_POCL)
   {
     return true;
   }
 
   // l_opencl_p_18.1.0.013: password not found
-  if (device_param->device_vendor_id == VENDOR_ID_INTEL_SDK)
+  if (device_param->opencl_device_vendor_id == VENDOR_ID_INTEL_SDK)
   {
     if ((hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL) == 0)
     {
@@ -124,7 +124,7 @@ bool module_unstable_warning (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE
   }
 
   // amdgpu-pro-18.50-708488-ubuntu-18.04: Segmentation fault
-  if ((device_param->device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
+  if ((device_param->opencl_device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
   {
     if ((hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL) == 1)
     {
@@ -133,7 +133,7 @@ bool module_unstable_warning (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE
   }
 
   // amdgpu-pro-18.50-708488-ubuntu-18.04: self-test failed.
-  if ((device_param->device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
+  if ((device_param->opencl_device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
   {
     if ((hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL) == 0)
     {
diff --git a/src/modules/module_10800.c b/src/modules/module_10800.c
index 840d81e95..7b653aecb 100644
--- a/src/modules/module_10800.c
+++ b/src/modules/module_10800.c
@@ -52,12 +52,12 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY
 {
   char *jit_build_options = NULL;
 
-  if (device_param->device_vendor_id == VENDOR_ID_NV)
+  if (device_param->opencl_device_vendor_id == VENDOR_ID_NV)
   {
     hc_asprintf (&jit_build_options, "-D NO_UNROLL");
   }
 
-  if ((device_param->device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
+  if ((device_param->opencl_device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
   {
     hc_asprintf (&jit_build_options, "-D NO_UNROLL");
   }
diff --git a/src/modules/module_10900.c b/src/modules/module_10900.c
index 00087a62c..2c2581c33 100644
--- a/src/modules/module_10900.c
+++ b/src/modules/module_10900.c
@@ -89,12 +89,12 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY
 {
   char *jit_build_options = NULL;
 
-  if (device_param->device_vendor_id == VENDOR_ID_NV)
+  if (device_param->opencl_device_vendor_id == VENDOR_ID_NV)
   {
     hc_asprintf (&jit_build_options, "-D NO_UNROLL");
   }
 
-  if ((device_param->device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
+  if ((device_param->opencl_device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
   {
     hc_asprintf (&jit_build_options, "-D NO_UNROLL");
   }
diff --git a/src/modules/module_11000.c b/src/modules/module_11000.c
index 3f1e12d04..e02b85a76 100644
--- a/src/modules/module_11000.c
+++ b/src/modules/module_11000.c
@@ -62,7 +62,7 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY
 {
   char *jit_build_options = NULL;
 
-  if (device_param->device_vendor_id == VENDOR_ID_NV)
+  if (device_param->opencl_device_vendor_id == VENDOR_ID_NV)
   {
     hc_asprintf (&jit_build_options, "-D NO_UNROLL");
   }
diff --git a/src/modules/module_11600.c b/src/modules/module_11600.c
index cc29b7d39..86165dfbb 100644
--- a/src/modules/module_11600.c
+++ b/src/modules/module_11600.c
@@ -322,7 +322,7 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY
 {
   char *jit_build_options = NULL;
 
-  if (device_param->device_vendor_id == VENDOR_ID_NV)
+  if (device_param->opencl_device_vendor_id == VENDOR_ID_NV)
   {
     hc_asprintf (&jit_build_options, "-D NO_UNROLL");
   }
@@ -333,7 +333,7 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY
 bool module_unstable_warning (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, MAYBE_UNUSED const hc_device_param_t *device_param)
 {
   // amdgpu-pro-18.50-708488-ubuntu-18.04: Segmentation fault
-  if ((device_param->device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
+  if ((device_param->opencl_device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
   {
     return true;
   }
diff --git a/src/modules/module_11700.c b/src/modules/module_11700.c
index 38105da7f..02b9af2d6 100644
--- a/src/modules/module_11700.c
+++ b/src/modules/module_11700.c
@@ -44,7 +44,7 @@ const char *module_st_pass        (MAYBE_UNUSED const hashconfig_t *hashconfig,
 bool module_unstable_warning (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, MAYBE_UNUSED const hc_device_param_t *device_param)
 {
   // amdgpu-pro-18.50-708488-ubuntu-18.04: CL_OUT_OF_RESOURCES
-  if ((device_param->device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
+  if ((device_param->opencl_device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
   {
     if ((hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL) == 0)
     {
diff --git a/src/modules/module_11750.c b/src/modules/module_11750.c
index c1cefdcde..1956e6c8c 100644
--- a/src/modules/module_11750.c
+++ b/src/modules/module_11750.c
@@ -44,7 +44,7 @@ const char *module_st_pass        (MAYBE_UNUSED const hashconfig_t *hashconfig,
 bool module_unstable_warning (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, MAYBE_UNUSED const hc_device_param_t *device_param)
 {
   // amdgpu-pro-18.50-708488-ubuntu-18.04: CL_OUT_OF_RESOURCES
-  if ((device_param->device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
+  if ((device_param->opencl_device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
   {
     return true;
   }
diff --git a/src/modules/module_11760.c b/src/modules/module_11760.c
index 9f0bf39e9..0fbcc4eeb 100644
--- a/src/modules/module_11760.c
+++ b/src/modules/module_11760.c
@@ -44,7 +44,7 @@ const char *module_st_pass        (MAYBE_UNUSED const hashconfig_t *hashconfig,
 bool module_unstable_warning (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, MAYBE_UNUSED const hc_device_param_t *device_param)
 {
   // amdgpu-pro-18.50-708488-ubuntu-18.04: CL_OUT_OF_RESOURCES
-  if ((device_param->device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
+  if ((device_param->opencl_device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
   {
     return true;
   }
diff --git a/src/modules/module_11800.c b/src/modules/module_11800.c
index 30c837067..f43fce332 100644
--- a/src/modules/module_11800.c
+++ b/src/modules/module_11800.c
@@ -44,7 +44,7 @@ const char *module_st_pass        (MAYBE_UNUSED const hashconfig_t *hashconfig,
 bool module_unstable_warning (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, MAYBE_UNUSED const hc_device_param_t *device_param)
 {
   // amdgpu-pro-18.50-708488-ubuntu-18.04: CL_OUT_OF_RESOURCES
-  if ((device_param->device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
+  if ((device_param->opencl_device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
   {
     if ((hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL) == 0)
     {
diff --git a/src/modules/module_11850.c b/src/modules/module_11850.c
index a68af6734..c18476582 100644
--- a/src/modules/module_11850.c
+++ b/src/modules/module_11850.c
@@ -44,7 +44,7 @@ const char *module_st_pass        (MAYBE_UNUSED const hashconfig_t *hashconfig,
 bool module_unstable_warning (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, MAYBE_UNUSED const hc_device_param_t *device_param)
 {
   // amdgpu-pro-18.50-708488-ubuntu-18.04: CL_OUT_OF_RESOURCES
-  if ((device_param->device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
+  if ((device_param->opencl_device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
   {
     return true;
   }
diff --git a/src/modules/module_11860.c b/src/modules/module_11860.c
index e64ec7128..92e2d632d 100644
--- a/src/modules/module_11860.c
+++ b/src/modules/module_11860.c
@@ -44,7 +44,7 @@ const char *module_st_pass        (MAYBE_UNUSED const hashconfig_t *hashconfig,
 bool module_unstable_warning (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, MAYBE_UNUSED const hc_device_param_t *device_param)
 {
   // amdgpu-pro-18.50-708488-ubuntu-18.04: CL_OUT_OF_RESOURCES
-  if ((device_param->device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
+  if ((device_param->opencl_device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
   {
     return true;
   }
diff --git a/src/modules/module_12100.c b/src/modules/module_12100.c
index c4b90fb63..4881c81c2 100644
--- a/src/modules/module_12100.c
+++ b/src/modules/module_12100.c
@@ -90,7 +90,7 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY
 {
   char *jit_build_options = NULL;
 
-  if (device_param->device_vendor_id == VENDOR_ID_AMD)
+  if (device_param->opencl_device_vendor_id == VENDOR_ID_AMD)
   {
     hc_asprintf (&jit_build_options, "-D NO_UNROLL");
   }
diff --git a/src/modules/module_12200.c b/src/modules/module_12200.c
index 979500a15..b7ed28edd 100644
--- a/src/modules/module_12200.c
+++ b/src/modules/module_12200.c
@@ -72,12 +72,12 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY
 {
   char *jit_build_options = NULL;
 
-  if (device_param->device_vendor_id == VENDOR_ID_NV)
+  if (device_param->opencl_device_vendor_id == VENDOR_ID_NV)
   {
     hc_asprintf (&jit_build_options, "-D NO_UNROLL");
   }
 
-  if ((device_param->device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
+  if ((device_param->opencl_device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
   {
     hc_asprintf (&jit_build_options, "-D NO_UNROLL");
   }
diff --git a/src/modules/module_12500.c b/src/modules/module_12500.c
index 0390de089..bbc30f8dc 100644
--- a/src/modules/module_12500.c
+++ b/src/modules/module_12500.c
@@ -94,7 +94,7 @@ const char *module_benchmark_mask (MAYBE_UNUSED const hashconfig_t *hashconfig,
 bool module_unstable_warning (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, MAYBE_UNUSED const hc_device_param_t *device_param)
 {
   // amdgpu-pro-18.50-708488-ubuntu-18.04: self-test failed
-  if ((device_param->device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
+  if ((device_param->opencl_device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
   {
     return true;
   }
diff --git a/src/modules/module_12800.c b/src/modules/module_12800.c
index ef6ae6f0f..4bc490512 100644
--- a/src/modules/module_12800.c
+++ b/src/modules/module_12800.c
@@ -74,12 +74,12 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY
 {
   char *jit_build_options = NULL;
 
-  if (device_param->device_vendor_id == VENDOR_ID_NV)
+  if (device_param->opencl_device_vendor_id == VENDOR_ID_NV)
   {
     hc_asprintf (&jit_build_options, "-D NO_UNROLL");
   }
 
-  if ((device_param->device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
+  if ((device_param->opencl_device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
   {
     hc_asprintf (&jit_build_options, "-D NO_UNROLL");
   }
diff --git a/src/modules/module_12900.c b/src/modules/module_12900.c
index 6821c1b7a..4fda260a7 100644
--- a/src/modules/module_12900.c
+++ b/src/modules/module_12900.c
@@ -74,12 +74,12 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY
 {
   char *jit_build_options = NULL;
 
-  if (device_param->device_vendor_id == VENDOR_ID_NV)
+  if (device_param->opencl_device_vendor_id == VENDOR_ID_NV)
   {
     hc_asprintf (&jit_build_options, "-D NO_UNROLL");
   }
 
-  if ((device_param->device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
+  if ((device_param->opencl_device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
   {
     hc_asprintf (&jit_build_options, "-D NO_UNROLL");
   }
diff --git a/src/modules/module_13000.c b/src/modules/module_13000.c
index 78cb7ca63..eeeae3aa2 100644
--- a/src/modules/module_13000.c
+++ b/src/modules/module_13000.c
@@ -87,12 +87,12 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY
 {
   char *jit_build_options = NULL;
 
-  if (device_param->device_vendor_id == VENDOR_ID_NV)
+  if (device_param->opencl_device_vendor_id == VENDOR_ID_NV)
   {
     hc_asprintf (&jit_build_options, "-D NO_UNROLL");
   }
 
-  if ((device_param->device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
+  if ((device_param->opencl_device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
   {
     hc_asprintf (&jit_build_options, "-D NO_UNROLL");
   }
diff --git a/src/modules/module_13100.c b/src/modules/module_13100.c
index 8619a0d19..5882af5a1 100644
--- a/src/modules/module_13100.c
+++ b/src/modules/module_13100.c
@@ -76,7 +76,7 @@ u32 module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYB
 bool module_unstable_warning (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, MAYBE_UNUSED const hc_device_param_t *device_param)
 {
   // amdgpu-pro-18.50-708488-ubuntu-18.04: CL_OUT_OF_RESOURCES
-  if ((device_param->device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
+  if ((device_param->opencl_device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
   {
     if ((hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL) == 0)
     {
diff --git a/src/modules/module_13400.c b/src/modules/module_13400.c
index 1d8c4f4ab..c251e1188 100644
--- a/src/modules/module_13400.c
+++ b/src/modules/module_13400.c
@@ -20,8 +20,7 @@ static const u32   HASH_CATEGORY  = HASH_CATEGORY_PASSWORD_MANAGER;
 static const char *HASH_NAME      = "KeePass 1 (AES/Twofish) and KeePass 2 (AES)";
 static const u64   KERN_TYPE      = 13400;
 static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE;
-static const u64   OPTS_TYPE      = OPTS_TYPE_PT_GENERATE_LE
-                                  | OPTS_TYPE_PREFERED_THREAD;
+static const u64   OPTS_TYPE      = OPTS_TYPE_PT_GENERATE_LE;
 static const u32   SALT_TYPE      = SALT_TYPE_EMBEDDED;
 static const char *ST_PASS        = "hashcat";
 static const char *ST_HASH        = "$keepass$*2*24569*0*c40432355cce7348c48053ceea0a28e7d18859c4ea47e3a799c6300861f64b95*265dafcc42e1537ff42e97e1e283c70014133be0fe2d420b4d24c6d57c9d2207*a00e20a852694c15aabb074d61b902fa*48dd553fb96f7996635f2414bfe6a1a8429ef0ffb71a1752abbef31853172c35*a44ae659958ad7fae8c8952cb83f3cf03fec2371ce22a8bf7fac1e687af2f249*1*64*5a26ea376cc5afc955104c334571d30486acbac512a94b75ca82a9e31dd97bf7";
diff --git a/src/modules/module_14100.c b/src/modules/module_14100.c
index 86b56ba6f..361eca88b 100644
--- a/src/modules/module_14100.c
+++ b/src/modules/module_14100.c
@@ -111,7 +111,7 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY
 {
   char *jit_build_options = NULL;
 
-  if ((device_param->device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
+  if ((device_param->opencl_device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
   {
     hc_asprintf (&jit_build_options, "-D NO_UNROLL");
   }
diff --git a/src/modules/module_14400.c b/src/modules/module_14400.c
index 784e3e30b..15581846c 100644
--- a/src/modules/module_14400.c
+++ b/src/modules/module_14400.c
@@ -59,7 +59,7 @@ u32 module_pw_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED con
 bool module_unstable_warning (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, MAYBE_UNUSED const hc_device_param_t *device_param)
 {
   // amdgpu-pro-18.50-708488-ubuntu-18.04: Segmentation fault
-  if ((device_param->device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
+  if ((device_param->opencl_device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
   {
     if ((hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL) == 1)
     {
diff --git a/src/modules/module_15000.c b/src/modules/module_15000.c
index 3e31238c2..4c2133d67 100644
--- a/src/modules/module_15000.c
+++ b/src/modules/module_15000.c
@@ -63,12 +63,12 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY
 {
   char *jit_build_options = NULL;
 
-  if (device_param->device_vendor_id == VENDOR_ID_NV)
+  if (device_param->opencl_device_vendor_id == VENDOR_ID_NV)
   {
     hc_asprintf (&jit_build_options, "-D NO_UNROLL");
   }
 
-  if ((device_param->device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
+  if ((device_param->opencl_device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
   {
     hc_asprintf (&jit_build_options, "-D NO_UNROLL");
   }
diff --git a/src/modules/module_15300.c b/src/modules/module_15300.c
index 90fdfad25..e14499356 100644
--- a/src/modules/module_15300.c
+++ b/src/modules/module_15300.c
@@ -22,8 +22,7 @@ static const char *HASH_NAME      = "DPAPI masterkey file v1";
 static const u64   KERN_TYPE      = 15300;
 static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE
                                   | OPTI_TYPE_SLOW_HASH_SIMD_LOOP;
-static const u64   OPTS_TYPE      = OPTS_TYPE_PT_GENERATE_LE
-                                  | OPTS_TYPE_PREFERED_THREAD;
+static const u64   OPTS_TYPE      = OPTS_TYPE_PT_GENERATE_LE;
 static const u32   SALT_TYPE      = SALT_TYPE_EMBEDDED;
 static const char *ST_PASS        = "hashcat";
 static const char *ST_HASH        = "$DPAPImk$1*1*S-15-21-466364039-425773974-453930460-1925*des3*sha1*24000*b038489dee5ad04e3e3cab4d957258b5*208*cb9b5b7d96a0d2a00305ca403d3fd9c47c561e35b4b2cf3aebfd1d3199a6481d56972be7ebd6c291b199e6f1c2ffaee91978706737e9b1209e6c7d3aa3d8c3c3e38ad1ccfa39400d62c2415961c17fd0bd6b0f7bbd49cc1de1a394e64b7237f56244238da8d37d78";
@@ -100,7 +99,7 @@ u32 module_pw_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED con
 bool module_unstable_warning (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, MAYBE_UNUSED const hc_device_param_t *device_param)
 {
   // amdgpu-pro-18.50-708488-ubuntu-18.04: self-test failed
-  if ((device_param->device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
+  if ((device_param->opencl_device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
   {
     return true;
   }
diff --git a/src/modules/module_15600.c b/src/modules/module_15600.c
index 12b939426..67a9388c6 100644
--- a/src/modules/module_15600.c
+++ b/src/modules/module_15600.c
@@ -90,12 +90,12 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY
 {
   char *jit_build_options = NULL;
 
-  if (device_param->device_vendor_id == VENDOR_ID_NV)
+  if (device_param->opencl_device_vendor_id == VENDOR_ID_NV)
   {
     hc_asprintf (&jit_build_options, "-D NO_UNROLL");
   }
 
-  if ((device_param->device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
+  if ((device_param->opencl_device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
   {
     hc_asprintf (&jit_build_options, "-D NO_UNROLL");
   }
diff --git a/src/modules/module_15700.c b/src/modules/module_15700.c
index 40c444fa7..258b1f7b0 100644
--- a/src/modules/module_15700.c
+++ b/src/modules/module_15700.c
@@ -248,7 +248,7 @@ u64 module_extra_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UN
 bool module_unstable_warning (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, MAYBE_UNUSED const hc_device_param_t *device_param)
 {
   // amdgpu-pro-18.50-708488-ubuntu-18.04: Segmentation fault
-  if ((device_param->device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
+  if ((device_param->opencl_device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
   {
     return true;
   }
diff --git a/src/modules/module_15900.c b/src/modules/module_15900.c
index f239dc7a7..2b19f5213 100644
--- a/src/modules/module_15900.c
+++ b/src/modules/module_15900.c
@@ -100,7 +100,7 @@ u32 module_pw_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED con
 bool module_unstable_warning (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, MAYBE_UNUSED const hc_device_param_t *device_param)
 {
   // amdgpu-pro-18.50-708488-ubuntu-18.04: self-test failed
-  if ((device_param->device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
+  if ((device_param->opencl_device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
   {
     return true;
   }
diff --git a/src/modules/module_16200.c b/src/modules/module_16200.c
index 827a2d1d4..601156681 100644
--- a/src/modules/module_16200.c
+++ b/src/modules/module_16200.c
@@ -80,12 +80,12 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY
 {
   char *jit_build_options = NULL;
 
-  if (device_param->device_vendor_id == VENDOR_ID_NV)
+  if (device_param->opencl_device_vendor_id == VENDOR_ID_NV)
   {
     hc_asprintf (&jit_build_options, "-D NO_UNROLL");
   }
 
-  if ((device_param->device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
+  if ((device_param->opencl_device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
   {
     hc_asprintf (&jit_build_options, "-D NO_UNROLL");
   }
diff --git a/src/modules/module_16300.c b/src/modules/module_16300.c
index 4f2fe3752..433a804df 100644
--- a/src/modules/module_16300.c
+++ b/src/modules/module_16300.c
@@ -81,12 +81,12 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY
 {
   char *jit_build_options = NULL;
 
-  if (device_param->device_vendor_id == VENDOR_ID_NV)
+  if (device_param->opencl_device_vendor_id == VENDOR_ID_NV)
   {
     hc_asprintf (&jit_build_options, "-D NO_UNROLL");
   }
 
-  if ((device_param->device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
+  if ((device_param->opencl_device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
   {
     hc_asprintf (&jit_build_options, "-D NO_UNROLL");
   }
diff --git a/src/modules/module_16700.c b/src/modules/module_16700.c
index 829d32db1..91c20048f 100644
--- a/src/modules/module_16700.c
+++ b/src/modules/module_16700.c
@@ -80,12 +80,12 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY
 {
   char *jit_build_options = NULL;
 
-  if (device_param->device_vendor_id == VENDOR_ID_NV)
+  if (device_param->opencl_device_vendor_id == VENDOR_ID_NV)
   {
     hc_asprintf (&jit_build_options, "-D NO_UNROLL");
   }
 
-  if ((device_param->device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
+  if ((device_param->opencl_device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
   {
     hc_asprintf (&jit_build_options, "-D NO_UNROLL");
   }
diff --git a/src/modules/module_16900.c b/src/modules/module_16900.c
index e936485ed..b4ef8258a 100644
--- a/src/modules/module_16900.c
+++ b/src/modules/module_16900.c
@@ -91,12 +91,12 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY
 {
   char *jit_build_options = NULL;
 
-  if (device_param->device_vendor_id == VENDOR_ID_NV)
+  if (device_param->opencl_device_vendor_id == VENDOR_ID_NV)
   {
     hc_asprintf (&jit_build_options, "-D NO_UNROLL");
   }
 
-  if ((device_param->device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
+  if ((device_param->opencl_device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
   {
     hc_asprintf (&jit_build_options, "-D NO_UNROLL");
   }
diff --git a/src/modules/module_17300.c b/src/modules/module_17300.c
index 14d6d92c2..0778a8f5d 100644
--- a/src/modules/module_17300.c
+++ b/src/modules/module_17300.c
@@ -48,7 +48,7 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY
   char *jit_build_options = NULL;
 
   // -Wpass-failed=transform-warning
-  if ((device_param->device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == true))
+  if ((device_param->opencl_device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == true))
   {
     hc_asprintf (&jit_build_options, "-D NO_UNROLL");
   }
diff --git a/src/modules/module_17400.c b/src/modules/module_17400.c
index ecee00700..171c3c5b4 100644
--- a/src/modules/module_17400.c
+++ b/src/modules/module_17400.c
@@ -48,7 +48,7 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY
   char *jit_build_options = NULL;
 
   // -Wpass-failed=transform-warning
-  if ((device_param->device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == true))
+  if ((device_param->opencl_device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == true))
   {
     hc_asprintf (&jit_build_options, "-D NO_UNROLL");
   }
diff --git a/src/modules/module_17500.c b/src/modules/module_17500.c
index 98ed35e10..9b663af2a 100644
--- a/src/modules/module_17500.c
+++ b/src/modules/module_17500.c
@@ -48,7 +48,7 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY
   char *jit_build_options = NULL;
 
   // -Wpass-failed=transform-warning
-  if ((device_param->device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == true))
+  if ((device_param->opencl_device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == true))
   {
     hc_asprintf (&jit_build_options, "-D NO_UNROLL");
   }
diff --git a/src/modules/module_17600.c b/src/modules/module_17600.c
index 0c14c6505..9c58a09ec 100644
--- a/src/modules/module_17600.c
+++ b/src/modules/module_17600.c
@@ -48,7 +48,7 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY
   char *jit_build_options = NULL;
 
   // -Wpass-failed=transform-warning
-  if ((device_param->device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == true))
+  if ((device_param->opencl_device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == true))
   {
     hc_asprintf (&jit_build_options, "-D NO_UNROLL");
   }
diff --git a/src/modules/module_17700.c b/src/modules/module_17700.c
index 698c4ccfb..f4ade222b 100644
--- a/src/modules/module_17700.c
+++ b/src/modules/module_17700.c
@@ -48,7 +48,7 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY
   char *jit_build_options = NULL;
 
   // -Wpass-failed=transform-warning
-  if ((device_param->device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == true))
+  if ((device_param->opencl_device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == true))
   {
     hc_asprintf (&jit_build_options, "-D NO_UNROLL");
   }
diff --git a/src/modules/module_17800.c b/src/modules/module_17800.c
index 2499549b1..6b68526ac 100644
--- a/src/modules/module_17800.c
+++ b/src/modules/module_17800.c
@@ -48,7 +48,7 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY
   char *jit_build_options = NULL;
 
   // -Wpass-failed=transform-warning
-  if ((device_param->device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == true))
+  if ((device_param->opencl_device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == true))
   {
     hc_asprintf (&jit_build_options, "-D NO_UNROLL");
   }
diff --git a/src/modules/module_17900.c b/src/modules/module_17900.c
index 0d37632bb..d11d902fa 100644
--- a/src/modules/module_17900.c
+++ b/src/modules/module_17900.c
@@ -48,7 +48,7 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY
   char *jit_build_options = NULL;
 
   // -Wpass-failed=transform-warning
-  if ((device_param->device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == true))
+  if ((device_param->opencl_device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == true))
   {
     hc_asprintf (&jit_build_options, "-D NO_UNROLL");
   }
diff --git a/src/modules/module_18000.c b/src/modules/module_18000.c
index 7bc209743..f74c91160 100644
--- a/src/modules/module_18000.c
+++ b/src/modules/module_18000.c
@@ -48,7 +48,7 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY
   char *jit_build_options = NULL;
 
   // -Wpass-failed=transform-warning
-  if ((device_param->device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == true))
+  if ((device_param->opencl_device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == true))
   {
     hc_asprintf (&jit_build_options, "-D NO_UNROLL");
   }
diff --git a/src/modules/module_18100.c b/src/modules/module_18100.c
index 2d4c5041e..69f2d4d32 100644
--- a/src/modules/module_18100.c
+++ b/src/modules/module_18100.c
@@ -53,7 +53,7 @@ int module_build_plain_postprocess (MAYBE_UNUSED const hashconfig_t *hashconfig,
 bool module_unstable_warning (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, MAYBE_UNUSED const hc_device_param_t *device_param)
 {
   // amdgpu-pro-18.50-708488-ubuntu-18.04: Segmentation fault
-  if ((device_param->device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
+  if ((device_param->opencl_device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
   {
     return true;
   }
diff --git a/src/modules/module_18200.c b/src/modules/module_18200.c
index 22b1549dc..c188c8227 100644
--- a/src/modules/module_18200.c
+++ b/src/modules/module_18200.c
@@ -78,7 +78,7 @@ u32 module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYB
 bool module_unstable_warning (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, MAYBE_UNUSED const hc_device_param_t *device_param)
 {
   // amdgpu-pro-18.50-708488-ubuntu-18.04: CL_OUT_OF_RESOURCES
-  if ((device_param->device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
+  if ((device_param->opencl_device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
   {
     if ((hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL) == 0)
     {
diff --git a/src/modules/module_18300.c b/src/modules/module_18300.c
index 8602329d1..7c3624133 100644
--- a/src/modules/module_18300.c
+++ b/src/modules/module_18300.c
@@ -80,12 +80,12 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY
 {
   char *jit_build_options = NULL;
 
-  if (device_param->device_vendor_id == VENDOR_ID_NV)
+  if (device_param->opencl_device_vendor_id == VENDOR_ID_NV)
   {
     hc_asprintf (&jit_build_options, "-D NO_UNROLL");
   }
 
-  if ((device_param->device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
+  if ((device_param->opencl_device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
   {
     hc_asprintf (&jit_build_options, "-D NO_UNROLL");
   }
diff --git a/src/modules/module_18600.c b/src/modules/module_18600.c
index 62ee9cdb9..663717538 100644
--- a/src/modules/module_18600.c
+++ b/src/modules/module_18600.c
@@ -66,27 +66,48 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY
 {
   char *jit_build_options = NULL;
 
+  // this uses some nice feedback effect.
+  // based on the device_local_mem_size the reqd_work_group_size in the kernel is set to some value
+  // which is then is read from the opencl host in the kernel_preferred_wgs_multiple1/2/3 result.
+  // therefore we do not need to set module_kernel_threads_min/max except for CPU, where the threads are set to fixed 1.
+
   u32 fixed_local_size = 0;
 
-  if (device_param->device_type & CL_DEVICE_TYPE_CPU)
+  if (device_param->opencl_device_type & CL_DEVICE_TYPE_CPU)
   {
     fixed_local_size = 1;
   }
   else
   {
-    if (user_options->kernel_threads_chgd == true)
-    {
-      fixed_local_size = user_options->kernel_threads;
-    }
-    else
-    {
-      u32 overhead = 0;
+    u32 overhead = 0;
 
-      if (device_param->device_vendor_id == VENDOR_ID_NV)
+    if (device_param->opencl_device_vendor_id == VENDOR_ID_NV)
+    {
+      // note we need to use device_param->device_local_mem_size - 4 because opencl jit returns with:
+      // Entry function '...' uses too much shared data (0xc004 bytes, 0xc000 max)
+      // on my development system. no clue where the 4 bytes are spent.
+      // I did some research on this and it seems to be related with the datatype.
+      // For example, if i used u8 instead, there's only 1 byte wasted.
+
+      if (device_param->is_opencl == true)
       {
         overhead = 4;
       }
+    }
 
+    if (user_options->kernel_threads_chgd == true)
+    {
+      fixed_local_size = user_options->kernel_threads;
+
+      // otherwise out-of-bound reads
+
+      if ((fixed_local_size * 4096) > (device_param->device_local_mem_size - overhead))
+      {
+        fixed_local_size = (device_param->device_local_mem_size - overhead) / 4096;
+      }
+    }
+    else
+    {
       fixed_local_size = (device_param->device_local_mem_size - overhead) / 4096;
     }
   }
@@ -123,7 +144,7 @@ u32 module_pw_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED con
 bool module_unstable_warning (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, MAYBE_UNUSED const hc_device_param_t *device_param)
 {
   // OpenCL 1.2 pocl HSTR: pthread-x86_64-pc-linux-gnu-skylake: self-test failed
-  if (device_param->platform_vendor_id == VENDOR_ID_POCL)
+  if (device_param->opencl_platform_vendor_id == VENDOR_ID_POCL)
   {
     return true;
   }
diff --git a/src/modules/module_19100.c b/src/modules/module_19100.c
index 8214f7622..44f6275b4 100644
--- a/src/modules/module_19100.c
+++ b/src/modules/module_19100.c
@@ -61,12 +61,12 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY
 {
   char *jit_build_options = NULL;
 
-  if (device_param->device_vendor_id == VENDOR_ID_NV)
+  if (device_param->opencl_device_vendor_id == VENDOR_ID_NV)
   {
     hc_asprintf (&jit_build_options, "-D NO_UNROLL");
   }
 
-  if ((device_param->device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
+  if ((device_param->opencl_device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
   {
     hc_asprintf (&jit_build_options, "-D NO_UNROLL");
   }
diff --git a/src/modules/module_19200.c b/src/modules/module_19200.c
index bd768e5f6..77a1c80bf 100644
--- a/src/modules/module_19200.c
+++ b/src/modules/module_19200.c
@@ -63,12 +63,12 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY
 {
   char *jit_build_options = NULL;
 
-  if (device_param->device_vendor_id == VENDOR_ID_NV)
+  if (device_param->opencl_device_vendor_id == VENDOR_ID_NV)
   {
     hc_asprintf (&jit_build_options, "-D NO_UNROLL");
   }
 
-  if (device_param->device_vendor_id == VENDOR_ID_AMD)
+  if (device_param->opencl_device_vendor_id == VENDOR_ID_AMD)
   {
     hc_asprintf (&jit_build_options, "-D NO_UNROLL");
   }
diff --git a/src/monitor.c b/src/monitor.c
index 2b942890f..6317d6f70 100644
--- a/src/monitor.c
+++ b/src/monitor.c
@@ -44,7 +44,7 @@ static int monitor (hashcat_ctx_t *hashcat_ctx)
 {
   hashes_t       *hashes        = hashcat_ctx->hashes;
   hwmon_ctx_t    *hwmon_ctx     = hashcat_ctx->hwmon_ctx;
-  opencl_ctx_t   *opencl_ctx    = hashcat_ctx->opencl_ctx;
+  backend_ctx_t  *backend_ctx   = hashcat_ctx->backend_ctx;
   restore_ctx_t  *restore_ctx   = hashcat_ctx->restore_ctx;
   status_ctx_t   *status_ctx    = hashcat_ctx->status_ctx;
   user_options_t *user_options  = hashcat_ctx->user_options;
@@ -114,33 +114,33 @@ static int monitor (hashcat_ctx_t *hashcat_ctx)
     {
       hc_thread_mutex_lock (status_ctx->mux_hwmon);
 
-      for (u32 device_id = 0; device_id < opencl_ctx->devices_cnt; device_id++)
+      for (int backend_devices_idx = 0; backend_devices_idx < backend_ctx->backend_devices_cnt; backend_devices_idx++)
       {
-        hc_device_param_t *device_param = &opencl_ctx->devices_param[device_id];
+        hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
 
         if (device_param->skipped == true) continue;
 
-        if ((opencl_ctx->devices_param[device_id].device_type & CL_DEVICE_TYPE_GPU) == 0) continue;
+        if ((backend_ctx->devices_param[backend_devices_idx].opencl_device_type & CL_DEVICE_TYPE_GPU) == 0) continue;
 
-        const int temperature = hm_get_temperature_with_device_id (hashcat_ctx, device_id);
+        const int temperature = hm_get_temperature_with_devices_idx (hashcat_ctx, backend_devices_idx);
 
         if (temperature > (int) user_options->hwmon_temp_abort)
         {
-          EVENT_DATA (EVENT_MONITOR_TEMP_ABORT, &device_id, sizeof (u32));
+          EVENT_DATA (EVENT_MONITOR_TEMP_ABORT, &backend_devices_idx, sizeof (int));
 
           myabort (hashcat_ctx);
         }
       }
 
-      for (u32 device_id = 0; device_id < opencl_ctx->devices_cnt; device_id++)
+      for (int backend_devices_idx = 0; backend_devices_idx < backend_ctx->backend_devices_cnt; backend_devices_idx++)
       {
-        hc_device_param_t *device_param = &opencl_ctx->devices_param[device_id];
+        hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
 
         if (device_param->skipped == true) continue;
 
         if (device_param->skipped_warning == true) continue;
 
-        const int rc_throttle = hm_get_throttle_with_device_id (hashcat_ctx, device_id);
+        const int rc_throttle = hm_get_throttle_with_devices_idx (hashcat_ctx, backend_devices_idx);
 
         if (rc_throttle == -1) continue;
 
@@ -148,9 +148,9 @@ static int monitor (hashcat_ctx_t *hashcat_ctx)
         {
           slowdown_warnings++;
 
-          if (slowdown_warnings == 1) EVENT_DATA (EVENT_MONITOR_THROTTLE1, &device_id, sizeof (u32));
-          if (slowdown_warnings == 2) EVENT_DATA (EVENT_MONITOR_THROTTLE2, &device_id, sizeof (u32));
-          if (slowdown_warnings == 3) EVENT_DATA (EVENT_MONITOR_THROTTLE3, &device_id, sizeof (u32));
+          if (slowdown_warnings == 1) EVENT_DATA (EVENT_MONITOR_THROTTLE1, &backend_devices_idx, sizeof (int));
+          if (slowdown_warnings == 2) EVENT_DATA (EVENT_MONITOR_THROTTLE2, &backend_devices_idx, sizeof (int));
+          if (slowdown_warnings == 3) EVENT_DATA (EVENT_MONITOR_THROTTLE3, &backend_devices_idx, sizeof (int));
         }
         else
         {
@@ -232,9 +232,9 @@ static int monitor (hashcat_ctx_t *hashcat_ctx)
 
       hc_thread_mutex_lock (status_ctx->mux_hwmon);
 
-      for (u32 device_id = 0; device_id < opencl_ctx->devices_cnt; device_id++)
+      for (int backend_devices_idx = 0; backend_devices_idx < backend_ctx->backend_devices_cnt; backend_devices_idx++)
       {
-        hc_device_param_t *device_param = &opencl_ctx->devices_param[device_id];
+        hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
 
         if (device_param->skipped == true) continue;
 
@@ -242,11 +242,11 @@ static int monitor (hashcat_ctx_t *hashcat_ctx)
 
         exec_cnt++;
 
-        const double exec = status_get_exec_msec_dev (hashcat_ctx, device_id);
+        const double exec = status_get_exec_msec_dev (hashcat_ctx, backend_devices_idx);
 
         exec_total += exec;
 
-        const int util = hm_get_utilization_with_device_id (hashcat_ctx, device_id);
+        const int util = hm_get_utilization_with_devices_idx (hashcat_ctx, backend_devices_idx);
 
         if (util == -1) continue;
 
diff --git a/src/mpsp.c b/src/mpsp.c
index a5072776e..c3a54e47d 100644
--- a/src/mpsp.c
+++ b/src/mpsp.c
@@ -11,7 +11,7 @@
 #include "logfile.h"
 #include "convert.h"
 #include "filehandling.h"
-#include "opencl.h"
+#include "backend.h"
 #include "shared.h"
 #include "ext_lzma.h"
 #include "mpsp.h"
@@ -1224,7 +1224,7 @@ int mask_ctx_update_loop (hashcat_ctx_t *hashcat_ctx)
           return -1;
         }
 
-        const int rc_update_mp = opencl_session_update_mp (hashcat_ctx);
+        const int rc_update_mp = backend_session_update_mp (hashcat_ctx);
 
         if (rc_update_mp == -1) return -1;
       }
@@ -1257,13 +1257,13 @@ int mask_ctx_update_loop (hashcat_ctx_t *hashcat_ctx)
           return -1;
         }
 
-        const int rc_update_mp = opencl_session_update_mp (hashcat_ctx);
+        const int rc_update_mp = backend_session_update_mp (hashcat_ctx);
 
         if (rc_update_mp == -1) return -1;
       }
     }
 
-    const int rc_update_combinator = opencl_session_update_combinator (hashcat_ctx);
+    const int rc_update_combinator = backend_session_update_combinator (hashcat_ctx);
 
     if (rc_update_combinator == -1) return -1;
   }
@@ -1378,7 +1378,7 @@ int mask_ctx_update_loop (hashcat_ctx_t *hashcat_ctx)
         return -1;
       }
 
-      const int rc_update_mp_rl = opencl_session_update_mp_rl (hashcat_ctx, css_cnt_lr[0], css_cnt_lr[1]);
+      const int rc_update_mp_rl = backend_session_update_mp_rl (hashcat_ctx, css_cnt_lr[0], css_cnt_lr[1]);
 
       if (rc_update_mp_rl == -1) return -1;
     }
@@ -1398,7 +1398,7 @@ int mask_ctx_init (hashcat_ctx_t *hashcat_ctx)
 
   if (user_options->example_hashes == true) return 0;
   if (user_options->left           == true) return 0;
-  if (user_options->opencl_info    == true) return 0;
+  if (user_options->backend_info   == true) return 0;
   if (user_options->show           == true) return 0;
   if (user_options->usage          == true) return 0;
   if (user_options->version        == true) return 0;
diff --git a/src/opencl.c b/src/opencl.c
deleted file mode 100644
index 4086a00f6..000000000
--- a/src/opencl.c
+++ /dev/null
@@ -1,7219 +0,0 @@
-/**
- * Author......: See docs/credits.txt
- * License.....: MIT
- */
-
-#include "common.h"
-#include "types.h"
-#include "memory.h"
-#include "locking.h"
-#include "thread.h"
-#include "timer.h"
-#include "tuningdb.h"
-#include "rp.h"
-#include "rp_cpu.h"
-#include "mpsp.h"
-#include "convert.h"
-#include "stdout.h"
-#include "filehandling.h"
-#include "wordlist.h"
-#include "shared.h"
-#include "hashes.h"
-#include "emu_inc_hash_md5.h"
-#include "event.h"
-#include "dynloader.h"
-#include "opencl.h"
-
-#if defined (__linux__)
-static const char *dri_card0_path = "/dev/dri/card0";
-
-static const char *drm_card0_vendor_path = "/sys/class/drm/card0/device/vendor";
-static const char *drm_card0_driver_path = "/sys/class/drm/card0/device/driver";
-#endif
-
-static const u32 full01 = 0x01010101;
-static const u32 full06 = 0x06060606;
-static const u32 full80 = 0x80808080;
-
-static double TARGET_MSEC_PROFILE[4] = { 2, 12, 96, 480 };
-
-static int ocl_check_dri (MAYBE_UNUSED hashcat_ctx_t *hashcat_ctx)
-{
-  #if defined (__linux__)
-
-  // This check makes sense only if we're not root
-
-  const uid_t uid = getuid ();
-
-  if (uid == 0) return 0;
-
-  // No GPU available! That's fine, so we don't need to check if we have access to it.
-
-  if (hc_path_exist (dri_card0_path) == false) return 0;
-
-  // Now we need to check if this an AMD vendor, because this is when the problems start
-
-  FILE *fd_drm = fopen (drm_card0_vendor_path, "rb");
-
-  if (fd_drm == NULL) return 0;
-
-  u32 vendor = 0;
-
-  if (fscanf (fd_drm, "0x%x", &vendor) != 1)
-  {
-    fclose (fd_drm);
-
-    return 0;
-  }
-
-  fclose (fd_drm);
-
-  if (vendor != 4098) return 0;
-
-  // Now the problem is only with AMDGPU-PRO, not with oldschool AMD driver
-
-  char buf[HCBUFSIZ_TINY];
-
-  const ssize_t len = readlink (drm_card0_driver_path, buf, HCBUFSIZ_TINY - 1);
-
-  if (len == -1) return 0;
-
-  buf[len] = 0;
-
-  if (strstr (buf, "amdgpu") == NULL) return 0;
-
-  // Now do the real check
-
-  FILE *fd_dri = fopen (dri_card0_path, "rb");
-
-  if (fd_dri == NULL)
-  {
-    event_log_error (hashcat_ctx, "Cannot access %s: %m.", dri_card0_path);
-
-    event_log_warning (hashcat_ctx, "This causes some drivers to crash when OpenCL is used!");
-    event_log_warning (hashcat_ctx, "Adding your user to the \"video\" group usually fixes this problem:");
-    event_log_warning (hashcat_ctx, "$ sudo usermod -a -G video $LOGNAME");
-    event_log_warning (hashcat_ctx, NULL);
-
-    return -1;
-  }
-
-  fclose (fd_dri);
-
-  #endif // __linux__
-
-  return 0;
-}
-
-static bool setup_opencl_platforms_filter (hashcat_ctx_t *hashcat_ctx, const char *opencl_platforms, u64 *out)
-{
-  u64 opencl_platforms_filter = 0;
-
-  if (opencl_platforms)
-  {
-    char *platforms = hcstrdup (opencl_platforms);
-
-    if (platforms == NULL) return false;
-
-    char *saveptr = NULL;
-
-    char *next = strtok_r (platforms, ",", &saveptr);
-
-    do
-    {
-      const int platform = (const int) strtol (next, NULL, 10);
-
-      if (platform <= 0 || platform >= 64)
-      {
-        event_log_error (hashcat_ctx, "Invalid OpenCL platform %d specified.", platform);
-
-        hcfree (platforms);
-
-        return false;
-      }
-
-      opencl_platforms_filter |= 1ULL << (platform - 1);
-
-    } while ((next = strtok_r ((char *) NULL, ",", &saveptr)) != NULL);
-
-    hcfree (platforms);
-  }
-  else
-  {
-    opencl_platforms_filter = -1ULL;
-  }
-
-  *out = opencl_platforms_filter;
-
-  return true;
-}
-
-static bool setup_devices_filter (hashcat_ctx_t *hashcat_ctx, const char *opencl_devices, u64 *out)
-{
-  u64 devices_filter = 0;
-
-  if (opencl_devices)
-  {
-    char *devices = hcstrdup (opencl_devices);
-
-    if (devices == NULL) return false;
-
-    char *saveptr = NULL;
-
-    char *next = strtok_r (devices, ",", &saveptr);
-
-    do
-    {
-      const int device_id = (const int) strtol (next, NULL, 10);
-
-      if ((device_id <= 0) || (device_id >= 64))
-      {
-        event_log_error (hashcat_ctx, "Invalid device_id %d specified.", device_id);
-
-        hcfree (devices);
-
-        return false;
-      }
-
-      devices_filter |= 1ULL << (device_id - 1);
-
-    } while ((next = strtok_r ((char *) NULL, ",", &saveptr)) != NULL);
-
-    hcfree (devices);
-  }
-  else
-  {
-    devices_filter = -1ULL;
-  }
-
-  *out = devices_filter;
-
-  return true;
-}
-
-static bool setup_device_types_filter (hashcat_ctx_t *hashcat_ctx, const char *opencl_device_types, cl_device_type *out)
-{
-  cl_device_type device_types_filter = 0;
-
-  if (opencl_device_types)
-  {
-    char *device_types = hcstrdup (opencl_device_types);
-
-    if (device_types == NULL) return false;
-
-    char *saveptr = NULL;
-
-    char *next = strtok_r (device_types, ",", &saveptr);
-
-    do
-    {
-      const int device_type = (const int) strtol (next, NULL, 10);
-
-      if (device_type < 1 || device_type > 3)
-      {
-        event_log_error (hashcat_ctx, "Invalid device_type %d specified.", device_type);
-
-        hcfree (device_types);
-
-        return false;
-      }
-
-      device_types_filter |= 1u << device_type;
-
-    } while ((next = strtok_r (NULL, ",", &saveptr)) != NULL);
-
-    hcfree (device_types);
-  }
-  else
-  {
-    // Do not use CPU by default, this often reduces GPU performance because
-    // the CPU is too busy to handle GPU synchronization
-
-    device_types_filter = CL_DEVICE_TYPE_ALL & ~CL_DEVICE_TYPE_CPU;
-  }
-
-  *out = device_types_filter;
-
-  return true;
-}
-
-static bool read_kernel_binary (hashcat_ctx_t *hashcat_ctx, const char *kernel_file, size_t *kernel_lengths, char **kernel_sources, const bool force_recompile)
-{
-  FILE *fp = fopen (kernel_file, "rb");
-
-  if (fp != NULL)
-  {
-    struct stat st;
-
-    if (stat (kernel_file, &st))
-    {
-      fclose (fp);
-
-      return false;
-    }
-
-    #define EXTRASZ 100
-
-    char *buf = (char *) hcmalloc (st.st_size + 1 + EXTRASZ);
-
-    size_t num_read = hc_fread (buf, sizeof (char), st.st_size, fp);
-
-    fclose (fp);
-
-    if (num_read != (size_t) st.st_size)
-    {
-      event_log_error (hashcat_ctx, "%s: %s", kernel_file, strerror (errno));
-
-      hcfree (buf);
-
-      return false;
-    }
-
-    buf[st.st_size] = 0;
-
-    if (force_recompile == true)
-    {
-      // this adds some hopefully unique data to the opencl kernel source
-      // the effect should be that opencl kernel compiler caching see this as new "uncached" source
-      // we have to do this since they do not check for the changes only in the #include source
-
-      time_t tlog = time (NULL);
-
-      const int extra_len = snprintf (buf + st.st_size, EXTRASZ, "\n//%u\n", (u32) tlog);
-
-      st.st_size += extra_len;
-    }
-
-    kernel_lengths[0] = (size_t) st.st_size;
-
-    kernel_sources[0] = buf;
-  }
-  else
-  {
-    event_log_error (hashcat_ctx, "%s: %s", kernel_file, strerror (errno));
-
-    return false;
-  }
-
-  return true;
-}
-
-static bool write_kernel_binary (hashcat_ctx_t *hashcat_ctx, char *kernel_file, char *binary, size_t binary_size)
-{
-  if (binary_size > 0)
-  {
-    FILE *fp = fopen (kernel_file, "wb");
-
-    if (fp == NULL)
-    {
-      event_log_error (hashcat_ctx, "%s: %s", kernel_file, strerror (errno));
-
-      return false;
-    }
-
-    if (lock_file (fp) == -1)
-    {
-      fclose (fp);
-
-      event_log_error (hashcat_ctx, "%s: %s", kernel_file, strerror (errno));
-
-      return false;
-    }
-
-    hc_fwrite (binary, sizeof (char), binary_size, fp);
-
-    fflush (fp);
-
-    fclose (fp);
-  }
-
-  return true;
-}
-
-static bool test_instruction (hashcat_ctx_t *hashcat_ctx, cl_context context, cl_device_id device, const char *kernel_buf)
-{
-  int CL_rc;
-
-  cl_program program;
-
-  CL_rc = hc_clCreateProgramWithSource (hashcat_ctx, context, 1, &kernel_buf, NULL, &program);
-
-  if (CL_rc == -1) return false;
-
-  opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
-
-  OCL_PTR *ocl = opencl_ctx->ocl;
-
-  // LLVM seems to write an error message (if there's an error) directly to stderr
-  // and not (as supposted to) into buffer for later request using clGetProgramBuildInfo()
-
-  #ifndef DEBUG
-  #ifndef _WIN
-  fflush (stderr);
-  int bak = dup (2);
-  int tmp = open ("/dev/null", O_WRONLY);
-  dup2 (tmp, 2);
-  close (tmp);
-  #endif
-  #endif
-
-  CL_rc = ocl->clBuildProgram (program, 1, &device, "-Werror", NULL, NULL); // do not use the wrapper to avoid the error message
-
-  #ifndef DEBUG
-  #ifndef _WIN
-  fflush (stderr);
-  dup2 (bak, 2);
-  close (bak);
-  #endif
-  #endif
-
-  if (CL_rc != CL_SUCCESS)
-  {
-    #if defined (DEBUG)
-
-    event_log_error (hashcat_ctx, "clBuildProgram(): %s", val2cstr_cl (CL_rc));
-
-    size_t build_log_size = 0;
-
-    hc_clGetProgramBuildInfo (hashcat_ctx, program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &build_log_size);
-
-    char *build_log = (char *) hcmalloc (build_log_size + 1);
-
-    hc_clGetProgramBuildInfo (hashcat_ctx, program, device, CL_PROGRAM_BUILD_LOG, build_log_size, build_log, NULL);
-
-    build_log[build_log_size] = 0;
-
-    puts (build_log);
-
-    hcfree (build_log);
-
-    #endif
-
-    hc_clReleaseProgram (hashcat_ctx, program);
-
-    return false;
-  }
-
-  CL_rc = hc_clReleaseProgram (hashcat_ctx, program);
-
-  if (CL_rc == -1) return false;
-
-  return true;
-}
-
-void generate_source_kernel_filename (const bool slow_candidates, const u32 attack_exec, const u32 attack_kern, const u32 kern_type, const u32 opti_type, char *shared_dir, char *source_file)
-{
-  if (opti_type & OPTI_TYPE_OPTIMIZED_KERNEL)
-  {
-    if (attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
-    {
-      if (slow_candidates == true)
-      {
-        snprintf (source_file, 255, "%s/OpenCL/m%05d_a0-optimized.cl", shared_dir, (int) kern_type);
-      }
-      else
-      {
-        if (attack_kern == ATTACK_KERN_STRAIGHT)
-          snprintf (source_file, 255, "%s/OpenCL/m%05d_a0-optimized.cl", shared_dir, (int) kern_type);
-        else if (attack_kern == ATTACK_KERN_COMBI)
-          snprintf (source_file, 255, "%s/OpenCL/m%05d_a1-optimized.cl", shared_dir, (int) kern_type);
-        else if (attack_kern == ATTACK_KERN_BF)
-          snprintf (source_file, 255, "%s/OpenCL/m%05d_a3-optimized.cl", shared_dir, (int) kern_type);
-        else if (attack_kern == ATTACK_KERN_NONE)
-          snprintf (source_file, 255, "%s/OpenCL/m%05d_a0-optimized.cl", shared_dir, (int) kern_type);
-      }
-    }
-    else
-    {
-      snprintf (source_file, 255, "%s/OpenCL/m%05d-optimized.cl", shared_dir, (int) kern_type);
-    }
-  }
-  else
-  {
-    if (attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
-    {
-      if (slow_candidates == true)
-      {
-        snprintf (source_file, 255, "%s/OpenCL/m%05d_a0-pure.cl", shared_dir, (int) kern_type);
-      }
-      else
-      {
-        if (attack_kern == ATTACK_KERN_STRAIGHT)
-          snprintf (source_file, 255, "%s/OpenCL/m%05d_a0-pure.cl", shared_dir, (int) kern_type);
-        else if (attack_kern == ATTACK_KERN_COMBI)
-          snprintf (source_file, 255, "%s/OpenCL/m%05d_a1-pure.cl", shared_dir, (int) kern_type);
-        else if (attack_kern == ATTACK_KERN_BF)
-          snprintf (source_file, 255, "%s/OpenCL/m%05d_a3-pure.cl", shared_dir, (int) kern_type);
-        else if (attack_kern == ATTACK_KERN_NONE)
-          snprintf (source_file, 255, "%s/OpenCL/m%05d_a0-pure.cl", shared_dir, (int) kern_type);
-      }
-    }
-    else
-    {
-      snprintf (source_file, 255, "%s/OpenCL/m%05d-pure.cl", shared_dir, (int) kern_type);
-    }
-  }
-}
-
-void generate_cached_kernel_filename (const bool slow_candidates, const u32 attack_exec, const u32 attack_kern, const u32 kern_type, const u32 opti_type, char *profile_dir, const char *device_name_chksum, char *cached_file)
-{
-  if (opti_type & OPTI_TYPE_OPTIMIZED_KERNEL)
-  {
-    if (attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
-    {
-      if (slow_candidates == true)
-      {
-        snprintf (cached_file, 255, "%s/kernels/m%05d_a0-optimized.%s.kernel", profile_dir, (int) kern_type, device_name_chksum);
-      }
-      else
-      {
-        if (attack_kern == ATTACK_KERN_STRAIGHT)
-          snprintf (cached_file, 255, "%s/kernels/m%05d_a0-optimized.%s.kernel", profile_dir, (int) kern_type, device_name_chksum);
-        else if (attack_kern == ATTACK_KERN_COMBI)
-          snprintf (cached_file, 255, "%s/kernels/m%05d_a1-optimized.%s.kernel", profile_dir, (int) kern_type, device_name_chksum);
-        else if (attack_kern == ATTACK_KERN_BF)
-          snprintf (cached_file, 255, "%s/kernels/m%05d_a3-optimized.%s.kernel", profile_dir, (int) kern_type, device_name_chksum);
-        else if (attack_kern == ATTACK_KERN_NONE)
-          snprintf (cached_file, 255, "%s/kernels/m%05d_a0-optimized.%s.kernel", profile_dir, (int) kern_type, device_name_chksum);
-      }
-    }
-    else
-    {
-      snprintf (cached_file, 255, "%s/kernels/m%05d-optimized.%s.kernel", profile_dir, (int) kern_type, device_name_chksum);
-    }
-  }
-  else
-  {
-    if (attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
-    {
-      if (slow_candidates == true)
-      {
-        snprintf (cached_file, 255, "%s/kernels/m%05d_a0-pure.%s.kernel", profile_dir, (int) kern_type, device_name_chksum);
-      }
-      else
-      {
-        if (attack_kern == ATTACK_KERN_STRAIGHT)
-          snprintf (cached_file, 255, "%s/kernels/m%05d_a0-pure.%s.kernel", profile_dir, (int) kern_type, device_name_chksum);
-        else if (attack_kern == ATTACK_KERN_COMBI)
-          snprintf (cached_file, 255, "%s/kernels/m%05d_a1-pure.%s.kernel", profile_dir, (int) kern_type, device_name_chksum);
-        else if (attack_kern == ATTACK_KERN_BF)
-          snprintf (cached_file, 255, "%s/kernels/m%05d_a3-pure.%s.kernel", profile_dir, (int) kern_type, device_name_chksum);
-        else if (attack_kern == ATTACK_KERN_NONE)
-          snprintf (cached_file, 255, "%s/kernels/m%05d_a0-pure.%s.kernel", profile_dir, (int) kern_type, device_name_chksum);
-      }
-    }
-    else
-    {
-      snprintf (cached_file, 255, "%s/kernels/m%05d-pure.%s.kernel", profile_dir, (int) kern_type, device_name_chksum);
-    }
-  }
-}
-
-void generate_source_kernel_mp_filename (const u32 opti_type, const u64 opts_type, char *shared_dir, char *source_file)
-{
-  if ((opti_type & OPTI_TYPE_BRUTE_FORCE) && (opts_type & OPTS_TYPE_PT_GENERATE_BE))
-  {
-    snprintf (source_file, 255, "%s/OpenCL/markov_be.cl", shared_dir);
-  }
-  else
-  {
-    snprintf (source_file, 255, "%s/OpenCL/markov_le.cl", shared_dir);
-  }
-}
-
-void generate_cached_kernel_mp_filename (const u32 opti_type, const u64 opts_type, char *profile_dir, const char *device_name_chksum_amp_mp, char *cached_file)
-{
-  if ((opti_type & OPTI_TYPE_BRUTE_FORCE) && (opts_type & OPTS_TYPE_PT_GENERATE_BE))
-  {
-    snprintf (cached_file, 255, "%s/kernels/markov_be.%s.kernel", profile_dir, device_name_chksum_amp_mp);
-  }
-  else
-  {
-    snprintf (cached_file, 255, "%s/kernels/markov_le.%s.kernel", profile_dir, device_name_chksum_amp_mp);
-  }
-}
-
-void generate_source_kernel_amp_filename (const u32 attack_kern, char *shared_dir, char *source_file)
-{
-  snprintf (source_file, 255, "%s/OpenCL/amp_a%u.cl", shared_dir, attack_kern);
-}
-
-void generate_cached_kernel_amp_filename (const u32 attack_kern, char *profile_dir, const char *device_name_chksum_amp_mp, char *cached_file)
-{
-  snprintf (cached_file, 255, "%s/kernels/amp_a%u.%s.kernel", profile_dir, attack_kern, device_name_chksum_amp_mp);
-}
-
-int ocl_init (hashcat_ctx_t *hashcat_ctx)
-{
-  opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
-
-  OCL_PTR *ocl = opencl_ctx->ocl;
-
-  memset (ocl, 0, sizeof (OCL_PTR));
-
-  #if   defined (_WIN)
-  ocl->lib = hc_dlopen ("OpenCL");
-  #elif defined (__APPLE__)
-  ocl->lib = hc_dlopen ("/System/Library/Frameworks/OpenCL.framework/OpenCL");
-  #elif defined (__CYGWIN__)
-  ocl->lib = hc_dlopen ("opencl.dll");
-
-  if (ocl->lib == NULL) ocl->lib = hc_dlopen ("cygOpenCL-1.dll");
-  #else
-  ocl->lib = hc_dlopen ("libOpenCL.so");
-
-  if (ocl->lib == NULL) ocl->lib = hc_dlopen ("libOpenCL.so.1");
-  #endif
-
-  if (ocl->lib == NULL)
-  {
-    event_log_error (hashcat_ctx, "Cannot find an OpenCL ICD loader library.");
-
-    event_log_warning (hashcat_ctx, "You are probably missing the native OpenCL runtime or driver for your platform.");
-    event_log_warning (hashcat_ctx, NULL);
-
-    #if defined (__linux__)
-    event_log_warning (hashcat_ctx, "* AMD GPUs on Linux require this runtime and/or driver:");
-    event_log_warning (hashcat_ctx, "  \"RadeonOpenCompute (ROCm)\" Software Platform (1.6.180 or later)");
-    #elif defined (_WIN)
-    event_log_warning (hashcat_ctx, "* AMD GPUs on Windows require this runtime and/or driver:");
-    event_log_warning (hashcat_ctx, "  \"AMD Radeon Software Crimson Edition\" (15.12 or later)");
-    #endif
-
-    event_log_warning (hashcat_ctx, "* Intel CPUs require this runtime and/or driver:");
-    event_log_warning (hashcat_ctx, "  \"OpenCL Runtime for Intel Core and Intel Xeon Processors\" (16.1.1 or later)");
-
-    #if defined (__linux__)
-    event_log_warning (hashcat_ctx, "* Intel GPUs on Linux require this runtime and/or driver:");
-    event_log_warning (hashcat_ctx, "  \"OpenCL 2.0 GPU Driver Package for Linux\" (2.0 or later)");
-    #elif defined (_WIN)
-    event_log_warning (hashcat_ctx, "* Intel GPUs on Windows require this runtime and/or driver:");
-    event_log_warning (hashcat_ctx, "  \"OpenCL Driver for Intel Iris and Intel HD Graphics\"");
-    #endif
-
-    event_log_warning (hashcat_ctx, "* NVIDIA GPUs require this runtime and/or driver:");
-    event_log_warning (hashcat_ctx, "  \"NVIDIA Driver\" (418.56 or later)");
-    event_log_warning (hashcat_ctx, NULL);
-
-    return -1;
-  }
-
-  HC_LOAD_FUNC(ocl, clBuildProgram, OCL_CLBUILDPROGRAM, OpenCL, 1)
-  HC_LOAD_FUNC(ocl, clCreateBuffer, OCL_CLCREATEBUFFER, OpenCL, 1)
-  HC_LOAD_FUNC(ocl, clCreateCommandQueue, OCL_CLCREATECOMMANDQUEUE, OpenCL, 1)
-  HC_LOAD_FUNC(ocl, clCreateContext, OCL_CLCREATECONTEXT, OpenCL, 1)
-  HC_LOAD_FUNC(ocl, clCreateKernel, OCL_CLCREATEKERNEL, OpenCL, 1)
-  HC_LOAD_FUNC(ocl, clCreateProgramWithBinary, OCL_CLCREATEPROGRAMWITHBINARY, OpenCL, 1)
-  HC_LOAD_FUNC(ocl, clCreateProgramWithSource, OCL_CLCREATEPROGRAMWITHSOURCE, OpenCL, 1)
-  HC_LOAD_FUNC(ocl, clEnqueueCopyBuffer, OCL_CLENQUEUECOPYBUFFER, OpenCL, 1)
-  HC_LOAD_FUNC(ocl, clEnqueueMapBuffer, OCL_CLENQUEUEMAPBUFFER, OpenCL, 1)
-  HC_LOAD_FUNC(ocl, clEnqueueNDRangeKernel, OCL_CLENQUEUENDRANGEKERNEL, OpenCL, 1)
-  HC_LOAD_FUNC(ocl, clEnqueueReadBuffer, OCL_CLENQUEUEREADBUFFER, OpenCL, 1)
-  HC_LOAD_FUNC(ocl, clEnqueueUnmapMemObject, OCL_CLENQUEUEUNMAPMEMOBJECT, OpenCL, 1)
-  HC_LOAD_FUNC(ocl, clEnqueueWriteBuffer, OCL_CLENQUEUEWRITEBUFFER, OpenCL, 1)
-  HC_LOAD_FUNC(ocl, clFinish, OCL_CLFINISH, OpenCL, 1)
-  HC_LOAD_FUNC(ocl, clFlush, OCL_CLFLUSH, OpenCL, 1)
-  HC_LOAD_FUNC(ocl, clGetDeviceIDs, OCL_CLGETDEVICEIDS, OpenCL, 1)
-  HC_LOAD_FUNC(ocl, clGetDeviceInfo, OCL_CLGETDEVICEINFO, OpenCL, 1)
-  HC_LOAD_FUNC(ocl, clGetEventInfo, OCL_CLGETEVENTINFO, OpenCL, 1)
-  HC_LOAD_FUNC(ocl, clGetKernelWorkGroupInfo, OCL_CLGETKERNELWORKGROUPINFO, OpenCL, 1)
-  HC_LOAD_FUNC(ocl, clGetPlatformIDs, OCL_CLGETPLATFORMIDS, OpenCL, 1)
-  HC_LOAD_FUNC(ocl, clGetPlatformInfo, OCL_CLGETPLATFORMINFO, OpenCL, 1)
-  HC_LOAD_FUNC(ocl, clGetProgramBuildInfo, OCL_CLGETPROGRAMBUILDINFO, OpenCL, 1)
-  HC_LOAD_FUNC(ocl, clGetProgramInfo, OCL_CLGETPROGRAMINFO, OpenCL, 1)
-  HC_LOAD_FUNC(ocl, clReleaseCommandQueue, OCL_CLRELEASECOMMANDQUEUE, OpenCL, 1)
-  HC_LOAD_FUNC(ocl, clReleaseContext, OCL_CLRELEASECONTEXT, OpenCL, 1)
-  HC_LOAD_FUNC(ocl, clReleaseKernel, OCL_CLRELEASEKERNEL, OpenCL, 1)
-  HC_LOAD_FUNC(ocl, clReleaseMemObject, OCL_CLRELEASEMEMOBJECT, OpenCL, 1)
-  HC_LOAD_FUNC(ocl, clReleaseProgram, OCL_CLRELEASEPROGRAM, OpenCL, 1)
-  HC_LOAD_FUNC(ocl, clSetKernelArg, OCL_CLSETKERNELARG, OpenCL, 1)
-  HC_LOAD_FUNC(ocl, clWaitForEvents, OCL_CLWAITFOREVENTS, OpenCL, 1)
-  HC_LOAD_FUNC(ocl, clGetEventProfilingInfo, OCL_CLGETEVENTPROFILINGINFO, OpenCL, 1)
-  HC_LOAD_FUNC(ocl, clReleaseEvent, OCL_CLRELEASEEVENT, OpenCL, 1)
-
-  return 0;
-}
-
-void ocl_close (hashcat_ctx_t *hashcat_ctx)
-{
-  opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
-
-  OCL_PTR *ocl = opencl_ctx->ocl;
-
-  if (ocl)
-  {
-    if (ocl->lib)
-    {
-      hc_dlclose (ocl->lib);
-    }
-
-    hcfree (opencl_ctx->ocl);
-  }
-}
-
-int hc_clEnqueueNDRangeKernel (hashcat_ctx_t *hashcat_ctx, cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *global_work_offset, const size_t *global_work_size, const size_t *local_work_size, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event)
-{
-  opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
-
-  OCL_PTR *ocl = opencl_ctx->ocl;
-
-  const cl_int CL_err = ocl->clEnqueueNDRangeKernel (command_queue, kernel, work_dim, global_work_offset, global_work_size, local_work_size, num_events_in_wait_list, event_wait_list, event);
-
-  if (CL_err != CL_SUCCESS)
-  {
-    event_log_error (hashcat_ctx, "clEnqueueNDRangeKernel(): %s", val2cstr_cl (CL_err));
-
-    return -1;
-  }
-
-  return 0;
-}
-
-int hc_clGetEventInfo (hashcat_ctx_t *hashcat_ctx, cl_event event, cl_event_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
-{
-  opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
-
-  OCL_PTR *ocl = opencl_ctx->ocl;
-
-  const cl_int CL_err = ocl->clGetEventInfo (event, param_name, param_value_size, param_value, param_value_size_ret);
-
-  if (CL_err != CL_SUCCESS)
-  {
-    event_log_error (hashcat_ctx, "clGetEventInfo(): %s", val2cstr_cl (CL_err));
-
-    return -1;
-  }
-
-  return 0;
-}
-
-int hc_clFlush (hashcat_ctx_t *hashcat_ctx, cl_command_queue command_queue)
-{
-  opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
-
-  OCL_PTR *ocl = opencl_ctx->ocl;
-
-  const cl_int CL_err = ocl->clFlush (command_queue);
-
-  if (CL_err != CL_SUCCESS)
-  {
-    event_log_error (hashcat_ctx, "clFlush(): %s", val2cstr_cl (CL_err));
-
-    return -1;
-  }
-
-  return 0;
-}
-
-int hc_clFinish (hashcat_ctx_t *hashcat_ctx, cl_command_queue command_queue)
-{
-  opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
-
-  OCL_PTR *ocl = opencl_ctx->ocl;
-
-  const cl_int CL_err = ocl->clFinish (command_queue);
-
-  if (CL_err != CL_SUCCESS)
-  {
-    event_log_error (hashcat_ctx, "clFinish(): %s", val2cstr_cl (CL_err));
-
-    return -1;
-  }
-
-  return 0;
-}
-
-int hc_clSetKernelArg (hashcat_ctx_t *hashcat_ctx, cl_kernel kernel, cl_uint arg_index, size_t arg_size, const void *arg_value)
-{
-  opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
-
-  OCL_PTR *ocl = opencl_ctx->ocl;
-
-  const cl_int CL_err = ocl->clSetKernelArg (kernel, arg_index, arg_size, arg_value);
-
-  if (CL_err != CL_SUCCESS)
-  {
-    event_log_error (hashcat_ctx, "clSetKernelArg(): %s", val2cstr_cl (CL_err));
-
-    return -1;
-  }
-
-  return 0;
-}
-
-int hc_clEnqueueWriteBuffer (hashcat_ctx_t *hashcat_ctx, cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_write, size_t offset, size_t size, const void *ptr, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event)
-{
-  opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
-
-  OCL_PTR *ocl = opencl_ctx->ocl;
-
-  const cl_int CL_err = ocl->clEnqueueWriteBuffer (command_queue, buffer, blocking_write, offset, size, ptr, num_events_in_wait_list, event_wait_list, event);
-
-  if (CL_err != CL_SUCCESS)
-  {
-    event_log_error (hashcat_ctx, "clEnqueueWriteBuffer(): %s", val2cstr_cl (CL_err));
-
-    return -1;
-  }
-
-  return 0;
-}
-
-int hc_clEnqueueCopyBuffer (hashcat_ctx_t *hashcat_ctx, cl_command_queue command_queue, cl_mem src_buffer, cl_mem dst_buffer, size_t src_offset, size_t dst_offset, size_t size, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event)
-{
-  opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
-
-  OCL_PTR *ocl = opencl_ctx->ocl;
-
-  const cl_int CL_err = ocl->clEnqueueCopyBuffer (command_queue, src_buffer, dst_buffer, src_offset, dst_offset, size, num_events_in_wait_list, event_wait_list, event);
-
-  if (CL_err != CL_SUCCESS)
-  {
-    event_log_error (hashcat_ctx, "clEnqueueCopyBuffer(): %s", val2cstr_cl (CL_err));
-
-    return -1;
-  }
-
-  return 0;
-}
-
-int hc_clEnqueueReadBuffer (hashcat_ctx_t *hashcat_ctx, cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_read, size_t offset, size_t size, void *ptr, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event)
-{
-  opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
-
-  OCL_PTR *ocl = opencl_ctx->ocl;
-
-  const cl_int CL_err = ocl->clEnqueueReadBuffer (command_queue, buffer, blocking_read, offset, size, ptr, num_events_in_wait_list, event_wait_list, event);
-
-  if (CL_err != CL_SUCCESS)
-  {
-    event_log_error (hashcat_ctx, "clEnqueueReadBuffer(): %s", val2cstr_cl (CL_err));
-
-    return -1;
-  }
-
-  return 0;
-}
-
-int hc_clGetPlatformIDs (hashcat_ctx_t *hashcat_ctx, cl_uint num_entries, cl_platform_id *platforms, cl_uint *num_platforms)
-{
-  opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
-
-  OCL_PTR *ocl = opencl_ctx->ocl;
-
-  const cl_int CL_err = ocl->clGetPlatformIDs (num_entries, platforms, num_platforms);
-
-  if (CL_err != CL_SUCCESS)
-  {
-    event_log_error (hashcat_ctx, "clGetPlatformIDs(): %s", val2cstr_cl (CL_err));
-
-    return -1;
-  }
-
-  return 0;
-}
-
-int hc_clGetPlatformInfo (hashcat_ctx_t *hashcat_ctx, cl_platform_id platform, cl_platform_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
-{
-  opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
-
-  OCL_PTR *ocl = opencl_ctx->ocl;
-
-  const cl_int CL_err = ocl->clGetPlatformInfo (platform, param_name, param_value_size, param_value, param_value_size_ret);
-
-  if (CL_err != CL_SUCCESS)
-  {
-    event_log_error (hashcat_ctx, "clGetPlatformInfo(): %s", val2cstr_cl (CL_err));
-
-    return -1;
-  }
-
-  return 0;
-}
-
-int hc_clGetDeviceIDs (hashcat_ctx_t *hashcat_ctx, cl_platform_id platform, cl_device_type device_type, cl_uint num_entries, cl_device_id *devices, cl_uint *num_devices)
-{
-  opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
-
-  OCL_PTR *ocl = opencl_ctx->ocl;
-
-  const cl_int CL_err = ocl->clGetDeviceIDs (platform, device_type, num_entries, devices, num_devices);
-
-  if (CL_err != CL_SUCCESS)
-  {
-    event_log_error (hashcat_ctx, "clGetDeviceIDs(): %s", val2cstr_cl (CL_err));
-
-    return -1;
-  }
-
-  return 0;
-}
-
-int hc_clGetDeviceInfo (hashcat_ctx_t *hashcat_ctx, cl_device_id device, cl_device_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
-{
-  opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
-
-  OCL_PTR *ocl = opencl_ctx->ocl;
-
-  const cl_int CL_err = ocl->clGetDeviceInfo (device, param_name, param_value_size, param_value, param_value_size_ret);
-
-  if (CL_err != CL_SUCCESS)
-  {
-    event_log_error (hashcat_ctx, "clGetDeviceInfo(): %s", val2cstr_cl (CL_err));
-
-    return -1;
-  }
-
-  return 0;
-}
-
-int hc_clCreateContext (hashcat_ctx_t *hashcat_ctx, const cl_context_properties *properties, cl_uint num_devices, const cl_device_id *devices, void (CL_CALLBACK *pfn_notify) (const char *errinfo, const void *private_info, size_t cb, void *user_data), void *user_data, cl_context *context)
-{
-  opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
-
-  OCL_PTR *ocl = opencl_ctx->ocl;
-
-  cl_int CL_err;
-
-  *context = ocl->clCreateContext (properties, num_devices, devices, pfn_notify, user_data, &CL_err);
-
-  if (CL_err != CL_SUCCESS)
-  {
-    event_log_error (hashcat_ctx, "clCreateContext(): %s", val2cstr_cl (CL_err));
-
-    return -1;
-  }
-
-  return 0;
-}
-
-int hc_clCreateCommandQueue (hashcat_ctx_t *hashcat_ctx, cl_context context, cl_device_id device, cl_command_queue_properties properties, cl_command_queue *command_queue)
-{
-  opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
-
-  OCL_PTR *ocl = opencl_ctx->ocl;
-
-  cl_int CL_err;
-
-  *command_queue = ocl->clCreateCommandQueue (context, device, properties, &CL_err);
-
-  if (CL_err != CL_SUCCESS)
-  {
-    event_log_error (hashcat_ctx, "clCreateCommandQueue(): %s", val2cstr_cl (CL_err));
-
-    return -1;
-  }
-
-  return 0;
-}
-
-int hc_clCreateBuffer (hashcat_ctx_t *hashcat_ctx, cl_context context, cl_mem_flags flags, size_t size, void *host_ptr, cl_mem *mem)
-{
-  opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
-
-  OCL_PTR *ocl = opencl_ctx->ocl;
-
-  cl_int CL_err;
-
-  *mem = ocl->clCreateBuffer (context, flags, size, host_ptr, &CL_err);
-
-  if (CL_err != CL_SUCCESS)
-  {
-    event_log_error (hashcat_ctx, "clCreateBuffer(): %s", val2cstr_cl (CL_err));
-
-    return -1;
-  }
-
-  return 0;
-}
-
-int hc_clCreateProgramWithSource (hashcat_ctx_t *hashcat_ctx, cl_context context, cl_uint count, const char **strings, const size_t *lengths, cl_program *program)
-{
-  opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
-
-  OCL_PTR *ocl = opencl_ctx->ocl;
-
-  cl_int CL_err;
-
-  *program = ocl->clCreateProgramWithSource (context, count, strings, lengths, &CL_err);
-
-  if (CL_err != CL_SUCCESS)
-  {
-    event_log_error (hashcat_ctx, "clCreateProgramWithSource(): %s", val2cstr_cl (CL_err));
-
-    return -1;
-  }
-
-  return 0;
-}
-
-int hc_clCreateProgramWithBinary (hashcat_ctx_t *hashcat_ctx, cl_context context, cl_uint num_devices, const cl_device_id *device_list, const size_t *lengths, const unsigned char **binaries, cl_int *binary_status, cl_program *program)
-{
-  opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
-
-  OCL_PTR *ocl = opencl_ctx->ocl;
-
-  cl_int CL_err;
-
-  *program = ocl->clCreateProgramWithBinary (context, num_devices, device_list, lengths, (const unsigned char **) binaries, binary_status, &CL_err);
-
-  if (CL_err != CL_SUCCESS)
-  {
-    event_log_error (hashcat_ctx, "clCreateProgramWithBinary(): %s", val2cstr_cl (CL_err));
-
-    return -1;
-  }
-
-  return 0;
-}
-
-int hc_clBuildProgram (hashcat_ctx_t *hashcat_ctx, cl_program program, cl_uint num_devices, const cl_device_id *device_list, const char *options, void (CL_CALLBACK *pfn_notify) (cl_program program, void *user_data), void *user_data)
-{
-  opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
-
-  OCL_PTR *ocl = opencl_ctx->ocl;
-
-  const cl_int CL_err = ocl->clBuildProgram (program, num_devices, device_list, options, pfn_notify, user_data);
-
-  if (CL_err != CL_SUCCESS)
-  {
-    event_log_error (hashcat_ctx, "clBuildProgram(): %s", val2cstr_cl (CL_err));
-
-    return -1;
-  }
-
-  return 0;
-}
-
-int hc_clCreateKernel (hashcat_ctx_t *hashcat_ctx, cl_program program, const char *kernel_name, cl_kernel *kernel)
-{
-  opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
-
-  OCL_PTR *ocl = opencl_ctx->ocl;
-
-  cl_int CL_err;
-
-  *kernel = ocl->clCreateKernel (program, kernel_name, &CL_err);
-
-  if (CL_err != CL_SUCCESS)
-  {
-    event_log_error (hashcat_ctx, "clCreateKernel(): %s", val2cstr_cl (CL_err));
-
-    return -1;
-  }
-
-  return 0;
-}
-
-int hc_clReleaseMemObject (hashcat_ctx_t *hashcat_ctx, cl_mem mem)
-{
-  opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
-
-  OCL_PTR *ocl = opencl_ctx->ocl;
-
-  const cl_int CL_err = ocl->clReleaseMemObject (mem);
-
-  if (CL_err != CL_SUCCESS)
-  {
-    event_log_error (hashcat_ctx, "clReleaseMemObject(): %s", val2cstr_cl (CL_err));
-
-    return -1;
-  }
-
-  return 0;
-}
-
-int hc_clReleaseKernel (hashcat_ctx_t *hashcat_ctx, cl_kernel kernel)
-{
-  opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
-
-  OCL_PTR *ocl = opencl_ctx->ocl;
-
-  const cl_int CL_err = ocl->clReleaseKernel (kernel);
-
-  if (CL_err != CL_SUCCESS)
-  {
-    event_log_error (hashcat_ctx, "clReleaseKernel(): %s", val2cstr_cl (CL_err));
-
-    return -1;
-  }
-
-  return 0;
-}
-
-int hc_clReleaseProgram (hashcat_ctx_t *hashcat_ctx, cl_program program)
-{
-  opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
-
-  OCL_PTR *ocl = opencl_ctx->ocl;
-
-  const cl_int CL_err = ocl->clReleaseProgram (program);
-
-  if (CL_err != CL_SUCCESS)
-  {
-    event_log_error (hashcat_ctx, "clReleaseProgram(): %s", val2cstr_cl (CL_err));
-
-    return -1;
-  }
-
-  return 0;
-}
-
-int hc_clReleaseCommandQueue (hashcat_ctx_t *hashcat_ctx, cl_command_queue command_queue)
-{
-  opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
-
-  OCL_PTR *ocl = opencl_ctx->ocl;
-
-  const cl_int CL_err = ocl->clReleaseCommandQueue (command_queue);
-
-  if (CL_err != CL_SUCCESS)
-  {
-    event_log_error (hashcat_ctx, "clReleaseCommandQueue(): %s", val2cstr_cl (CL_err));
-
-    return -1;
-  }
-
-  return 0;
-}
-
-int hc_clReleaseContext (hashcat_ctx_t *hashcat_ctx, cl_context context)
-{
-  opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
-
-  OCL_PTR *ocl = opencl_ctx->ocl;
-
-  const cl_int CL_err = ocl->clReleaseContext (context);
-
-  if (CL_err != CL_SUCCESS)
-  {
-    event_log_error (hashcat_ctx, "clReleaseContext(): %s", val2cstr_cl (CL_err));
-
-    return -1;
-  }
-
-  return 0;
-}
-
-int hc_clEnqueueMapBuffer (hashcat_ctx_t *hashcat_ctx, cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_map, cl_map_flags map_flags, size_t offset, size_t size, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event, void **buf)
-{
-  opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
-
-  OCL_PTR *ocl = opencl_ctx->ocl;
-
-  cl_int CL_err;
-
-  *buf = ocl->clEnqueueMapBuffer (command_queue, buffer, blocking_map, map_flags, offset, size, num_events_in_wait_list, event_wait_list, event, &CL_err);
-
-  if (CL_err != CL_SUCCESS)
-  {
-    event_log_error (hashcat_ctx, "clEnqueueMapBuffer(): %s", val2cstr_cl (CL_err));
-
-    return -1;
-  }
-
-  return 0;
-}
-
-int hc_clEnqueueUnmapMemObject (hashcat_ctx_t *hashcat_ctx, cl_command_queue command_queue, cl_mem memobj, void *mapped_ptr, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event)
-{
-  opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
-
-  OCL_PTR *ocl = opencl_ctx->ocl;
-
-  const cl_int CL_err = ocl->clEnqueueUnmapMemObject (command_queue, memobj, mapped_ptr, num_events_in_wait_list, event_wait_list, event);
-
-  if (CL_err != CL_SUCCESS)
-  {
-    event_log_error (hashcat_ctx, "clEnqueueUnmapMemObject(): %s", val2cstr_cl (CL_err));
-
-    return -1;
-  }
-
-  return 0;
-}
-
-int hc_clGetKernelWorkGroupInfo (hashcat_ctx_t *hashcat_ctx, cl_kernel kernel, cl_device_id device, cl_kernel_work_group_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
-{
-  opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
-
-  OCL_PTR *ocl = opencl_ctx->ocl;
-
-  const cl_int CL_err = ocl->clGetKernelWorkGroupInfo (kernel, device, param_name, param_value_size, param_value, param_value_size_ret);
-
-  if (CL_err != CL_SUCCESS)
-  {
-    event_log_error (hashcat_ctx, "clGetKernelWorkGroupInfo(): %s", val2cstr_cl (CL_err));
-
-    return -1;
-  }
-
-  return 0;
-}
-
-int hc_clGetProgramBuildInfo (hashcat_ctx_t *hashcat_ctx, cl_program program, cl_device_id device, cl_program_build_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
-{
-  opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
-
-  OCL_PTR *ocl = opencl_ctx->ocl;
-
-  const cl_int CL_err = ocl->clGetProgramBuildInfo (program, device, param_name, param_value_size, param_value, param_value_size_ret);
-
-  if (CL_err != CL_SUCCESS)
-  {
-    event_log_error (hashcat_ctx, "clGetProgramBuildInfo(): %s", val2cstr_cl (CL_err));
-
-    return -1;
-  }
-
-  return 0;
-}
-
-int hc_clGetProgramInfo (hashcat_ctx_t *hashcat_ctx, cl_program program, cl_program_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
-{
-  opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
-
-  OCL_PTR *ocl = opencl_ctx->ocl;
-
-  const cl_int CL_err = ocl->clGetProgramInfo (program, param_name, param_value_size, param_value, param_value_size_ret);
-
-  if (CL_err != CL_SUCCESS)
-  {
-    event_log_error (hashcat_ctx, "clGetProgramInfo(): %s", val2cstr_cl (CL_err));
-
-    return -1;
-  }
-
-  return 0;
-}
-
-int hc_clWaitForEvents (hashcat_ctx_t *hashcat_ctx, cl_uint num_events, const cl_event *event_list)
-{
-  opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
-
-  OCL_PTR *ocl = opencl_ctx->ocl;
-
-  const cl_int CL_err = ocl->clWaitForEvents (num_events, event_list);
-
-  if (CL_err != CL_SUCCESS)
-  {
-    event_log_error (hashcat_ctx, "clWaitForEvents(): %s", val2cstr_cl (CL_err));
-
-    return -1;
-  }
-
-  return 0;
-}
-
-int hc_clGetEventProfilingInfo (hashcat_ctx_t *hashcat_ctx, cl_event event, cl_profiling_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
-{
-  opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
-
-  OCL_PTR *ocl = opencl_ctx->ocl;
-
-  const cl_int CL_err = ocl->clGetEventProfilingInfo (event, param_name, param_value_size, param_value, param_value_size_ret);
-
-  if (CL_err != CL_SUCCESS)
-  {
-    event_log_error (hashcat_ctx, "clGetEventProfilingInfo(): %s", val2cstr_cl (CL_err));
-
-    return -1;
-  }
-
-  return 0;
-}
-
-int hc_clReleaseEvent (hashcat_ctx_t *hashcat_ctx, cl_event event)
-{
-  opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
-
-  OCL_PTR *ocl = opencl_ctx->ocl;
-
-  const cl_int CL_err = ocl->clReleaseEvent (event);
-
-  if (CL_err != CL_SUCCESS)
-  {
-    event_log_error (hashcat_ctx, "clReleaseEvent(): %s", val2cstr_cl (CL_err));
-
-    return -1;
-  }
-
-  return 0;
-}
-
-int gidd_to_pw_t (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u64 gidd, pw_t *pw)
-{
-  pw_idx_t pw_idx;
-
-  int CL_rc;
-
-  CL_rc = hc_clEnqueueReadBuffer (hashcat_ctx, device_param->command_queue, device_param->d_pws_idx, CL_TRUE, gidd * sizeof (pw_idx_t), sizeof (pw_idx_t), &pw_idx, 0, NULL, NULL);
-
-  if (CL_rc == -1) return -1;
-
-  const u32 off = pw_idx.off;
-  const u32 cnt = pw_idx.cnt;
-  const u32 len = pw_idx.len;
-
-  if (cnt > 0)
-  {
-    CL_rc = hc_clEnqueueReadBuffer (hashcat_ctx, device_param->command_queue, device_param->d_pws_comp_buf, CL_TRUE, off * sizeof (u32), cnt * sizeof (u32), pw->i, 0, NULL, NULL);
-
-    if (CL_rc == -1) return -1;
-  }
-
-  for (u32 i = cnt; i < 64; i++)
-  {
-    pw->i[i] = 0;
-  }
-
-  pw->pw_len = len;
-
-  return 0;
-}
-
-int choose_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u32 highest_pw_len, const u64 pws_cnt, const u32 fast_iteration, const u32 salt_pos)
-{
-  hashconfig_t   *hashconfig   = hashcat_ctx->hashconfig;
-  hashes_t       *hashes       = hashcat_ctx->hashes;
-  module_ctx_t   *module_ctx   = hashcat_ctx->module_ctx;
-  status_ctx_t   *status_ctx   = hashcat_ctx->status_ctx;
-  user_options_t *user_options = hashcat_ctx->user_options;
-
-  if (user_options->stdout_flag == true)
-  {
-    return process_stdout (hashcat_ctx, device_param, pws_cnt);
-  }
-
-  int CL_rc;
-
-  if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
-  {
-    if (user_options->attack_mode == ATTACK_MODE_BF)
-    {
-      if (user_options->slow_candidates == true)
-      {
-      }
-      else
-      {
-        if (hashconfig->opts_type & OPTS_TYPE_PT_BITSLICE)
-        {
-          const u32 size_tm = 32 * sizeof (bs_word_t);
-
-          CL_rc = run_kernel_bzero (hashcat_ctx, device_param, device_param->d_tm_c, size_tm);
-
-          if (CL_rc == -1) return -1;
-
-          CL_rc = run_kernel_tm (hashcat_ctx, device_param);
-
-          if (CL_rc == -1) return -1;
-
-          CL_rc = hc_clEnqueueCopyBuffer (hashcat_ctx, device_param->command_queue, device_param->d_tm_c, device_param->d_bfs_c, 0, 0, size_tm, 0, NULL, NULL);
-
-          if (CL_rc == -1) return -1;
-        }
-      }
-    }
-
-    if (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL)
-    {
-      if (highest_pw_len < 16)
-      {
-        CL_rc = run_kernel (hashcat_ctx, device_param, KERN_RUN_1, pws_cnt, true, fast_iteration);
-
-        if (CL_rc == -1) return -1;
-      }
-      else if (highest_pw_len < 32)
-      {
-        CL_rc = run_kernel (hashcat_ctx, device_param, KERN_RUN_2, pws_cnt, true, fast_iteration);
-
-        if (CL_rc == -1) return -1;
-      }
-      else
-      {
-        CL_rc = run_kernel (hashcat_ctx, device_param, KERN_RUN_3, pws_cnt, true, fast_iteration);
-
-        if (CL_rc == -1) return -1;
-      }
-    }
-    else
-    {
-      CL_rc = run_kernel (hashcat_ctx, device_param, KERN_RUN_4, pws_cnt, true, fast_iteration);
-
-      if (CL_rc == -1) return -1;
-    }
-  }
-  else
-  {
-    bool run_init = true;
-    bool run_loop = true;
-    bool run_comp = true;
-
-    if (run_init == true)
-    {
-      CL_rc = hc_clEnqueueCopyBuffer (hashcat_ctx, device_param->command_queue, device_param->d_pws_amp_buf, device_param->d_pws_buf, 0, 0, pws_cnt * sizeof (pw_t), 0, NULL, NULL);
-
-      if (CL_rc == -1) return -1;
-
-      if (user_options->slow_candidates == true)
-      {
-      }
-      else
-      {
-        CL_rc = run_kernel_amp (hashcat_ctx, device_param, pws_cnt);
-
-        if (CL_rc == -1) return -1;
-      }
-
-      CL_rc = run_kernel (hashcat_ctx, device_param, KERN_RUN_1, pws_cnt, false, 0);
-
-      if (CL_rc == -1) return -1;
-
-      if (hashconfig->opts_type & OPTS_TYPE_HOOK12)
-      {
-        CL_rc = run_kernel (hashcat_ctx, device_param, KERN_RUN_12, pws_cnt, false, 0);
-
-        if (CL_rc == -1) return -1;
-
-        CL_rc = hc_clEnqueueReadBuffer (hashcat_ctx, device_param->command_queue, device_param->d_hooks, CL_TRUE, 0, device_param->size_hooks, device_param->hooks_buf, 0, NULL, NULL);
-
-        if (CL_rc == -1) return -1;
-
-        module_ctx->module_hook12 (device_param, hashes->hook_salts_buf, salt_pos, pws_cnt);
-
-        CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->command_queue, device_param->d_hooks, CL_TRUE, 0, device_param->size_hooks, device_param->hooks_buf, 0, NULL, NULL);
-
-        if (CL_rc == -1) return -1;
-      }
-    }
-
-    if (run_loop == true)
-    {
-      u32 iter = hashes->salts_buf[salt_pos].salt_iter;
-
-      u32 loop_step = device_param->kernel_loops;
-
-      for (u32 loop_pos = 0, slow_iteration = 0; loop_pos < iter; loop_pos += loop_step, slow_iteration++)
-      {
-        u32 loop_left = iter - loop_pos;
-
-        loop_left = MIN (loop_left, loop_step);
-
-        device_param->kernel_params_buf32[28] = loop_pos;
-        device_param->kernel_params_buf32[29] = loop_left;
-
-        CL_rc = run_kernel (hashcat_ctx, device_param, KERN_RUN_2, pws_cnt, true, slow_iteration);
-
-        if (CL_rc == -1) return -1;
-
-        //bug?
-        //while (status_ctx->run_thread_level2 == false) break;
-        if (status_ctx->run_thread_level2 == false) break;
-
-        /**
-         * speed
-         */
-
-        const float iter_part = (float) (loop_pos + loop_left) / iter;
-
-        const u64 perf_sum_all = (u64) (pws_cnt * iter_part);
-
-        double speed_msec = hc_timer_get (device_param->timer_speed);
-
-        const u32 speed_pos = device_param->speed_pos;
-
-        device_param->speed_cnt[speed_pos] = perf_sum_all;
-
-        device_param->speed_msec[speed_pos] = speed_msec;
-
-        if (user_options->speed_only == true)
-        {
-          if (speed_msec > 4000)
-          {
-            device_param->outerloop_multi *= (double) iter / (double) (loop_pos + loop_left);
-
-            device_param->speed_pos = 1;
-
-            device_param->speed_only_finish = true;
-
-            return 0;
-          }
-        }
-      }
-
-      if (hashconfig->opts_type & OPTS_TYPE_HOOK23)
-      {
-        CL_rc = run_kernel (hashcat_ctx, device_param, KERN_RUN_23, pws_cnt, false, 0);
-
-        if (CL_rc == -1) return -1;
-
-        CL_rc = hc_clEnqueueReadBuffer (hashcat_ctx, device_param->command_queue, device_param->d_hooks, CL_TRUE, 0, device_param->size_hooks, device_param->hooks_buf, 0, NULL, NULL);
-
-        if (CL_rc == -1) return -1;
-
-        module_ctx->module_hook23 (device_param, hashes->hook_salts_buf, salt_pos, pws_cnt);
-
-        CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->command_queue, device_param->d_hooks, CL_TRUE, 0, device_param->size_hooks, device_param->hooks_buf, 0, NULL, NULL);
-
-        if (CL_rc == -1) return -1;
-      }
-    }
-
-    // init2 and loop2 are kind of special, we use run_loop for them, too
-
-    if (run_loop == true)
-    {
-      // note: they also do not influence the performance screen
-      // in case you want to use this, this cane make sense only if your input data comes out of tmps[]
-
-      if (hashconfig->opts_type & OPTS_TYPE_INIT2)
-      {
-        CL_rc = run_kernel (hashcat_ctx, device_param, KERN_RUN_INIT2, pws_cnt, false, 0);
-
-        if (CL_rc == -1) return -1;
-      }
-
-      if (hashconfig->opts_type & OPTS_TYPE_LOOP2)
-      {
-        u32 iter = hashes->salts_buf[salt_pos].salt_iter2;
-
-        u32 loop_step = device_param->kernel_loops;
-
-        for (u32 loop_pos = 0, slow_iteration = 0; loop_pos < iter; loop_pos += loop_step, slow_iteration++)
-        {
-          u32 loop_left = iter - loop_pos;
-
-          loop_left = MIN (loop_left, loop_step);
-
-          device_param->kernel_params_buf32[28] = loop_pos;
-          device_param->kernel_params_buf32[29] = loop_left;
-
-          CL_rc = run_kernel (hashcat_ctx, device_param, KERN_RUN_LOOP2, pws_cnt, true, slow_iteration);
-
-          if (CL_rc == -1) return -1;
-
-          //bug?
-          //while (status_ctx->run_thread_level2 == false) break;
-          if (status_ctx->run_thread_level2 == false) break;
-        }
-      }
-    }
-
-    if (run_comp == true)
-    {
-      if (hashconfig->opts_type & OPTS_TYPE_DEEP_COMP_KERNEL)
-      {
-        const u32 loops_cnt = hashes->salts_buf[salt_pos].digests_cnt;
-
-        for (u32 loops_pos = 0; loops_pos < loops_cnt; loops_pos++)
-        {
-          device_param->kernel_params_buf32[28] = loops_pos;
-          device_param->kernel_params_buf32[29] = loops_cnt;
-
-          const u32 deep_comp_kernel = module_ctx->module_deep_comp_kernel (hashes, salt_pos, loops_pos);
-
-          CL_rc = run_kernel (hashcat_ctx, device_param, deep_comp_kernel, pws_cnt, false, 0);
-
-          if (CL_rc == -1) return -1;
-
-          if (status_ctx->run_thread_level2 == false) break;
-        }
-      }
-      else
-      {
-        CL_rc = run_kernel (hashcat_ctx, device_param, KERN_RUN_3, pws_cnt, false, 0);
-
-        if (CL_rc == -1) return -1;
-      }
-    }
-  }
-
-  return 0;
-}
-
-void rebuild_pws_compressed_append (hc_device_param_t *device_param, const u64 pws_cnt, const u8 chr)
-{
-  // this function is used if we have to modify the compressed pws buffer in order to
-  // append some data to each password candidate
-
-  u32      *tmp_pws_comp = (u32 *)      hcmalloc (device_param->size_pws_comp);
-  pw_idx_t *tmp_pws_idx  = (pw_idx_t *) hcmalloc (device_param->size_pws_idx);
-
-  for (u32 i = 0; i < pws_cnt; i++)
-  {
-    pw_idx_t *pw_idx_src = device_param->pws_idx + i;
-    pw_idx_t *pw_idx_dst = tmp_pws_idx + i;
-
-    const u32 src_off = pw_idx_src->off;
-    const u32 src_len = pw_idx_src->len;
-
-    u8 buf[256];
-
-    memcpy (buf, device_param->pws_comp + src_off, src_len);
-
-    buf[src_len] = chr;
-
-    const u32 dst_len = src_len + 1;
-
-    const u32 dst_pw_len4 = (dst_len + 3) & ~3; // round up to multiple of 4
-
-    const u32 dst_pw_len4_cnt = dst_pw_len4 / 4;
-
-    pw_idx_dst->cnt = dst_pw_len4_cnt;
-    pw_idx_dst->len = src_len; // this is intenionally! src_len can not be dst_len, we dont want the kernel to think 0x80 is part of the password
-
-    u8 *dst = (u8 *) (tmp_pws_comp + pw_idx_dst->off);
-
-    memcpy (dst, buf, dst_len);
-
-    memset (dst + dst_len, 0, dst_pw_len4 - dst_len);
-
-    // prepare next element
-
-    pw_idx_t *pw_idx_dst_next = pw_idx_dst + 1;
-
-    pw_idx_dst_next->off = pw_idx_dst->off + pw_idx_dst->cnt;
-  }
-
-  memcpy (device_param->pws_comp, tmp_pws_comp, device_param->size_pws_comp);
-  memcpy (device_param->pws_idx,  tmp_pws_idx,  device_param->size_pws_idx);
-
-  hcfree (tmp_pws_comp);
-  hcfree (tmp_pws_idx);
-}
-
-int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u32 kern_run, const u64 num, const u32 event_update, const u32 iteration)
-{
-  const hashconfig_t   *hashconfig   = hashcat_ctx->hashconfig;
-  const status_ctx_t   *status_ctx   = hashcat_ctx->status_ctx;
-  const user_options_t *user_options = hashcat_ctx->user_options;
-
-  u64 num_elements = num;
-
-  device_param->kernel_params_buf64[34] = num;
-
-  u64       kernel_threads = 0;
-  cl_kernel kernel = NULL;
-
-  switch (kern_run)
-  {
-    case KERN_RUN_1:
-      kernel          = device_param->kernel1;
-      kernel_threads  = device_param->kernel_wgs1;
-      break;
-    case KERN_RUN_12:
-      kernel          = device_param->kernel12;
-      kernel_threads  = device_param->kernel_wgs12;
-      break;
-    case KERN_RUN_2:
-      kernel          = device_param->kernel2;
-      kernel_threads  = device_param->kernel_wgs2;
-      break;
-    case KERN_RUN_23:
-      kernel          = device_param->kernel23;
-      kernel_threads  = device_param->kernel_wgs23;
-      break;
-    case KERN_RUN_3:
-      kernel          = device_param->kernel3;
-      kernel_threads  = device_param->kernel_wgs3;
-      break;
-    case KERN_RUN_4:
-      kernel          = device_param->kernel4;
-      kernel_threads  = device_param->kernel_wgs4;
-      break;
-    case KERN_RUN_INIT2:
-      kernel          = device_param->kernel_init2;
-      kernel_threads  = device_param->kernel_wgs_init2;
-      break;
-    case KERN_RUN_LOOP2:
-      kernel          = device_param->kernel_loop2;
-      kernel_threads  = device_param->kernel_wgs_loop2;
-      break;
-    case KERN_RUN_AUX1:
-      kernel          = device_param->kernel_aux1;
-      kernel_threads  = device_param->kernel_wgs_aux1;
-      break;
-    case KERN_RUN_AUX2:
-      kernel          = device_param->kernel_aux2;
-      kernel_threads  = device_param->kernel_wgs_aux2;
-      break;
-    case KERN_RUN_AUX3:
-      kernel          = device_param->kernel_aux3;
-      kernel_threads  = device_param->kernel_wgs_aux3;
-      break;
-    case KERN_RUN_AUX4:
-      kernel          = device_param->kernel_aux4;
-      kernel_threads  = device_param->kernel_wgs_aux4;
-      break;
-    default:
-      event_log_error (hashcat_ctx, "Invalid kernel specified.");
-      return -1;
-  }
-
-  kernel_threads = MIN (kernel_threads, device_param->kernel_threads);
-
-  // kernel_threads = power_of_two_floor_32 (kernel_threads);
-
-  num_elements = round_up_multiple_64 (num_elements, kernel_threads);
-
-  int CL_rc;
-
-  for (u32 i = 0; i <= 23; i++)
-  {
-    CL_rc = hc_clSetKernelArg (hashcat_ctx, kernel, i, sizeof (cl_mem), device_param->kernel_params[i]);
-
-    if (CL_rc == -1) return -1;
-  }
-
-  for (u32 i = 24; i <= 33; i++)
-  {
-    CL_rc = hc_clSetKernelArg (hashcat_ctx, kernel, i, sizeof (cl_uint), device_param->kernel_params[i]);
-
-    if (CL_rc == -1) return -1;
-  }
-
-  for (u32 i = 34; i <= 34; i++)
-  {
-    CL_rc = hc_clSetKernelArg (hashcat_ctx, kernel, i, sizeof (cl_ulong), device_param->kernel_params[i]);
-
-    if (CL_rc == -1) return -1;
-  }
-
-  cl_event event;
-
-  if ((hashconfig->opts_type & OPTS_TYPE_PT_BITSLICE) && (user_options->attack_mode == ATTACK_MODE_BF))
-  {
-    const size_t global_work_size[3] = { num_elements,  32, 1 };
-    const size_t local_work_size[3]  = { kernel_threads, 1, 1 };
-
-    CL_rc = hc_clEnqueueNDRangeKernel (hashcat_ctx, device_param->command_queue, kernel, 2, NULL, global_work_size, local_work_size, 0, NULL, &event);
-
-    if (CL_rc == -1) return -1;
-  }
-  else
-  {
-    if (kern_run == KERN_RUN_1)
-    {
-      if (hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_SIMD_INIT)
-      {
-        num_elements = CEILDIV (num_elements, device_param->vector_width);
-      }
-    }
-    else if (kern_run == KERN_RUN_2)
-    {
-      if (hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_SIMD_LOOP)
-      {
-        num_elements = CEILDIV (num_elements, device_param->vector_width);
-      }
-    }
-    else if (kern_run == KERN_RUN_3)
-    {
-      if (hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_SIMD_COMP)
-      {
-        num_elements = CEILDIV (num_elements, device_param->vector_width);
-      }
-    }
-
-    num_elements = round_up_multiple_64 (num_elements, kernel_threads);
-
-    const size_t global_work_size[3] = { num_elements,   1, 1 };
-    const size_t local_work_size[3]  = { kernel_threads, 1, 1 };
-
-    CL_rc = hc_clEnqueueNDRangeKernel (hashcat_ctx, device_param->command_queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, &event);
-
-    if (CL_rc == -1) return -1;
-  }
-
-  CL_rc = hc_clFlush (hashcat_ctx, device_param->command_queue);
-
-  if (CL_rc == -1) return -1;
-
-  // spin damper section
-
-  const u32 iterationm = iteration % EXPECTED_ITERATIONS;
-
-  cl_int event_status;
-
-  size_t param_value_size_ret;
-
-  CL_rc = hc_clGetEventInfo (hashcat_ctx, event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof (event_status), &event_status, &param_value_size_ret);
-
-  if (CL_rc == -1) return -1;
-
-  if (device_param->spin_damp > 0)
-  {
-    double spin_total = device_param->spin_damp;
-
-    while (event_status != CL_COMPLETE)
-    {
-      if (status_ctx->devices_status == STATUS_RUNNING)
-      {
-        switch (kern_run)
-        {
-          case KERN_RUN_1:      if (device_param->exec_us_prev1[iterationm]      > 0) usleep ((useconds_t) (device_param->exec_us_prev1[iterationm]      * device_param->spin_damp)); break;
-          case KERN_RUN_2:      if (device_param->exec_us_prev2[iterationm]      > 0) usleep ((useconds_t) (device_param->exec_us_prev2[iterationm]      * device_param->spin_damp)); break;
-          case KERN_RUN_3:      if (device_param->exec_us_prev3[iterationm]      > 0) usleep ((useconds_t) (device_param->exec_us_prev3[iterationm]      * device_param->spin_damp)); break;
-          case KERN_RUN_4:      if (device_param->exec_us_prev4[iterationm]      > 0) usleep ((useconds_t) (device_param->exec_us_prev4[iterationm]      * device_param->spin_damp)); break;
-          case KERN_RUN_INIT2:  if (device_param->exec_us_prev_init2[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_init2[iterationm] * device_param->spin_damp)); break;
-          case KERN_RUN_LOOP2:  if (device_param->exec_us_prev_loop2[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_loop2[iterationm] * device_param->spin_damp)); break;
-          case KERN_RUN_AUX1:   if (device_param->exec_us_prev_aux1[iterationm]  > 0) usleep ((useconds_t) (device_param->exec_us_prev_aux1[iterationm]  * device_param->spin_damp)); break;
-          case KERN_RUN_AUX2:   if (device_param->exec_us_prev_aux2[iterationm]  > 0) usleep ((useconds_t) (device_param->exec_us_prev_aux2[iterationm]  * device_param->spin_damp)); break;
-          case KERN_RUN_AUX3:   if (device_param->exec_us_prev_aux3[iterationm]  > 0) usleep ((useconds_t) (device_param->exec_us_prev_aux3[iterationm]  * device_param->spin_damp)); break;
-          case KERN_RUN_AUX4:   if (device_param->exec_us_prev_aux4[iterationm]  > 0) usleep ((useconds_t) (device_param->exec_us_prev_aux4[iterationm]  * device_param->spin_damp)); break;
-        }
-      }
-      else
-      {
-        // we were told to be nice
-
-        sleep (0);
-      }
-
-      CL_rc = hc_clGetEventInfo (hashcat_ctx, event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof (event_status), &event_status, &param_value_size_ret);
-
-      if (CL_rc == -1) return -1;
-
-      spin_total += device_param->spin_damp;
-
-      if (spin_total > 1) break;
-    }
-  }
-
-  CL_rc = hc_clWaitForEvents (hashcat_ctx, 1, &event);
-
-  if (CL_rc == -1) return -1;
-
-  cl_ulong time_start;
-  cl_ulong time_end;
-
-  CL_rc = hc_clGetEventProfilingInfo (hashcat_ctx, event, CL_PROFILING_COMMAND_START, sizeof (time_start), &time_start, NULL); if (CL_rc == -1) return -1;
-  CL_rc = hc_clGetEventProfilingInfo (hashcat_ctx, event, CL_PROFILING_COMMAND_END,   sizeof (time_end),   &time_end,   NULL); if (CL_rc == -1) return -1;
-
-  const double exec_us = (double) (time_end - time_start) / 1000;
-
-  if (device_param->spin_damp > 0)
-  {
-    if (status_ctx->devices_status == STATUS_RUNNING)
-    {
-      switch (kern_run)
-      {
-        case KERN_RUN_1:      device_param->exec_us_prev1[iterationm]      = exec_us; break;
-        case KERN_RUN_2:      device_param->exec_us_prev2[iterationm]      = exec_us; break;
-        case KERN_RUN_3:      device_param->exec_us_prev3[iterationm]      = exec_us; break;
-        case KERN_RUN_4:      device_param->exec_us_prev4[iterationm]      = exec_us; break;
-        case KERN_RUN_INIT2:  device_param->exec_us_prev_init2[iterationm] = exec_us; break;
-        case KERN_RUN_LOOP2:  device_param->exec_us_prev_loop2[iterationm] = exec_us; break;
-        case KERN_RUN_AUX1:   device_param->exec_us_prev_aux1[iterationm]  = exec_us; break;
-        case KERN_RUN_AUX2:   device_param->exec_us_prev_aux2[iterationm]  = exec_us; break;
-        case KERN_RUN_AUX3:   device_param->exec_us_prev_aux3[iterationm]  = exec_us; break;
-        case KERN_RUN_AUX4:   device_param->exec_us_prev_aux4[iterationm]  = exec_us; break;
-      }
-    }
-  }
-
-  if (event_update)
-  {
-    u32 exec_pos = device_param->exec_pos;
-
-    device_param->exec_msec[exec_pos] = exec_us / 1000;
-
-    exec_pos++;
-
-    if (exec_pos == EXEC_CACHE)
-    {
-      exec_pos = 0;
-    }
-
-    device_param->exec_pos = exec_pos;
-  }
-
-  CL_rc = hc_clReleaseEvent (hashcat_ctx, event);
-
-  if (CL_rc == -1) return -1;
-
-  CL_rc = hc_clFinish (hashcat_ctx, device_param->command_queue);
-
-  if (CL_rc == -1) return -1;
-
-  return 0;
-}
-
-int run_kernel_mp (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u32 kern_run, const u64 num)
-{
-  u64 num_elements = num;
-
-  switch (kern_run)
-  {
-    case KERN_RUN_MP:   device_param->kernel_params_mp_buf64[8]   = num; break;
-    case KERN_RUN_MP_R: device_param->kernel_params_mp_r_buf64[8] = num; break;
-    case KERN_RUN_MP_L: device_param->kernel_params_mp_l_buf64[9] = num; break;
-  }
-
-  u64       kernel_threads = 0;
-  cl_kernel kernel = NULL;
-
-  switch (kern_run)
-  {
-    case KERN_RUN_MP:
-      kernel          = device_param->kernel_mp;
-      kernel_threads  = device_param->kernel_wgs_mp;
-      break;
-    case KERN_RUN_MP_R:
-      kernel          = device_param->kernel_mp_r;
-      kernel_threads  = device_param->kernel_wgs_mp_r;
-      break;
-    case KERN_RUN_MP_L:
-      kernel          = device_param->kernel_mp_l;
-      kernel_threads  = device_param->kernel_wgs_mp_l;
-      break;
-    default:
-      event_log_error (hashcat_ctx, "Invalid kernel specified.");
-      return -1;
-  }
-
-  num_elements = round_up_multiple_64 (num_elements, kernel_threads);
-
-  int CL_rc;
-
-  switch (kern_run)
-  {
-    case KERN_RUN_MP:   CL_rc = hc_clSetKernelArg (hashcat_ctx, kernel, 3, sizeof (cl_ulong), device_param->kernel_params_mp[3]);   if (CL_rc == -1) return -1;
-                        CL_rc = hc_clSetKernelArg (hashcat_ctx, kernel, 4, sizeof (cl_uint),  device_param->kernel_params_mp[4]);   if (CL_rc == -1) return -1;
-                        CL_rc = hc_clSetKernelArg (hashcat_ctx, kernel, 5, sizeof (cl_uint),  device_param->kernel_params_mp[5]);   if (CL_rc == -1) return -1;
-                        CL_rc = hc_clSetKernelArg (hashcat_ctx, kernel, 6, sizeof (cl_uint),  device_param->kernel_params_mp[6]);   if (CL_rc == -1) return -1;
-                        CL_rc = hc_clSetKernelArg (hashcat_ctx, kernel, 7, sizeof (cl_uint),  device_param->kernel_params_mp[7]);   if (CL_rc == -1) return -1;
-                        CL_rc = hc_clSetKernelArg (hashcat_ctx, kernel, 8, sizeof (cl_ulong), device_param->kernel_params_mp[8]);   if (CL_rc == -1) return -1;
-                        break;
-    case KERN_RUN_MP_R: CL_rc = hc_clSetKernelArg (hashcat_ctx, kernel, 3, sizeof (cl_ulong), device_param->kernel_params_mp_r[3]); if (CL_rc == -1) return -1;
-                        CL_rc = hc_clSetKernelArg (hashcat_ctx, kernel, 4, sizeof (cl_uint),  device_param->kernel_params_mp_r[4]); if (CL_rc == -1) return -1;
-                        CL_rc = hc_clSetKernelArg (hashcat_ctx, kernel, 5, sizeof (cl_uint),  device_param->kernel_params_mp_r[5]); if (CL_rc == -1) return -1;
-                        CL_rc = hc_clSetKernelArg (hashcat_ctx, kernel, 6, sizeof (cl_uint),  device_param->kernel_params_mp_r[6]); if (CL_rc == -1) return -1;
-                        CL_rc = hc_clSetKernelArg (hashcat_ctx, kernel, 7, sizeof (cl_uint),  device_param->kernel_params_mp_r[7]); if (CL_rc == -1) return -1;
-                        CL_rc = hc_clSetKernelArg (hashcat_ctx, kernel, 8, sizeof (cl_ulong), device_param->kernel_params_mp_r[8]); if (CL_rc == -1) return -1;
-                        break;
-    case KERN_RUN_MP_L: CL_rc = hc_clSetKernelArg (hashcat_ctx, kernel, 3, sizeof (cl_ulong), device_param->kernel_params_mp_l[3]); if (CL_rc == -1) return -1;
-                        CL_rc = hc_clSetKernelArg (hashcat_ctx, kernel, 4, sizeof (cl_uint),  device_param->kernel_params_mp_l[4]); if (CL_rc == -1) return -1;
-                        CL_rc = hc_clSetKernelArg (hashcat_ctx, kernel, 5, sizeof (cl_uint),  device_param->kernel_params_mp_l[5]); if (CL_rc == -1) return -1;
-                        CL_rc = hc_clSetKernelArg (hashcat_ctx, kernel, 6, sizeof (cl_uint),  device_param->kernel_params_mp_l[6]); if (CL_rc == -1) return -1;
-                        CL_rc = hc_clSetKernelArg (hashcat_ctx, kernel, 7, sizeof (cl_uint),  device_param->kernel_params_mp_l[7]); if (CL_rc == -1) return -1;
-                        CL_rc = hc_clSetKernelArg (hashcat_ctx, kernel, 8, sizeof (cl_uint),  device_param->kernel_params_mp_l[8]); if (CL_rc == -1) return -1;
-                        CL_rc = hc_clSetKernelArg (hashcat_ctx, kernel, 9, sizeof (cl_ulong), device_param->kernel_params_mp_l[9]); if (CL_rc == -1) return -1;
-                        break;
-  }
-
-  const size_t global_work_size[3] = { num_elements,   1, 1 };
-  const size_t local_work_size[3]  = { kernel_threads, 1, 1 };
-
-  CL_rc = hc_clEnqueueNDRangeKernel (hashcat_ctx, device_param->command_queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL);
-
-  if (CL_rc == -1) return -1;
-
-  CL_rc = hc_clFlush (hashcat_ctx, device_param->command_queue);
-
-  if (CL_rc == -1) return -1;
-
-  CL_rc = hc_clFinish (hashcat_ctx, device_param->command_queue);
-
-  if (CL_rc == -1) return -1;
-
-  return 0;
-}
-
-int run_kernel_tm (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param)
-{
-  const u64 num_elements = 1024; // fixed
-
-  const u64 kernel_threads = MIN (num_elements, device_param->kernel_wgs_tm);
-
-  cl_kernel kernel = device_param->kernel_tm;
-
-  const size_t global_work_size[3] = { num_elements,    1, 1 };
-  const size_t local_work_size[3]  = { kernel_threads,  1, 1 };
-
-  int CL_rc;
-
-  CL_rc = hc_clEnqueueNDRangeKernel (hashcat_ctx, device_param->command_queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL);
-
-  if (CL_rc == -1) return -1;
-
-  CL_rc = hc_clFlush (hashcat_ctx, device_param->command_queue);
-
-  if (CL_rc == -1) return -1;
-
-  CL_rc = hc_clFinish (hashcat_ctx, device_param->command_queue);
-
-  if (CL_rc == -1) return -1;
-
-  return 0;
-}
-
-int run_kernel_amp (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u64 num)
-{
-  u64 num_elements = num;
-
-  device_param->kernel_params_amp_buf64[6] = num_elements;
-
-  const u64 kernel_threads = device_param->kernel_wgs_amp;
-
-  num_elements = round_up_multiple_64 (num_elements, kernel_threads);
-
-  cl_kernel kernel = device_param->kernel_amp;
-
-  int CL_rc;
-
-  CL_rc = hc_clSetKernelArg (hashcat_ctx, kernel, 6, sizeof (cl_ulong), device_param->kernel_params_amp[6]);
-
-  if (CL_rc == -1) return -1;
-
-  const size_t global_work_size[3] = { num_elements,    1, 1 };
-  const size_t local_work_size[3]  = { kernel_threads,  1, 1 };
-
-  CL_rc = hc_clEnqueueNDRangeKernel (hashcat_ctx, device_param->command_queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL);
-
-  if (CL_rc == -1) return -1;
-
-  CL_rc = hc_clFlush (hashcat_ctx, device_param->command_queue);
-
-  if (CL_rc == -1) return -1;
-
-  CL_rc = hc_clFinish (hashcat_ctx, device_param->command_queue);
-
-  if (CL_rc == -1) return -1;
-
-  return 0;
-}
-
-int run_kernel_atinit (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, cl_mem buf, const u64 num)
-{
-  u64 num_elements = num;
-
-  device_param->kernel_params_atinit_buf64[1] = num_elements;
-
-  const u64 kernel_threads = device_param->kernel_wgs_atinit;
-
-  num_elements = round_up_multiple_64 (num_elements, kernel_threads);
-
-  cl_kernel kernel = device_param->kernel_atinit;
-
-  const size_t global_work_size[3] = { num_elements,    1, 1 };
-  const size_t local_work_size[3]  = { kernel_threads,  1, 1 };
-
-  int CL_rc;
-
-  CL_rc = hc_clSetKernelArg (hashcat_ctx, kernel, 0, sizeof (cl_mem), (void *) &buf);
-
-  if (CL_rc == -1) return -1;
-
-  CL_rc = hc_clSetKernelArg (hashcat_ctx, kernel, 1, sizeof (cl_ulong), device_param->kernel_params_atinit[1]);
-
-  if (CL_rc == -1) return -1;
-
-  CL_rc = hc_clEnqueueNDRangeKernel (hashcat_ctx, device_param->command_queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL);
-
-  if (CL_rc == -1) return -1;
-
-  CL_rc = hc_clFlush (hashcat_ctx, device_param->command_queue);
-
-  if (CL_rc == -1) return -1;
-
-  CL_rc = hc_clFinish (hashcat_ctx, device_param->command_queue);
-
-  if (CL_rc == -1) return -1;
-
-  return 0;
-}
-
-int run_kernel_memset (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, cl_mem buf, const u32 value, const u64 size)
-{
-  const u64 num16d = size / 16;
-  const u64 num16m = size % 16;
-
-  if (num16d)
-  {
-    device_param->kernel_params_memset_buf32[1] = value;
-    device_param->kernel_params_memset_buf64[2] = num16d;
-
-    const u64 kernel_threads = device_param->kernel_wgs_memset;
-
-    u64 num_elements = num16d;
-
-    num_elements = round_up_multiple_64 (num_elements, kernel_threads);
-
-    cl_kernel kernel = device_param->kernel_memset;
-
-    int CL_rc;
-
-    CL_rc = hc_clSetKernelArg (hashcat_ctx, kernel, 0, sizeof (cl_mem),   (void *) &buf);                         if (CL_rc == -1) return -1;
-    CL_rc = hc_clSetKernelArg (hashcat_ctx, kernel, 1, sizeof (cl_uint),  device_param->kernel_params_memset[1]); if (CL_rc == -1) return -1;
-    CL_rc = hc_clSetKernelArg (hashcat_ctx, kernel, 2, sizeof (cl_ulong), device_param->kernel_params_memset[2]); if (CL_rc == -1) return -1;
-
-    const size_t global_work_size[3] = { num_elements,   1, 1 };
-    const size_t local_work_size[3]  = { kernel_threads, 1, 1 };
-
-    CL_rc = hc_clEnqueueNDRangeKernel (hashcat_ctx, device_param->command_queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL);
-
-    if (CL_rc == -1) return -1;
-
-    CL_rc = hc_clFlush (hashcat_ctx, device_param->command_queue);
-
-    if (CL_rc == -1) return -1;
-
-    CL_rc = hc_clFinish (hashcat_ctx, device_param->command_queue);
-
-    if (CL_rc == -1) return -1;
-  }
-
-  if (num16m)
-  {
-    u32 tmp[4];
-
-    tmp[0] = value;
-    tmp[1] = value;
-    tmp[2] = value;
-    tmp[3] = value;
-
-    int CL_rc;
-
-    CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->command_queue, buf, CL_TRUE, num16d * 16, num16m, tmp, 0, NULL, NULL);
-
-    if (CL_rc == -1) return -1;
-  }
-
-  return 0;
-}
-
-int run_kernel_decompress (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u64 num)
-{
-  u64 num_elements = num;
-
-  device_param->kernel_params_decompress_buf64[3] = num_elements;
-
-  const u64 kernel_threads = device_param->kernel_wgs_decompress;
-
-  num_elements = round_up_multiple_64 (num_elements, kernel_threads);
-
-  cl_kernel kernel = device_param->kernel_decompress;
-
-  const size_t global_work_size[3] = { num_elements,    1, 1 };
-  const size_t local_work_size[3]  = { kernel_threads,  1, 1 };
-
-  int CL_rc;
-
-  CL_rc = hc_clSetKernelArg (hashcat_ctx, kernel, 3, sizeof (cl_ulong), device_param->kernel_params_decompress[3]);
-
-  if (CL_rc == -1) return -1;
-
-  CL_rc = hc_clEnqueueNDRangeKernel (hashcat_ctx, device_param->command_queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL);
-
-  if (CL_rc == -1) return -1;
-
-  CL_rc = hc_clFlush (hashcat_ctx, device_param->command_queue);
-
-  if (CL_rc == -1) return -1;
-
-  CL_rc = hc_clFinish (hashcat_ctx, device_param->command_queue);
-
-  if (CL_rc == -1) return -1;
-
-  return 0;
-}
-
-int run_kernel_bzero (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, cl_mem buf, const u64 size)
-{
-  return run_kernel_memset (hashcat_ctx, device_param, buf, 0, size);
-}
-
-int run_copy (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u64 pws_cnt)
-{
-  combinator_ctx_t     *combinator_ctx      = hashcat_ctx->combinator_ctx;
-  hashconfig_t         *hashconfig          = hashcat_ctx->hashconfig;
-  user_options_t       *user_options        = hashcat_ctx->user_options;
-  user_options_extra_t *user_options_extra  = hashcat_ctx->user_options_extra;
-
-  // init speed timer
-
-  #if defined (_WIN)
-  if (device_param->timer_speed.QuadPart == 0)
-  {
-    hc_timer_set (&device_param->timer_speed);
-  }
-  #else
-  if (device_param->timer_speed.tv_sec == 0)
-  {
-    hc_timer_set (&device_param->timer_speed);
-  }
-  #endif
-
-  if (user_options->slow_candidates == true)
-  {
-    int CL_rc;
-
-    CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->command_queue, device_param->d_pws_idx, CL_TRUE, 0, pws_cnt * sizeof (pw_idx_t), device_param->pws_idx, 0, NULL, NULL);
-
-    if (CL_rc == -1) return -1;
-
-    const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt;
-
-    const u32 off = pw_idx->off;
-
-    if (off)
-    {
-      CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->command_queue, device_param->d_pws_comp_buf, CL_TRUE, 0, off * sizeof (u32), device_param->pws_comp, 0, NULL, NULL);
-
-      if (CL_rc == -1) return -1;
-    }
-
-    CL_rc = run_kernel_decompress (hashcat_ctx, device_param, pws_cnt);
-
-    if (CL_rc == -1) return -1;
-  }
-  else
-  {
-    if (user_options_extra->attack_kern == ATTACK_KERN_STRAIGHT)
-    {
-      int CL_rc;
-
-      CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->command_queue, device_param->d_pws_idx, CL_TRUE, 0, pws_cnt * sizeof (pw_idx_t), device_param->pws_idx, 0, NULL, NULL);
-
-      if (CL_rc == -1) return -1;
-
-      const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt;
-
-      const u32 off = pw_idx->off;
-
-      if (off)
-      {
-        CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->command_queue, device_param->d_pws_comp_buf, CL_TRUE, 0, off * sizeof (u32), device_param->pws_comp, 0, NULL, NULL);
-
-        if (CL_rc == -1) return -1;
-      }
-
-      CL_rc = run_kernel_decompress (hashcat_ctx, device_param, pws_cnt);
-
-      if (CL_rc == -1) return -1;
-    }
-    else if (user_options_extra->attack_kern == ATTACK_KERN_COMBI)
-    {
-      if (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL)
-      {
-        if (user_options->attack_mode == ATTACK_MODE_COMBI)
-        {
-          if (combinator_ctx->combs_mode == COMBINATOR_MODE_BASE_RIGHT)
-          {
-            if (hashconfig->opts_type & OPTS_TYPE_PT_ADD01)
-            {
-              rebuild_pws_compressed_append (device_param, pws_cnt, 0x01);
-            }
-            else if (hashconfig->opts_type & OPTS_TYPE_PT_ADD06)
-            {
-              rebuild_pws_compressed_append (device_param, pws_cnt, 0x06);
-            }
-            else if (hashconfig->opts_type & OPTS_TYPE_PT_ADD80)
-            {
-              rebuild_pws_compressed_append (device_param, pws_cnt, 0x80);
-            }
-          }
-        }
-        else if (user_options->attack_mode == ATTACK_MODE_HYBRID2)
-        {
-          if (hashconfig->opts_type & OPTS_TYPE_PT_ADD01)
-          {
-            rebuild_pws_compressed_append (device_param, pws_cnt, 0x01);
-          }
-          else if (hashconfig->opts_type & OPTS_TYPE_PT_ADD06)
-          {
-            rebuild_pws_compressed_append (device_param, pws_cnt, 0x06);
-          }
-          else if (hashconfig->opts_type & OPTS_TYPE_PT_ADD80)
-          {
-            rebuild_pws_compressed_append (device_param, pws_cnt, 0x80);
-          }
-        }
-
-        int CL_rc;
-
-        CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->command_queue, device_param->d_pws_idx, CL_TRUE, 0, pws_cnt * sizeof (pw_idx_t), device_param->pws_idx, 0, NULL, NULL);
-
-        if (CL_rc == -1) return -1;
-
-        const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt;
-
-        const u32 off = pw_idx->off;
-
-        if (off)
-        {
-          CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->command_queue, device_param->d_pws_comp_buf, CL_TRUE, 0, off * sizeof (u32), device_param->pws_comp, 0, NULL, NULL);
-
-          if (CL_rc == -1) return -1;
-        }
-
-        CL_rc = run_kernel_decompress (hashcat_ctx, device_param, pws_cnt);
-
-        if (CL_rc == -1) return -1;
-      }
-      else
-      {
-        if (user_options->attack_mode == ATTACK_MODE_COMBI)
-        {
-          int CL_rc;
-
-          CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->command_queue, device_param->d_pws_idx, CL_TRUE, 0, pws_cnt * sizeof (pw_idx_t), device_param->pws_idx, 0, NULL, NULL);
-
-          if (CL_rc == -1) return -1;
-
-          const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt;
-
-          const u32 off = pw_idx->off;
-
-          if (off)
-          {
-            CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->command_queue, device_param->d_pws_comp_buf, CL_TRUE, 0, off * sizeof (u32), device_param->pws_comp, 0, NULL, NULL);
-
-            if (CL_rc == -1) return -1;
-          }
-
-          CL_rc = run_kernel_decompress (hashcat_ctx, device_param, pws_cnt);
-
-          if (CL_rc == -1) return -1;
-        }
-        else if (user_options->attack_mode == ATTACK_MODE_HYBRID1)
-        {
-          int CL_rc;
-
-          CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->command_queue, device_param->d_pws_idx, CL_TRUE, 0, pws_cnt * sizeof (pw_idx_t), device_param->pws_idx, 0, NULL, NULL);
-
-          if (CL_rc == -1) return -1;
-
-          const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt;
-
-          const u32 off = pw_idx->off;
-
-          if (off)
-          {
-            CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->command_queue, device_param->d_pws_comp_buf, CL_TRUE, 0, off * sizeof (u32), device_param->pws_comp, 0, NULL, NULL);
-
-            if (CL_rc == -1) return -1;
-          }
-
-          CL_rc = run_kernel_decompress (hashcat_ctx, device_param, pws_cnt);
-
-          if (CL_rc == -1) return -1;
-        }
-        else if (user_options->attack_mode == ATTACK_MODE_HYBRID2)
-        {
-          const u64 off = device_param->words_off;
-
-          device_param->kernel_params_mp_buf64[3] = off;
-
-          const int CL_rc = run_kernel_mp (hashcat_ctx, device_param, KERN_RUN_MP, pws_cnt);
-
-          if (CL_rc == -1) return -1;
-        }
-      }
-    }
-    else if (user_options_extra->attack_kern == ATTACK_KERN_BF)
-    {
-      const u64 off = device_param->words_off;
-
-      device_param->kernel_params_mp_l_buf64[3] = off;
-
-      const int CL_rc = run_kernel_mp (hashcat_ctx, device_param, KERN_RUN_MP_L, pws_cnt);
-
-      if (CL_rc == -1) return -1;
-    }
-  }
-
-  return 0;
-}
-
-int run_cracker (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u64 pws_cnt)
-{
-  combinator_ctx_t      *combinator_ctx     = hashcat_ctx->combinator_ctx;
-  hashconfig_t          *hashconfig         = hashcat_ctx->hashconfig;
-  hashes_t              *hashes             = hashcat_ctx->hashes;
-  mask_ctx_t            *mask_ctx           = hashcat_ctx->mask_ctx;
-  status_ctx_t          *status_ctx         = hashcat_ctx->status_ctx;
-  straight_ctx_t        *straight_ctx       = hashcat_ctx->straight_ctx;
-  user_options_t        *user_options       = hashcat_ctx->user_options;
-  user_options_extra_t  *user_options_extra = hashcat_ctx->user_options_extra;
-
-  // do the on-the-fly combinator mode encoding
-
-  bool iconv_enabled = false;
-
-  iconv_t iconv_ctx = NULL;
-
-  char *iconv_tmp = NULL;
-
-  if (strcmp (user_options->encoding_from, user_options->encoding_to) != 0)
-  {
-    iconv_enabled = true;
-
-    iconv_ctx = iconv_open (user_options->encoding_to, user_options->encoding_from);
-
-    if (iconv_ctx == (iconv_t) -1) return -1;
-
-    iconv_tmp = (char *) hcmalloc (HCBUFSIZ_TINY);
-  }
-
-  // find higest password length, this is for optimization stuff
-
-  u32 highest_pw_len = 0;
-
-  if (user_options->slow_candidates == true)
-  {
-    /*
-    for (u64 pws_idx = 0; pws_idx < pws_cnt; pws_idx++)
-    {
-      pw_idx_t *pw_idx = device_param->pws_idx + pws_idx;
-
-      highest_pw_len = MAX (highest_pw_len, pw_idx->len);
-    }
-    */
-  }
-  else
-  {
-    if (user_options_extra->attack_kern == ATTACK_KERN_STRAIGHT)
-    {
-    }
-    else if (user_options_extra->attack_kern == ATTACK_KERN_COMBI)
-    {
-    }
-    else if (user_options_extra->attack_kern == ATTACK_KERN_BF)
-    {
-      highest_pw_len = device_param->kernel_params_mp_l_buf32[4]
-                     + device_param->kernel_params_mp_l_buf32[5];
-    }
-  }
-
-  // we make use of this in status view
-
-  device_param->outerloop_multi = 1;
-  device_param->outerloop_msec  = 0;
-  device_param->outerloop_pos   = 0;
-  device_param->outerloop_left  = pws_cnt;
-
-  // we ignore the time to copy data over pci bus in this case
-
-  if (user_options->speed_only == true)
-  {
-    hc_timer_set (&device_param->timer_speed);
-  }
-
-  // loop start: most outer loop = salt iteration, then innerloops (if multi)
-
-  for (u32 salt_pos = 0; salt_pos < hashes->salts_cnt; salt_pos++)
-  {
-    while (status_ctx->devices_status == STATUS_PAUSED) sleep (1);
-
-    salt_t *salt_buf = &hashes->salts_buf[salt_pos];
-
-    device_param->kernel_params_buf32[27] = salt_pos;
-    device_param->kernel_params_buf32[31] = salt_buf->digests_cnt;
-    device_param->kernel_params_buf32[32] = salt_buf->digests_offset;
-
-    FILE *combs_fp = device_param->combs_fp;
-
-    if (user_options->slow_candidates == true)
-    {
-    }
-    else
-    {
-      if ((user_options->attack_mode == ATTACK_MODE_COMBI) || (((hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL) == 0) && (user_options->attack_mode == ATTACK_MODE_HYBRID2)))
-      {
-        rewind (combs_fp);
-      }
-    }
-
-    // iteration type
-
-    u32 innerloop_step = 0;
-    u32 innerloop_cnt  = 0;
-
-    if (user_options->slow_candidates == true)
-    {
-      innerloop_step = 1;
-      innerloop_cnt  = 1;
-    }
-    else
-    {
-      if   (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL) innerloop_step = device_param->kernel_loops;
-      else                                                        innerloop_step = 1;
-
-      if      (user_options_extra->attack_kern == ATTACK_KERN_STRAIGHT)  innerloop_cnt = (u32) straight_ctx->kernel_rules_cnt;
-      else if (user_options_extra->attack_kern == ATTACK_KERN_COMBI)     innerloop_cnt = (u32) combinator_ctx->combs_cnt;
-      else if (user_options_extra->attack_kern == ATTACK_KERN_BF)        innerloop_cnt = (u32) mask_ctx->bfs_cnt;
-    }
-
-    // innerloops
-
-    for (u32 innerloop_pos = 0; innerloop_pos < innerloop_cnt; innerloop_pos += innerloop_step)
-    {
-      while (status_ctx->devices_status == STATUS_PAUSED) sleep (1);
-
-      u32 fast_iteration = 0;
-
-      u32 innerloop_left = innerloop_cnt - innerloop_pos;
-
-      if (innerloop_left > innerloop_step)
-      {
-        innerloop_left = innerloop_step;
-
-        fast_iteration = 1;
-      }
-
-      hc_thread_mutex_lock (status_ctx->mux_display);
-
-      device_param->innerloop_pos  = innerloop_pos;
-      device_param->innerloop_left = innerloop_left;
-
-      device_param->kernel_params_buf32[30] = (u32) innerloop_left;
-
-      device_param->outerloop_multi = (double) innerloop_cnt / (double) (innerloop_pos + innerloop_left);
-
-      hc_thread_mutex_unlock (status_ctx->mux_display);
-
-      if (hashes->salts_shown[salt_pos] == 1)
-      {
-        status_ctx->words_progress_done[salt_pos] += (u64) pws_cnt * innerloop_left;
-
-        continue;
-      }
-
-      // initialize and copy amplifiers
-
-      if (user_options->slow_candidates == true)
-      {
-      }
-      else
-      {
-        if (user_options_extra->attack_kern == ATTACK_KERN_STRAIGHT)
-        {
-          const int CL_rc = hc_clEnqueueCopyBuffer (hashcat_ctx, device_param->command_queue, device_param->d_rules, device_param->d_rules_c, innerloop_pos * sizeof (kernel_rule_t), 0, innerloop_left * sizeof (kernel_rule_t), 0, NULL, NULL);
-
-          if (CL_rc == -1) return -1;
-        }
-        else if (user_options_extra->attack_kern == ATTACK_KERN_COMBI)
-        {
-          if (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL)
-          {
-            if (user_options->attack_mode == ATTACK_MODE_COMBI)
-            {
-              char *line_buf = device_param->scratch_buf;
-
-              u32 i = 0;
-
-              while (i < innerloop_left)
-              {
-                if (feof (combs_fp)) break;
-
-                size_t line_len = fgetl (combs_fp, line_buf);
-
-                line_len = convert_from_hex (hashcat_ctx, line_buf, line_len);
-
-                if (line_len > PW_MAX) continue;
-
-                char *line_buf_new = line_buf;
-
-                char rule_buf_out[RP_PASSWORD_SIZE];
-
-                if (run_rule_engine (user_options_extra->rule_len_r, user_options->rule_buf_r))
-                {
-                  if (line_len >= RP_PASSWORD_SIZE) continue;
-
-                  memset (rule_buf_out, 0, sizeof (rule_buf_out));
-
-                  const int rule_len_out = _old_apply_rule (user_options->rule_buf_r, user_options_extra->rule_len_r, line_buf, (u32) line_len, rule_buf_out);
-
-                  if (rule_len_out < 0)
-                  {
-                    status_ctx->words_progress_rejected[salt_pos] += pws_cnt;
-
-                    continue;
-                  }
-
-                  line_len = rule_len_out;
-
-                  line_buf_new = rule_buf_out;
-                }
-
-                // do the on-the-fly encoding
-
-                if (iconv_enabled == true)
-                {
-                  char  *iconv_ptr = iconv_tmp;
-                  size_t iconv_sz  = HCBUFSIZ_TINY;
-
-                  const size_t iconv_rc = iconv (iconv_ctx, &line_buf_new, &line_len, &iconv_ptr, &iconv_sz);
-
-                  if (iconv_rc == (size_t) -1) continue;
-
-                  line_buf_new = iconv_tmp;
-                  line_len     = HCBUFSIZ_TINY - iconv_sz;
-                }
-
-                line_len = MIN (line_len, PW_MAX);
-
-                u8 *ptr = (u8 *) device_param->combs_buf[i].i;
-
-                memcpy (ptr, line_buf_new, line_len);
-
-                memset (ptr + line_len, 0, PW_MAX - line_len);
-
-                if (hashconfig->opts_type & OPTS_TYPE_PT_UPPER)
-                {
-                  uppercase (ptr, line_len);
-                }
-
-                if (combinator_ctx->combs_mode == COMBINATOR_MODE_BASE_LEFT)
-                {
-                  if (hashconfig->opts_type & OPTS_TYPE_PT_ADD80)
-                  {
-                    ptr[line_len] = 0x80;
-                  }
-
-                  if (hashconfig->opts_type & OPTS_TYPE_PT_ADD06)
-                  {
-                    ptr[line_len] = 0x06;
-                  }
-
-                  if (hashconfig->opts_type & OPTS_TYPE_PT_ADD01)
-                  {
-                    ptr[line_len] = 0x01;
-                  }
-                }
-
-                device_param->combs_buf[i].pw_len = (u32) line_len;
-
-                i++;
-              }
-
-              for (u32 j = i; j < innerloop_left; j++)
-              {
-                memset (&device_param->combs_buf[j], 0, sizeof (pw_t));
-              }
-
-              innerloop_left = i;
-
-              const int CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->command_queue, device_param->d_combs_c, CL_TRUE, 0, innerloop_left * sizeof (pw_t), device_param->combs_buf, 0, NULL, NULL);
-
-              if (CL_rc == -1) return -1;
-            }
-            else if (user_options->attack_mode == ATTACK_MODE_HYBRID1)
-            {
-              u64 off = innerloop_pos;
-
-              device_param->kernel_params_mp_buf64[3] = off;
-
-              int CL_rc;
-
-              CL_rc = run_kernel_mp (hashcat_ctx, device_param, KERN_RUN_MP, innerloop_left);
-
-              if (CL_rc == -1) return -1;
-
-              CL_rc = hc_clEnqueueCopyBuffer (hashcat_ctx, device_param->command_queue, device_param->d_combs, device_param->d_combs_c, 0, 0, innerloop_left * sizeof (pw_t), 0, NULL, NULL);
-
-              if (CL_rc == -1) return -1;
-            }
-            else if (user_options->attack_mode == ATTACK_MODE_HYBRID2)
-            {
-              u64 off = innerloop_pos;
-
-              device_param->kernel_params_mp_buf64[3] = off;
-
-              int CL_rc;
-
-              CL_rc = run_kernel_mp (hashcat_ctx, device_param, KERN_RUN_MP, innerloop_left);
-
-              if (CL_rc == -1) return -1;
-
-              CL_rc = hc_clEnqueueCopyBuffer (hashcat_ctx, device_param->command_queue, device_param->d_combs, device_param->d_combs_c, 0, 0, innerloop_left * sizeof (pw_t), 0, NULL, NULL);
-
-              if (CL_rc == -1) return -1;
-            }
-          }
-          else
-          {
-            if ((user_options->attack_mode == ATTACK_MODE_COMBI) || (user_options->attack_mode == ATTACK_MODE_HYBRID2))
-            {
-              char *line_buf = device_param->scratch_buf;
-
-              u32 i = 0;
-
-              while (i < innerloop_left)
-              {
-                if (feof (combs_fp)) break;
-
-                size_t line_len = fgetl (combs_fp, line_buf);
-
-                line_len = convert_from_hex (hashcat_ctx, line_buf, line_len);
-
-                if (line_len > PW_MAX) continue;
-
-                char *line_buf_new = line_buf;
-
-                char rule_buf_out[RP_PASSWORD_SIZE];
-
-                if (run_rule_engine (user_options_extra->rule_len_r, user_options->rule_buf_r))
-                {
-                  if (line_len >= RP_PASSWORD_SIZE) continue;
-
-                  memset (rule_buf_out, 0, sizeof (rule_buf_out));
-
-                  const int rule_len_out = _old_apply_rule (user_options->rule_buf_r, user_options_extra->rule_len_r, line_buf, (u32) line_len, rule_buf_out);
-
-                  if (rule_len_out < 0)
-                  {
-                    status_ctx->words_progress_rejected[salt_pos] += pws_cnt;
-
-                    continue;
-                  }
-
-                  line_len = rule_len_out;
-
-                  line_buf_new = rule_buf_out;
-                }
-
-                // do the on-the-fly encoding
-
-                if (iconv_enabled == true)
-                {
-                  char  *iconv_ptr = iconv_tmp;
-                  size_t iconv_sz  = HCBUFSIZ_TINY;
-
-                  const size_t iconv_rc = iconv (iconv_ctx, &line_buf_new, &line_len, &iconv_ptr, &iconv_sz);
-
-                  if (iconv_rc == (size_t) -1) continue;
-
-                  line_buf_new = iconv_tmp;
-                  line_len     = HCBUFSIZ_TINY - iconv_sz;
-                }
-
-                line_len = MIN (line_len, PW_MAX);
-
-                u8 *ptr = (u8 *) device_param->combs_buf[i].i;
-
-                memcpy (ptr, line_buf_new, line_len);
-
-                memset (ptr + line_len, 0, PW_MAX - line_len);
-
-                if (hashconfig->opts_type & OPTS_TYPE_PT_UPPER)
-                {
-                  uppercase (ptr, line_len);
-                }
-
-                /*
-                if (combinator_ctx->combs_mode == COMBINATOR_MODE_BASE_LEFT)
-                {
-                  if (hashconfig->opts_type & OPTS_TYPE_PT_ADD80)
-                  {
-                    ptr[line_len] = 0x80;
-                  }
-
-                  if (hashconfig->opts_type & OPTS_TYPE_PT_ADD06)
-                  {
-                    ptr[line_len] = 0x06;
-                  }
-
-                  if (hashconfig->opts_type & OPTS_TYPE_PT_ADD01)
-                  {
-                    ptr[line_len] = 0x01;
-                  }
-                }
-                */
-
-                device_param->combs_buf[i].pw_len = (u32) line_len;
-
-                i++;
-              }
-
-              for (u32 j = i; j < innerloop_left; j++)
-              {
-                memset (&device_param->combs_buf[j], 0, sizeof (pw_t));
-              }
-
-              innerloop_left = i;
-
-              const int CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->command_queue, device_param->d_combs_c, CL_TRUE, 0, innerloop_left * sizeof (pw_t), device_param->combs_buf, 0, NULL, NULL);
-
-              if (CL_rc == -1) return -1;
-            }
-            else if (user_options->attack_mode == ATTACK_MODE_HYBRID1)
-            {
-              u64 off = innerloop_pos;
-
-              device_param->kernel_params_mp_buf64[3] = off;
-
-              int CL_rc;
-
-              CL_rc = run_kernel_mp (hashcat_ctx, device_param, KERN_RUN_MP, innerloop_left);
-
-              if (CL_rc == -1) return -1;
-
-              CL_rc = hc_clEnqueueCopyBuffer (hashcat_ctx, device_param->command_queue, device_param->d_combs, device_param->d_combs_c, 0, 0, innerloop_left * sizeof (pw_t), 0, NULL, NULL);
-
-              if (CL_rc == -1) return -1;
-            }
-          }
-        }
-        else if (user_options_extra->attack_kern == ATTACK_KERN_BF)
-        {
-          u64 off = innerloop_pos;
-
-          device_param->kernel_params_mp_r_buf64[3] = off;
-
-          int CL_rc;
-
-          CL_rc = run_kernel_mp (hashcat_ctx, device_param, KERN_RUN_MP_R, innerloop_left);
-
-          if (CL_rc == -1) return -1;
-
-          CL_rc = hc_clEnqueueCopyBuffer (hashcat_ctx, device_param->command_queue, device_param->d_bfs, device_param->d_bfs_c, 0, 0, innerloop_left * sizeof (bf_t), 0, NULL, NULL);
-
-          if (CL_rc == -1) return -1;
-        }
-      }
-
-      const int rc = choose_kernel (hashcat_ctx, device_param, highest_pw_len, pws_cnt, fast_iteration, salt_pos);
-
-      if (rc == -1) return -1;
-
-      /**
-       * benchmark was aborted because too long kernel runtime (slow hashes only)
-       */
-
-      if ((user_options->speed_only == true) && (device_param->speed_only_finish == true))
-      {
-        // nothing to do in that case
-      }
-      else
-      {
-        /**
-         * speed
-         */
-
-        if (status_ctx->run_thread_level2 == true)
-        {
-          const u64 perf_sum_all = (u64) pws_cnt * innerloop_left;
-
-          const double speed_msec = hc_timer_get (device_param->timer_speed);
-
-          hc_timer_set (&device_param->timer_speed);
-
-          u32 speed_pos = device_param->speed_pos;
-
-          device_param->speed_cnt[speed_pos] = perf_sum_all;
-
-          device_param->speed_msec[speed_pos] = speed_msec;
-
-          speed_pos++;
-
-          if (speed_pos == SPEED_CACHE)
-          {
-            speed_pos = 0;
-          }
-
-          device_param->speed_pos = speed_pos;
-
-          /**
-           * progress
-           */
-
-          hc_thread_mutex_lock (status_ctx->mux_counter);
-
-          status_ctx->words_progress_done[salt_pos] += perf_sum_all;
-
-          hc_thread_mutex_unlock (status_ctx->mux_counter);
-        }
-      }
-
-      /**
-       * benchmark, part2
-       */
-
-      if (user_options->speed_only == true)
-      {
-        // let's abort this so that the user doesn't have to wait too long on the result
-        // for slow hashes it's fine anyway as boost mode should be turned on
-
-        if (hashconfig->attack_exec == ATTACK_EXEC_OUTSIDE_KERNEL)
-        {
-          device_param->speed_only_finish = true;
-
-          break;
-        }
-        else
-        {
-          double total_msec = device_param->speed_msec[0];
-
-          for (u32 speed_pos = 1; speed_pos < device_param->speed_pos; speed_pos++)
-          {
-            total_msec += device_param->speed_msec[speed_pos];
-          }
-
-          if (user_options->slow_candidates == true)
-          {
-            if ((total_msec > 4000) || (device_param->speed_pos == SPEED_CACHE - 1))
-            {
-              const u32 speed_pos = device_param->speed_pos;
-
-              if (speed_pos)
-              {
-                device_param->speed_cnt[0]  = device_param->speed_cnt[speed_pos - 1];
-                device_param->speed_msec[0] = device_param->speed_msec[speed_pos - 1];
-              }
-
-              device_param->speed_pos = 0;
-
-              device_param->speed_only_finish = true;
-
-              break;
-            }
-          }
-          else
-          {
-            // it's unclear if 4s is enough to turn on boost mode for all opencl device
-
-            if ((total_msec > 4000) || (device_param->speed_pos == SPEED_CACHE - 1))
-            {
-              device_param->speed_only_finish = true;
-
-              break;
-            }
-          }
-        }
-      }
-
-      if (device_param->speed_only_finish == true) break;
-
-      /**
-       * result
-       */
-
-      check_cracked (hashcat_ctx, device_param, salt_pos);
-
-      if (status_ctx->run_thread_level2 == false) break;
-    }
-
-    if (user_options->speed_only == true) break;
-
-    //status screen makes use of this, can't reset here
-    //device_param->innerloop_msec = 0;
-    //device_param->innerloop_pos  = 0;
-    //device_param->innerloop_left = 0;
-
-    if (status_ctx->run_thread_level2 == false) break;
-  }
-
-  //status screen makes use of this, can't reset here
-  //device_param->outerloop_msec = 0;
-  //device_param->outerloop_pos  = 0;
-  //device_param->outerloop_left = 0;
-
-  if (user_options->speed_only == true)
-  {
-    double total_msec = device_param->speed_msec[0];
-
-    for (u32 speed_pos = 1; speed_pos < device_param->speed_pos; speed_pos++)
-    {
-      total_msec += device_param->speed_msec[speed_pos];
-    }
-
-    device_param->outerloop_msec = total_msec * hashes->salts_cnt * device_param->outerloop_multi;
-
-    device_param->speed_only_finish = true;
-  }
-
-  return 0;
-}
-
-int opencl_ctx_init (hashcat_ctx_t *hashcat_ctx)
-{
-  opencl_ctx_t   *opencl_ctx   = hashcat_ctx->opencl_ctx;
-  user_options_t *user_options = hashcat_ctx->user_options;
-
-  opencl_ctx->enabled = false;
-
-  if (user_options->example_hashes == true) return 0;
-  if (user_options->keyspace       == true) return 0;
-  if (user_options->left           == true) return 0;
-  if (user_options->show           == true) return 0;
-  if (user_options->usage          == true) return 0;
-  if (user_options->version        == true) return 0;
-
-  hc_device_param_t *devices_param = (hc_device_param_t *) hccalloc (DEVICES_MAX, sizeof (hc_device_param_t));
-
-  opencl_ctx->devices_param = devices_param;
-
-  /**
-   * Load and map OpenCL library calls
-   */
-
-  OCL_PTR *ocl = (OCL_PTR *) hcmalloc (sizeof (OCL_PTR));
-
-  opencl_ctx->ocl = ocl;
-
-  const int rc_ocl_init = ocl_init (hashcat_ctx);
-
-  if (rc_ocl_init == -1) return -1;
-
-  /**
-   * Some permission pre-check, because AMDGPU-PRO Driver crashes if the user has no permission to do this
-   */
-
-  const int rc_ocl_check = ocl_check_dri (hashcat_ctx);
-
-  if (rc_ocl_check == -1) return -1;
-
-  /**
-   * OpenCL platform selection
-   */
-
-  u64 opencl_platforms_filter;
-
-  const bool rc_platforms_filter = setup_opencl_platforms_filter (hashcat_ctx, user_options->opencl_platforms, &opencl_platforms_filter);
-
-  if (rc_platforms_filter == false) return -1;
-
-  opencl_ctx->opencl_platforms_filter = opencl_platforms_filter;
-
-  /**
-   * OpenCL device selection
-   */
-
-  u64 devices_filter;
-
-  const bool rc_devices_filter = setup_devices_filter (hashcat_ctx, user_options->opencl_devices, &devices_filter);
-
-  if (rc_devices_filter == false) return -1;
-
-  opencl_ctx->devices_filter = devices_filter;
-
-  /**
-   * OpenCL device type selection
-   */
-
-  cl_device_type device_types_filter;
-
-  const bool rc_device_types_filter = setup_device_types_filter (hashcat_ctx, user_options->opencl_device_types, &device_types_filter);
-
-  if (rc_device_types_filter == false) return -1;
-
-  opencl_ctx->device_types_filter = device_types_filter;
-
-  /**
-   * OpenCL platforms: detect
-   */
-
-  char          **platforms_vendor      = (char **) hccalloc (CL_PLATFORMS_MAX, sizeof (char *));
-  char          **platforms_name        = (char **) hccalloc (CL_PLATFORMS_MAX, sizeof (char *));
-  char          **platforms_version     = (char **) hccalloc (CL_PLATFORMS_MAX, sizeof (char *));
-  bool           *platforms_skipped     = (bool *)  hccalloc (CL_PLATFORMS_MAX, sizeof (bool));
-  cl_uint         platforms_cnt         = 0;
-  cl_platform_id *platforms             = (cl_platform_id *) hccalloc (CL_PLATFORMS_MAX, sizeof (cl_platform_id));
-  cl_uint         platform_devices_cnt  = 0;
-  cl_device_id   *platform_devices      = (cl_device_id *) hccalloc (DEVICES_MAX, sizeof (cl_device_id));
-
-  int CL_rc = hc_clGetPlatformIDs (hashcat_ctx, CL_PLATFORMS_MAX, platforms, &platforms_cnt);
-
-  #define FREE_OPENCL_CTX_ON_ERROR \
-  {                                \
-      hcfree (platforms_vendor);   \
-      hcfree (platforms_name);     \
-      hcfree (platforms_version);  \
-      hcfree (platforms_skipped);  \
-      hcfree (platforms);          \
-      hcfree (platform_devices);   \
-  }
-
-  if (CL_rc == -1)
-  {
-    FREE_OPENCL_CTX_ON_ERROR;
-
-    return -1;
-  }
-
-  if (platforms_cnt == 0)
-  {
-    event_log_error (hashcat_ctx, "ATTENTION! No OpenCL-compatible platform found.");
-
-    event_log_warning (hashcat_ctx, "You are probably missing the OpenCL runtime installation.");
-    event_log_warning (hashcat_ctx, NULL);
-
-    #if defined (__linux__)
-    event_log_warning (hashcat_ctx, "* AMD GPUs on Linux require this runtime and/or driver:");
-    event_log_warning (hashcat_ctx, "  \"AMDGPU-PRO Driver\" (16.40 or later)");
-    #elif defined (_WIN)
-    event_log_warning (hashcat_ctx, "* AMD GPUs on Windows require this runtime and/or driver:");
-    event_log_warning (hashcat_ctx, "  \"AMD Radeon Software Crimson Edition\" (15.12 or later)");
-    #endif
-
-    event_log_warning (hashcat_ctx, "* Intel CPUs require this runtime and/or driver:");
-    event_log_warning (hashcat_ctx, "  \"OpenCL Runtime for Intel Core and Intel Xeon Processors\" (16.1.1 or later)");
-
-    #if defined (__linux__)
-    event_log_warning (hashcat_ctx, "* Intel GPUs on Linux require this runtime and/or driver:");
-    event_log_warning (hashcat_ctx, "  \"OpenCL 2.0 GPU Driver Package for Linux\" (2.0 or later)");
-    #elif defined (_WIN)
-    event_log_warning (hashcat_ctx, "* Intel GPUs on Windows require this runtime and/or driver:");
-    event_log_warning (hashcat_ctx, "  \"OpenCL Driver for Intel Iris and Intel HD Graphics\"");
-    #endif
-
-    event_log_warning (hashcat_ctx, "* NVIDIA GPUs require this runtime and/or driver:");
-    event_log_warning (hashcat_ctx, "  \"NVIDIA Driver\" (418.56 or later)");
-    event_log_warning (hashcat_ctx, NULL);
-
-    FREE_OPENCL_CTX_ON_ERROR;
-
-    return -1;
-  }
-
-  if (opencl_platforms_filter != (u64) -1)
-  {
-    u64 platform_cnt_mask = ~(((u64) -1 >> platforms_cnt) << platforms_cnt);
-
-    if (opencl_platforms_filter > platform_cnt_mask)
-    {
-      event_log_error (hashcat_ctx, "An invalid platform was specified using the --opencl-platforms parameter.");
-      event_log_error (hashcat_ctx, "The specified platform was higher than the number of available platforms (%u).", platforms_cnt);
-
-      FREE_OPENCL_CTX_ON_ERROR;
-
-      return -1;
-    }
-  }
-
-  if (user_options->opencl_device_types == NULL)
-  {
-    /**
-     * OpenCL device types:
-     *   In case the user did not specify --opencl-device-types and the user runs hashcat in a system with only a CPU only he probably want to use that CPU.
-     */
-
-    cl_device_type device_types_all = 0;
-
-    for (u32 platform_id = 0; platform_id < platforms_cnt; platform_id++)
-    {
-      if ((opencl_platforms_filter & (1ULL << platform_id)) == 0) continue;
-
-      cl_platform_id platform = platforms[platform_id];
-
-      CL_rc = hc_clGetDeviceIDs (hashcat_ctx, platform, CL_DEVICE_TYPE_ALL, DEVICES_MAX, platform_devices, &platform_devices_cnt);
-
-      if (CL_rc == -1) continue;
-
-      for (u32 platform_devices_id = 0; platform_devices_id < platform_devices_cnt; platform_devices_id++)
-      {
-        cl_device_id device = platform_devices[platform_devices_id];
-
-        cl_device_type device_type;
-
-        CL_rc = hc_clGetDeviceInfo (hashcat_ctx, device, CL_DEVICE_TYPE, sizeof (device_type), &device_type, NULL);
-
-        if (CL_rc == -1)
-        {
-          FREE_OPENCL_CTX_ON_ERROR;
-
-          return -1;
-        }
-
-        device_types_all |= device_type;
-      }
-    }
-
-    // In such a case, automatically enable cpu_md5CPU device type support, since it's disabled by default.
-
-    if ((device_types_all & (CL_DEVICE_TYPE_GPU | CL_DEVICE_TYPE_ACCELERATOR)) == 0)
-    {
-      device_types_filter |= CL_DEVICE_TYPE_CPU;
-    }
-
-    // In another case, when the user uses --stdout, using CPU devices is much faster to setup
-    // If we have a CPU device, force it to be used
-
-    if (user_options->stdout_flag == true)
-    {
-      if (device_types_all & CL_DEVICE_TYPE_CPU)
-      {
-        device_types_filter = CL_DEVICE_TYPE_CPU;
-      }
-    }
-
-    opencl_ctx->device_types_filter = device_types_filter;
-  }
-
-  opencl_ctx->enabled = true;
-
-  opencl_ctx->platforms_vendor      = platforms_vendor;
-  opencl_ctx->platforms_name        = platforms_name;
-  opencl_ctx->platforms_version     = platforms_version;
-  opencl_ctx->platforms_skipped     = platforms_skipped;
-  opencl_ctx->platforms_cnt         = platforms_cnt;
-  opencl_ctx->platforms             = platforms;
-  opencl_ctx->platform_devices_cnt  = platform_devices_cnt;
-  opencl_ctx->platform_devices      = platform_devices;
-
-  return 0;
-}
-
-void opencl_ctx_destroy (hashcat_ctx_t *hashcat_ctx)
-{
-  opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
-
-  if (opencl_ctx->enabled == false) return;
-
-  ocl_close (hashcat_ctx);
-
-  hcfree (opencl_ctx->devices_param);
-
-  hcfree (opencl_ctx->platforms);
-  hcfree (opencl_ctx->platform_devices);
-  hcfree (opencl_ctx->platforms_vendor);
-  hcfree (opencl_ctx->platforms_name);
-  hcfree (opencl_ctx->platforms_version);
-  hcfree (opencl_ctx->platforms_skipped);
-
-  memset (opencl_ctx, 0, sizeof (opencl_ctx_t));
-}
-
-int opencl_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
-{
-  opencl_ctx_t   *opencl_ctx   = hashcat_ctx->opencl_ctx;
-  user_options_t *user_options = hashcat_ctx->user_options;
-
-  if (opencl_ctx->enabled == false) return 0;
-
-  /**
-   * OpenCL devices: simply push all devices from all platforms into the same device array
-   */
-
-  cl_uint         platforms_cnt         = opencl_ctx->platforms_cnt;
-  cl_platform_id *platforms             = opencl_ctx->platforms;
-  cl_uint         platform_devices_cnt  = opencl_ctx->platform_devices_cnt;
-  cl_device_id   *platform_devices      = opencl_ctx->platform_devices;
-
-  bool need_adl     = false;
-  bool need_nvml    = false;
-  bool need_nvapi   = false;
-  bool need_sysfs   = false;
-
-  u32 devices_cnt = 0;
-
-  u32 devices_active = 0;
-
-  for (u32 platform_id = 0; platform_id < platforms_cnt; platform_id++)
-  {
-    size_t param_value_size = 0;
-
-    cl_platform_id platform = platforms[platform_id];
-
-    // platform vendor
-
-    int CL_rc;
-
-    CL_rc = hc_clGetPlatformInfo (hashcat_ctx, platform, CL_PLATFORM_VENDOR, 0, NULL, &param_value_size);
-
-    if (CL_rc == -1) return -1;
-
-    char *platform_vendor = (char *) hcmalloc (param_value_size);
-
-    CL_rc = hc_clGetPlatformInfo (hashcat_ctx, platform, CL_PLATFORM_VENDOR, param_value_size, platform_vendor, NULL);
-
-    if (CL_rc == -1) return -1;
-
-    opencl_ctx->platforms_vendor[platform_id] = platform_vendor;
-
-    // platform name
-
-    CL_rc = hc_clGetPlatformInfo (hashcat_ctx, platform, CL_PLATFORM_NAME, 0, NULL, &param_value_size);
-
-    if (CL_rc == -1) return -1;
-
-    char *platform_name = (char *) hcmalloc (param_value_size);
-
-    CL_rc = hc_clGetPlatformInfo (hashcat_ctx, platform, CL_PLATFORM_NAME, param_value_size, platform_name, NULL);
-
-    if (CL_rc == -1) return -1;
-
-    opencl_ctx->platforms_name[platform_id] = platform_name;
-
-    // platform version
-
-    CL_rc = hc_clGetPlatformInfo (hashcat_ctx, platform, CL_PLATFORM_VERSION, 0, NULL, &param_value_size);
-
-    if (CL_rc == -1) return -1;
-
-    char *platform_version = (char *) hcmalloc (param_value_size);
-
-    CL_rc = hc_clGetPlatformInfo (hashcat_ctx, platform, CL_PLATFORM_VERSION, param_value_size, platform_version, NULL);
-
-    if (CL_rc == -1) return -1;
-
-    opencl_ctx->platforms_version[platform_id] = platform_version;
-
-    // find our own platform vendor because pocl and mesa are pushing original vendor_id through opencl
-    // this causes trouble with vendor id based macros
-    // we'll assign generic to those without special optimization available
-
-    cl_uint platform_vendor_id = 0;
-
-    if (strcmp (platform_vendor, CL_VENDOR_AMD1) == 0)
-    {
-      platform_vendor_id = VENDOR_ID_AMD;
-    }
-    else if (strcmp (platform_vendor, CL_VENDOR_AMD2) == 0)
-    {
-      platform_vendor_id = VENDOR_ID_AMD;
-    }
-    else if (strcmp (platform_vendor, CL_VENDOR_AMD_USE_INTEL) == 0)
-    {
-      platform_vendor_id = VENDOR_ID_AMD_USE_INTEL;
-    }
-    else if (strcmp (platform_vendor, CL_VENDOR_APPLE) == 0)
-    {
-      platform_vendor_id = VENDOR_ID_APPLE;
-    }
-    else if (strcmp (platform_vendor, CL_VENDOR_INTEL_BEIGNET) == 0)
-    {
-      platform_vendor_id = VENDOR_ID_INTEL_BEIGNET;
-    }
-    else if (strcmp (platform_vendor, CL_VENDOR_INTEL_SDK) == 0)
-    {
-      platform_vendor_id = VENDOR_ID_INTEL_SDK;
-    }
-    else if (strcmp (platform_vendor, CL_VENDOR_MESA) == 0)
-    {
-      platform_vendor_id = VENDOR_ID_MESA;
-    }
-    else if (strcmp (platform_vendor, CL_VENDOR_NV) == 0)
-    {
-      platform_vendor_id = VENDOR_ID_NV;
-    }
-    else if (strcmp (platform_vendor, CL_VENDOR_POCL) == 0)
-    {
-      platform_vendor_id = VENDOR_ID_POCL;
-    }
-    else
-    {
-      platform_vendor_id = VENDOR_ID_GENERIC;
-    }
-
-    bool platform_skipped = ((opencl_ctx->opencl_platforms_filter & (1ULL << platform_id)) == 0);
-
-    CL_rc = hc_clGetDeviceIDs (hashcat_ctx, platform, CL_DEVICE_TYPE_ALL, DEVICES_MAX, platform_devices, &platform_devices_cnt);
-
-    if (CL_rc == -1)
-    {
-      //event_log_error (hashcat_ctx, "clGetDeviceIDs(): %s", val2cstr_cl (CL_rc));
-
-      //return -1;
-
-      platform_skipped = true;
-    }
-
-    opencl_ctx->platforms_skipped[platform_id] = platform_skipped;
-
-    if (platform_skipped == true) continue;
-
-    if (user_options->force == false)
-    {
-      if (platform_vendor_id == VENDOR_ID_MESA)
-      {
-        event_log_error (hashcat_ctx, "Mesa (Gallium) OpenCL platform detected!");
-
-        event_log_warning (hashcat_ctx, "The Mesa platform can cause errors that are often mistaken for bugs in hashcat.");
-        event_log_warning (hashcat_ctx, "You are STRONGLY encouraged to use the drivers listed in docs/readme.txt.");
-        event_log_warning (hashcat_ctx, "You can use --force to override this, but do not report related errors.");
-        event_log_warning (hashcat_ctx, "You can also use --opencl-platforms to skip the Mesa platform(s).");
-        event_log_warning (hashcat_ctx, NULL);
-
-        return -1;
-      }
-    }
-
-    hc_device_param_t *devices_param = opencl_ctx->devices_param;
-
-    for (u32 platform_devices_id = 0; platform_devices_id < platform_devices_cnt; platform_devices_id++)
-    {
-      const u32 device_id = devices_cnt;
-
-      hc_device_param_t *device_param = &devices_param[device_id];
-
-      device_param->platform_vendor_id = platform_vendor_id;
-
-      device_param->device = platform_devices[platform_devices_id];
-
-      device_param->device_id = device_id;
-
-      device_param->platform_devices_id = platform_devices_id;
-
-      device_param->platform = platform;
-
-      // device_type
-
-      cl_device_type device_type;
-
-      CL_rc = hc_clGetDeviceInfo (hashcat_ctx, device_param->device, CL_DEVICE_TYPE, sizeof (device_type), &device_type, NULL);
-
-      if (CL_rc == -1) return -1;
-
-      device_type &= ~CL_DEVICE_TYPE_DEFAULT;
-
-      device_param->device_type = device_type;
-
-      // device_name
-
-      CL_rc = hc_clGetDeviceInfo (hashcat_ctx, device_param->device, CL_DEVICE_NAME, 0, NULL, &param_value_size);
-
-      if (CL_rc == -1) return -1;
-
-      char *device_name = (char *) hcmalloc (param_value_size);
-
-      CL_rc = hc_clGetDeviceInfo (hashcat_ctx, device_param->device, CL_DEVICE_NAME, param_value_size, device_name, NULL);
-
-      if (CL_rc == -1) return -1;
-
-      device_param->device_name = device_name;
-
-      hc_string_trim_leading (device_param->device_name);
-
-      hc_string_trim_trailing (device_param->device_name);
-
-      // device_vendor
-
-      CL_rc = hc_clGetDeviceInfo (hashcat_ctx, device_param->device, CL_DEVICE_VENDOR, 0, NULL, &param_value_size);
-
-      if (CL_rc == -1) return -1;
-
-      char *device_vendor = (char *) hcmalloc (param_value_size);
-
-      CL_rc = hc_clGetDeviceInfo (hashcat_ctx, device_param->device, CL_DEVICE_VENDOR, param_value_size, device_vendor, NULL);
-
-      if (CL_rc == -1) return -1;
-
-      device_param->device_vendor = device_vendor;
-
-      cl_uint device_vendor_id = 0;
-
-      if (strcmp (device_vendor, CL_VENDOR_AMD1) == 0)
-      {
-        device_vendor_id = VENDOR_ID_AMD;
-      }
-      else if (strcmp (device_vendor, CL_VENDOR_AMD2) == 0)
-      {
-        device_vendor_id = VENDOR_ID_AMD;
-      }
-      else if (strcmp (device_vendor, CL_VENDOR_AMD_USE_INTEL) == 0)
-      {
-        device_vendor_id = VENDOR_ID_AMD_USE_INTEL;
-      }
-      else if (strcmp (device_vendor, CL_VENDOR_APPLE) == 0)
-      {
-        device_vendor_id = VENDOR_ID_APPLE;
-      }
-      else if (strcmp (device_vendor, CL_VENDOR_APPLE_USE_AMD) == 0)
-      {
-        device_vendor_id = VENDOR_ID_AMD;
-      }
-      else if (strcmp (device_vendor, CL_VENDOR_APPLE_USE_NV) == 0)
-      {
-        device_vendor_id = VENDOR_ID_NV;
-      }
-      else if (strcmp (device_vendor, CL_VENDOR_APPLE_USE_INTEL) == 0)
-      {
-        device_vendor_id = VENDOR_ID_INTEL_SDK;
-      }
-      else if (strcmp (device_vendor, CL_VENDOR_INTEL_BEIGNET) == 0)
-      {
-        device_vendor_id = VENDOR_ID_INTEL_BEIGNET;
-      }
-      else if (strcmp (device_vendor, CL_VENDOR_INTEL_SDK) == 0)
-      {
-        device_vendor_id = VENDOR_ID_INTEL_SDK;
-      }
-      else if (strcmp (device_vendor, CL_VENDOR_MESA) == 0)
-      {
-        device_vendor_id = VENDOR_ID_MESA;
-      }
-      else if (strcmp (device_vendor, CL_VENDOR_NV) == 0)
-      {
-        device_vendor_id = VENDOR_ID_NV;
-      }
-      else if (strcmp (device_vendor, CL_VENDOR_POCL) == 0)
-      {
-        device_vendor_id = VENDOR_ID_POCL;
-      }
-      else
-      {
-        device_vendor_id = VENDOR_ID_GENERIC;
-      }
-
-      device_param->device_vendor_id = device_vendor_id;
-
-      // device_version
-
-      CL_rc = hc_clGetDeviceInfo (hashcat_ctx, device_param->device, CL_DEVICE_VERSION, 0, NULL, &param_value_size);
-
-      if (CL_rc == -1) return -1;
-
-      char *device_version = (char *) hcmalloc (param_value_size);
-
-      CL_rc = hc_clGetDeviceInfo (hashcat_ctx, device_param->device, CL_DEVICE_VERSION, param_value_size, device_version, NULL);
-
-      if (CL_rc == -1) return -1;
-
-      device_param->device_version = device_version;
-
-      // device_opencl_version
-
-      CL_rc = hc_clGetDeviceInfo (hashcat_ctx, device_param->device, CL_DEVICE_OPENCL_C_VERSION, 0, NULL, &param_value_size);
-
-      if (CL_rc == -1) return -1;
-
-      char *device_opencl_version = (char *) hcmalloc (param_value_size);
-
-      CL_rc = hc_clGetDeviceInfo (hashcat_ctx, device_param->device, CL_DEVICE_OPENCL_C_VERSION, param_value_size, device_opencl_version, NULL);
-
-      if (CL_rc == -1) return -1;
-
-      device_param->device_opencl_version = device_opencl_version;
-
-      // max_compute_units
-
-      cl_uint device_processors;
-
-      CL_rc = hc_clGetDeviceInfo (hashcat_ctx, device_param->device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof (device_processors), &device_processors, NULL);
-
-      if (CL_rc == -1) return -1;
-
-      device_param->device_processors = device_processors;
-
-      // device_global_mem
-
-      cl_ulong device_global_mem;
-
-      CL_rc = hc_clGetDeviceInfo (hashcat_ctx, device_param->device, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof (device_global_mem), &device_global_mem, NULL);
-
-      if (CL_rc == -1) return -1;
-
-      device_param->device_global_mem = device_global_mem;
-
-      device_param->device_available_mem = 0;
-
-      // device_maxmem_alloc
-
-      cl_ulong device_maxmem_alloc;
-
-      CL_rc = hc_clGetDeviceInfo (hashcat_ctx, device_param->device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof (device_maxmem_alloc), &device_maxmem_alloc, NULL);
-
-      if (CL_rc == -1) return -1;
-
-      device_param->device_maxmem_alloc = device_maxmem_alloc;
-
-      // note we'll limit to 2gb, otherwise this causes all kinds of weird errors because of possible integer overflows in opencl runtimes
-      // testwise disabling that
-      //device_param->device_maxmem_alloc = MIN (device_maxmem_alloc, 0x7fffffff);
-
-      // max_work_group_size
-
-      size_t device_maxworkgroup_size;
-
-      CL_rc = hc_clGetDeviceInfo (hashcat_ctx, device_param->device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof (device_maxworkgroup_size), &device_maxworkgroup_size, NULL);
-
-      if (CL_rc == -1) return -1;
-
-      device_param->device_maxworkgroup_size = device_maxworkgroup_size;
-
-      // max_clock_frequency
-
-      cl_uint device_maxclock_frequency;
-
-      CL_rc = hc_clGetDeviceInfo (hashcat_ctx, device_param->device, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof (device_maxclock_frequency), &device_maxclock_frequency, NULL);
-
-      if (CL_rc == -1) return -1;
-
-      device_param->device_maxclock_frequency = device_maxclock_frequency;
-
-      // device_endian_little
-
-      cl_bool device_endian_little;
-
-      CL_rc = hc_clGetDeviceInfo (hashcat_ctx, device_param->device, CL_DEVICE_ENDIAN_LITTLE, sizeof (device_endian_little), &device_endian_little, NULL);
-
-      if (CL_rc == -1) return -1;
-
-      if (device_endian_little == CL_FALSE)
-      {
-        event_log_error (hashcat_ctx, "* Device #%u: This device is not little-endian.", device_id + 1);
-
-        device_param->skipped = true;
-      }
-
-      // device_available
-
-      cl_bool device_available;
-
-      CL_rc = hc_clGetDeviceInfo (hashcat_ctx, device_param->device, CL_DEVICE_AVAILABLE, sizeof (device_available), &device_available, NULL);
-
-      if (CL_rc == -1) return -1;
-
-      if (device_available == CL_FALSE)
-      {
-        event_log_error (hashcat_ctx, "* Device #%u: This device is not available.", device_id + 1);
-
-        device_param->skipped = true;
-      }
-
-      // device_compiler_available
-
-      cl_bool device_compiler_available;
-
-      CL_rc = hc_clGetDeviceInfo (hashcat_ctx, device_param->device, CL_DEVICE_COMPILER_AVAILABLE, sizeof (device_compiler_available), &device_compiler_available, NULL);
-
-      if (CL_rc == -1) return -1;
-
-      if (device_compiler_available == CL_FALSE)
-      {
-        event_log_error (hashcat_ctx, "* Device #%u: No compiler is available for this device.", device_id + 1);
-
-        device_param->skipped = true;
-      }
-
-      // device_execution_capabilities
-
-      cl_device_exec_capabilities device_execution_capabilities;
-
-      CL_rc = hc_clGetDeviceInfo (hashcat_ctx, device_param->device, CL_DEVICE_EXECUTION_CAPABILITIES, sizeof (device_execution_capabilities), &device_execution_capabilities, NULL);
-
-      if (CL_rc == -1) return -1;
-
-      if ((device_execution_capabilities & CL_EXEC_KERNEL) == 0)
-      {
-        event_log_error (hashcat_ctx, "* Device #%u: This device does not support executing kernels.", device_id + 1);
-
-        device_param->skipped = true;
-      }
-
-      // device_extensions
-
-      size_t device_extensions_size;
-
-      CL_rc = hc_clGetDeviceInfo (hashcat_ctx, device_param->device, CL_DEVICE_EXTENSIONS, 0, NULL, &device_extensions_size);
-
-      if (CL_rc == -1) return -1;
-
-      char *device_extensions = hcmalloc (device_extensions_size + 1);
-
-      CL_rc = hc_clGetDeviceInfo (hashcat_ctx, device_param->device, CL_DEVICE_EXTENSIONS, device_extensions_size, device_extensions, NULL);
-
-      if (CL_rc == -1) return -1;
-
-      if (strstr (device_extensions, "base_atomics") == 0)
-      {
-        event_log_error (hashcat_ctx, "* Device #%u: This device does not support base atomics.", device_id + 1);
-
-        device_param->skipped = true;
-      }
-
-      if (strstr (device_extensions, "byte_addressable_store") == 0)
-      {
-        event_log_error (hashcat_ctx, "* Device #%u: This device does not support byte-addressable store.", device_id + 1);
-
-        device_param->skipped = true;
-      }
-
-      hcfree (device_extensions);
-
-      // device_max_constant_buffer_size
-
-      cl_ulong device_max_constant_buffer_size;
-
-      CL_rc = hc_clGetDeviceInfo (hashcat_ctx, device_param->device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof (device_max_constant_buffer_size), &device_max_constant_buffer_size, NULL);
-
-      if (CL_rc == -1) return -1;
-
-      if (device_max_constant_buffer_size < 65536)
-      {
-        event_log_error (hashcat_ctx, "* Device #%u: This device's constant buffer size is too small.", device_id + 1);
-
-        device_param->skipped = true;
-      }
-
-      // device_local_mem_size
-
-      cl_ulong device_local_mem_size;
-
-      CL_rc = hc_clGetDeviceInfo (hashcat_ctx, device_param->device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof (device_local_mem_size), &device_local_mem_size, NULL);
-
-      if (CL_rc == -1) return -1;
-
-      if (device_local_mem_size < 32768)
-      {
-        event_log_error (hashcat_ctx, "* Device #%u: This device's local mem size is too small.", device_id + 1);
-
-        device_param->skipped = true;
-      }
-
-      device_param->device_local_mem_size = device_local_mem_size;
-
-      // device_local_mem_type
-
-      cl_device_local_mem_type device_local_mem_type;
-
-      CL_rc = hc_clGetDeviceInfo (hashcat_ctx, device_param->device, CL_DEVICE_LOCAL_MEM_TYPE, sizeof (device_local_mem_type), &device_local_mem_type, NULL);
-
-      if (CL_rc == -1) return -1;
-
-      device_param->device_local_mem_type = device_local_mem_type;
-
-      // If there's both an Intel CPU and an AMD OpenCL runtime it's a tricky situation
-      // Both platforms support CPU device types and therefore both will try to use 100% of the physical resources
-      // This results in both utilizing it for 50%
-      // However, Intel has much better SIMD control over their own hardware
-      // It makes sense to give them full control over their own hardware
-
-      if (device_type & CL_DEVICE_TYPE_CPU)
-      {
-        if (device_param->device_vendor_id == VENDOR_ID_AMD_USE_INTEL)
-        {
-          if (user_options->force == false)
-          {
-            if (user_options->quiet == false) event_log_warning (hashcat_ctx, "* Device #%u: Not a native Intel OpenCL runtime. Expect massive speed loss.", device_id + 1);
-            if (user_options->quiet == false) event_log_warning (hashcat_ctx, "             You can use --force to override, but do not report related errors.");
-
-            device_param->skipped = true;
-          }
-        }
-      }
-
-      // Since some times we get reports from users about not working hashcat, dropping error messages like:
-      // CL_INVALID_COMMAND_QUEUE and CL_OUT_OF_RESOURCES
-      // Turns out that this is caused by Intel OpenCL runtime handling their GPU devices
-      // Disable such devices unless the user forces to use it
-
-      #if !defined (__APPLE__)
-      if (device_type & CL_DEVICE_TYPE_GPU)
-      {
-        if ((device_param->device_vendor_id == VENDOR_ID_INTEL_SDK) || (device_param->device_vendor_id == VENDOR_ID_INTEL_BEIGNET))
-        {
-          if (user_options->force == false)
-          {
-            if (user_options->quiet == false) event_log_warning (hashcat_ctx, "* Device #%u: Intel's OpenCL runtime (GPU only) is currently broken.", device_id + 1);
-            if (user_options->quiet == false) event_log_warning (hashcat_ctx, "             We are waiting for updated OpenCL drivers from Intel.");
-            if (user_options->quiet == false) event_log_warning (hashcat_ctx, "             You can use --force to override, but do not report related errors.");
-
-            device_param->skipped = true;
-          }
-        }
-      }
-      #endif // __APPLE__
-
-      // skipped
-
-      if ((opencl_ctx->devices_filter & (1ULL << device_id)) == 0)
-      {
-        device_param->skipped = true;
-      }
-
-      if ((opencl_ctx->device_types_filter & (device_type)) == 0)
-      {
-        device_param->skipped = true;
-      }
-
-      // driver_version
-
-      CL_rc = hc_clGetDeviceInfo (hashcat_ctx, device_param->device, CL_DRIVER_VERSION, 0, NULL, &param_value_size);
-
-      if (CL_rc == -1) return -1;
-
-      char *driver_version = (char *) hcmalloc (param_value_size);
-
-      CL_rc = hc_clGetDeviceInfo (hashcat_ctx, device_param->device, CL_DRIVER_VERSION, param_value_size, driver_version, NULL);
-
-      if (CL_rc == -1) return -1;
-
-      device_param->driver_version = driver_version;
-
-      // vendor specific
-
-      if (device_param->device_type & CL_DEVICE_TYPE_GPU)
-      {
-        if ((device_param->platform_vendor_id == VENDOR_ID_AMD) && (device_param->device_vendor_id == VENDOR_ID_AMD))
-        {
-          need_adl = true;
-
-          #if defined (__linux__)
-          need_sysfs = true;
-          #endif
-        }
-
-        if ((device_param->platform_vendor_id == VENDOR_ID_NV) && (device_param->device_vendor_id == VENDOR_ID_NV))
-        {
-          need_nvml = true;
-
-          #if defined (_WIN) || defined (__CYGWIN__)
-          need_nvapi = true;
-          #endif
-        }
-      }
-
-      if (device_param->device_type & CL_DEVICE_TYPE_GPU)
-      {
-        if ((device_param->platform_vendor_id == VENDOR_ID_AMD) && (device_param->device_vendor_id == VENDOR_ID_AMD))
-        {
-          cl_device_topology_amd amdtopo;
-
-          CL_rc = hc_clGetDeviceInfo (hashcat_ctx, device_param->device, CL_DEVICE_TOPOLOGY_AMD, sizeof (amdtopo), &amdtopo, NULL);
-
-          if (CL_rc == -1) return -1;
-
-          device_param->pcie_bus      = amdtopo.pcie.bus;
-          device_param->pcie_device   = amdtopo.pcie.device;
-          device_param->pcie_function = amdtopo.pcie.function;
-        }
-
-        if ((device_param->platform_vendor_id == VENDOR_ID_NV) && (device_param->device_vendor_id == VENDOR_ID_NV))
-        {
-          cl_uint pci_bus_id_nv;  // is cl_uint the right type for them??
-          cl_uint pci_slot_id_nv;
-
-          CL_rc = hc_clGetDeviceInfo (hashcat_ctx, device_param->device, CL_DEVICE_PCI_BUS_ID_NV, sizeof (pci_bus_id_nv), &pci_bus_id_nv, NULL);
-
-          if (CL_rc == -1) return -1;
-
-          CL_rc = hc_clGetDeviceInfo (hashcat_ctx, device_param->device, CL_DEVICE_PCI_SLOT_ID_NV, sizeof (pci_slot_id_nv), &pci_slot_id_nv, NULL);
-
-          if (CL_rc == -1) return -1;
-
-          device_param->pcie_bus      = (u8) (pci_bus_id_nv);
-          device_param->pcie_device   = (u8) (pci_slot_id_nv >> 3);
-          device_param->pcie_function = (u8) (pci_slot_id_nv & 7);
-
-          cl_uint sm_minor = 0;
-          cl_uint sm_major = 0;
-
-          CL_rc = hc_clGetDeviceInfo (hashcat_ctx, device_param->device, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, sizeof (sm_minor), &sm_minor, NULL);
-
-          if (CL_rc == -1) return -1;
-
-          CL_rc = hc_clGetDeviceInfo (hashcat_ctx, device_param->device, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, sizeof (sm_major), &sm_major, NULL);
-
-          if (CL_rc == -1) return -1;
-
-          device_param->sm_minor = sm_minor;
-          device_param->sm_major = sm_major;
-
-          cl_uint kernel_exec_timeout = 0;
-
-          CL_rc = hc_clGetDeviceInfo (hashcat_ctx, device_param->device, CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV, sizeof (kernel_exec_timeout), &kernel_exec_timeout, NULL);
-
-          if (CL_rc == -1) return -1;
-
-          device_param->kernel_exec_timeout = kernel_exec_timeout;
-
-          // CPU burning loop damper
-          // Value is given as number between 0-100
-          // By default 8%
-
-          device_param->spin_damp = (double) user_options->spin_damp / 100;
-        }
-      }
-
-      // common driver check
-
-      if (device_param->skipped == false)
-      {
-        if ((user_options->force == false) && (user_options->opencl_info == false))
-        {
-          if (device_type & CL_DEVICE_TYPE_CPU)
-          {
-            if (device_param->platform_vendor_id == VENDOR_ID_INTEL_SDK)
-            {
-              bool intel_warn = false;
-
-              // Intel OpenCL runtime 18
-
-              int opencl_driver1 = 0;
-              int opencl_driver2 = 0;
-              int opencl_driver3 = 0;
-              int opencl_driver4 = 0;
-
-              const int res18 = sscanf (device_param->driver_version, "%u.%u.%u.%u", &opencl_driver1, &opencl_driver2, &opencl_driver3, &opencl_driver4);
-
-              if (res18 == 4)
-              {
-                // so far all versions 18 are ok
-              }
-              else
-              {
-                // Intel OpenCL runtime 16
-
-                float opencl_version = 0;
-                int   opencl_build   = 0;
-
-                const int res16 = sscanf (device_param->device_version, "OpenCL %f (Build %d)", &opencl_version, &opencl_build);
-
-                if (res16 == 2)
-                {
-                  if (opencl_build < 25) intel_warn = true;
-                }
-              }
-
-              if (intel_warn == true)
-              {
-                event_log_error (hashcat_ctx, "* Device #%u: Outdated or broken Intel OpenCL runtime '%s' detected!", device_id + 1, device_param->driver_version);
-
-                event_log_warning (hashcat_ctx, "You are STRONGLY encouraged to use the officially supported NVIDIA driver.");
-                event_log_warning (hashcat_ctx, "See hashcat.net for officially supported NVIDIA drivers.");
-                event_log_warning (hashcat_ctx, "See also: https://hashcat.net/faq/wrongdriver");
-                event_log_warning (hashcat_ctx, "You can use --force to override this, but do not report related errors.");
-                event_log_warning (hashcat_ctx, NULL);
-
-                return -1;
-              }
-            }
-          }
-          else if (device_type & CL_DEVICE_TYPE_GPU)
-          {
-            if (device_param->platform_vendor_id == VENDOR_ID_AMD)
-            {
-              bool amd_warn = true;
-
-              #if defined (__linux__)
-              // AMDGPU-PRO Driver 16.40 and higher
-              if (strtoul (device_param->driver_version, NULL, 10) >= 2117) amd_warn = false;
-              // AMDGPU-PRO Driver 16.50 is known to be broken
-              if (strtoul (device_param->driver_version, NULL, 10) == 2236) amd_warn = true;
-              // AMDGPU-PRO Driver 16.60 is known to be broken
-              if (strtoul (device_param->driver_version, NULL, 10) == 2264) amd_warn = true;
-              // AMDGPU-PRO Driver 17.10 is known to be broken
-              if (strtoul (device_param->driver_version, NULL, 10) == 2348) amd_warn = true;
-              // AMDGPU-PRO Driver 17.20 (2416) is fine, doesn't need check will match >= 2117
-              #elif defined (_WIN)
-              // AMD Radeon Software 14.9 and higher, should be updated to 15.12
-              if (strtoul (device_param->driver_version, NULL, 10) >= 1573) amd_warn = false;
-              #else
-              // we have no information about other os
-              if (amd_warn == true) amd_warn = false;
-              #endif
-
-              if (amd_warn == true)
-              {
-                event_log_error (hashcat_ctx, "* Device #%u: Outdated or broken AMD driver '%s' detected!", device_id + 1, device_param->driver_version);
-
-                event_log_warning (hashcat_ctx, "You are STRONGLY encouraged to use the officially supported AMD driver.");
-                event_log_warning (hashcat_ctx, "See hashcat.net for officially supported AMD drivers.");
-                event_log_warning (hashcat_ctx, "See also: https://hashcat.net/faq/wrongdriver");
-                event_log_warning (hashcat_ctx, "You can use --force to override this, but do not report related errors.");
-                event_log_warning (hashcat_ctx, NULL);
-
-                return -1;
-              }
-            }
-
-            if (device_param->platform_vendor_id == VENDOR_ID_NV)
-            {
-              int nv_warn = true;
-
-              int version_maj = 0;
-              int version_min = 0;
-
-              const int r = sscanf (device_param->driver_version, "%d.%d", &version_maj, &version_min);
-
-              if (r == 2)
-              {
-                if (version_maj >= 367)
-                {
-                  if (version_maj == 418)
-                  {
-                    // older 418.x versions are known to be broken.
-                    // for instance, NVIDIA-Linux-x86_64-418.43.run
-                    // run ./hashcat -b -m 2501 results in self-test fail
-
-                    if (version_min >= 56)
-                    {
-                      nv_warn = false;
-                    }
-                  }
-                  else
-                  {
-                    nv_warn = false;
-                  }
-                }
-              }
-              else
-              {
-                // unknown version scheme, probably new driver version
-
-                nv_warn = false;
-              }
-
-              if (nv_warn == true)
-              {
-                event_log_error (hashcat_ctx, "* Device #%u: Outdated or broken NVIDIA driver '%s' detected!", device_id + 1, device_param->driver_version);
-
-                event_log_warning (hashcat_ctx, "You are STRONGLY encouraged to use the officially supported NVIDIA driver.");
-                event_log_warning (hashcat_ctx, "See hashcat's homepage for officially supported NVIDIA drivers.");
-                event_log_warning (hashcat_ctx, "See also: https://hashcat.net/faq/wrongdriver");
-                event_log_warning (hashcat_ctx, "You can use --force to override this, but do not report related errors.");
-                event_log_warning (hashcat_ctx, NULL);
-
-                return -1;
-              }
-
-              if (device_param->sm_major < 5)
-              {
-                if (user_options->quiet == false) event_log_warning (hashcat_ctx, "* Device #%u: This hardware has outdated CUDA compute capability (%u.%u).", device_id + 1, device_param->sm_major, device_param->sm_minor);
-                if (user_options->quiet == false) event_log_warning (hashcat_ctx, "             For modern OpenCL performance, upgrade to hardware that supports");
-                if (user_options->quiet == false) event_log_warning (hashcat_ctx, "             CUDA compute capability version 5.0 (Maxwell) or higher.");
-              }
-
-              if (device_param->kernel_exec_timeout != 0)
-              {
-                if (user_options->quiet == false) event_log_warning (hashcat_ctx, "* Device #%u: WARNING! Kernel exec timeout is not disabled.", device_id + 1);
-                if (user_options->quiet == false) event_log_warning (hashcat_ctx, "             This may cause \"CL_OUT_OF_RESOURCES\" or related errors.");
-                if (user_options->quiet == false) event_log_warning (hashcat_ctx, "             To disable the timeout, see: https://hashcat.net/q/timeoutpatch");
-              }
-            }
-
-            if ((strstr (device_param->device_opencl_version, "beignet")) || (strstr (device_param->device_version, "beignet")))
-            {
-              event_log_error (hashcat_ctx, "* Device #%u: Intel beignet driver detected!", device_id + 1);
-
-              event_log_warning (hashcat_ctx, "The beignet driver has been marked as likely to fail kernel compilation.");
-              event_log_warning (hashcat_ctx, "You can use --force to override this, but do not report related errors.");
-              event_log_warning (hashcat_ctx, NULL);
-
-              return -1;
-            }
-          }
-        }
-
-        /**
-         * activate device
-         */
-
-        devices_active++;
-      }
-
-      /**
-       * create context for each device
-       */
-
-      cl_context context;
-
-      cl_context_properties properties[3];
-
-      properties[0] = CL_CONTEXT_PLATFORM;
-      properties[1] = (cl_context_properties) device_param->platform;
-      properties[2] = 0;
-
-      CL_rc = hc_clCreateContext (hashcat_ctx, properties, 1, &device_param->device, NULL, NULL, &context);
-
-      if (CL_rc == -1) return -1;
-
-      /**
-       * create command-queue
-       */
-
-      cl_command_queue command_queue;
-
-      CL_rc = hc_clCreateCommandQueue (hashcat_ctx, context, device_param->device, 0, &command_queue);
-
-      if (CL_rc == -1) return -1;
-
-      if ((device_param->device_type & CL_DEVICE_TYPE_GPU) && (device_param->platform_vendor_id == VENDOR_ID_AMD))
-      {
-        const bool has_vadd3 = test_instruction (hashcat_ctx, context, device_param->device, "__kernel void test () { uint r; __asm__ __volatile__ (\"V_ADD3_U32 %0, 0, 0, 0;\" : \"=v\"(r)); }");
-
-        device_param->has_vadd3 = has_vadd3;
-
-        const bool has_vbfe = test_instruction (hashcat_ctx, context, device_param->device, "__kernel void test () { uint r; __asm__ __volatile__ (\"V_BFE_U32 %0, 0, 0, 0;\" : \"=v\"(r)); }");
-
-        device_param->has_vbfe = has_vbfe;
-
-        const bool has_vperm = test_instruction (hashcat_ctx, context, device_param->device, "__kernel void test () { uint r; __asm__ __volatile__ (\"V_PERM_B32 %0, 0, 0, 0;\" : \"=v\"(r)); }");
-
-        device_param->has_vperm = has_vperm;
-      }
-
-      if ((device_param->device_type & CL_DEVICE_TYPE_GPU) && (device_param->platform_vendor_id == VENDOR_ID_NV))
-      {
-        const bool has_bfe = test_instruction (hashcat_ctx, context, device_param->device, "__kernel void test () { uint r; asm volatile (\"bfe.u32 %0, 0, 0, 0;\" : \"=r\"(r)); }");
-
-        device_param->has_bfe = has_bfe;
-
-        const bool has_lop3 = test_instruction (hashcat_ctx, context, device_param->device, "__kernel void test () { uint r; asm volatile (\"lop3.b32 %0, 0, 0, 0, 0;\" : \"=r\"(r)); }");
-
-        device_param->has_lop3 = has_lop3;
-
-        const bool has_mov64 = test_instruction (hashcat_ctx, context, device_param->device, "__kernel void test () { ulong r; uint a; uint b; asm volatile (\"mov.b64 %0, {%1, %2};\" : \"=l\"(r) : \"r\"(a), \"r\"(b)); }");
-
-        device_param->has_mov64 = has_mov64;
-
-        const bool has_prmt = test_instruction (hashcat_ctx, context, device_param->device, "__kernel void test () { uint r; asm volatile (\"prmt.b32 %0, 0, 0, 0;\" : \"=r\"(r)); }");
-
-        device_param->has_prmt = has_prmt;
-      }
-
-      // device_available_mem
-
-      #define MAX_ALLOC_CHECKS_CNT  8192
-      #define MAX_ALLOC_CHECKS_SIZE (64 * 1024 * 1024)
-
-      device_param->device_available_mem = device_param->device_global_mem - MAX_ALLOC_CHECKS_SIZE;
-
-      #if defined (_WIN)
-      if ((device_param->device_type & CL_DEVICE_TYPE_GPU) && (device_param->platform_vendor_id == VENDOR_ID_NV))
-      #else
-      if ((device_param->device_type & CL_DEVICE_TYPE_GPU) && ((device_param->platform_vendor_id == VENDOR_ID_NV) || (device_param->platform_vendor_id == VENDOR_ID_AMD)))
-      #endif
-      {
-        // OK, so the problem here is the following:
-        // There's just CL_DEVICE_GLOBAL_MEM_SIZE to ask OpenCL about the total memory on the device,
-        // but there's no way to ask for available memory on the device.
-        // In combination, most OpenCL runtimes implementation of clCreateBuffer()
-        // are doing so called lazy memory allocation on the device.
-        // Now, if the user has X11 (or a game or anything that takes a lot of GPU memory)
-        // running on the host we end up with an error type of this:
-        // clEnqueueNDRangeKernel(): CL_MEM_OBJECT_ALLOCATION_FAILURE
-        // The clEnqueueNDRangeKernel() is because of the lazy allocation
-        // The best way to workaround this problem is if we would be able to ask for available memory,
-        // The idea here is to try to evaluate available memory by allocating it till it errors
-
-        cl_mem *tmp_device = (cl_mem *) hccalloc (MAX_ALLOC_CHECKS_CNT, sizeof (cl_mem));
-
-        u64 c;
-
-        for (c = 0; c < MAX_ALLOC_CHECKS_CNT; c++)
-        {
-          if (((c + 1 + 1) * MAX_ALLOC_CHECKS_SIZE) >= device_param->device_global_mem) break;
-
-          cl_int CL_err;
-
-          OCL_PTR *ocl = opencl_ctx->ocl;
-
-          tmp_device[c] = ocl->clCreateBuffer (context, CL_MEM_READ_WRITE, MAX_ALLOC_CHECKS_SIZE, NULL, &CL_err);
-
-          if (CL_err != CL_SUCCESS)
-          {
-            c--;
-
-            break;
-          }
-
-          // transfer only a few byte should be enough to force the runtime to actually allocate the memory
-
-          u8 tmp_host[8];
-
-          CL_err = ocl->clEnqueueReadBuffer  (command_queue, tmp_device[c], CL_TRUE, 0, sizeof (tmp_host), tmp_host, 0, NULL, NULL);
-
-          if (CL_err != CL_SUCCESS) break;
-
-          CL_err = ocl->clEnqueueWriteBuffer (command_queue, tmp_device[c], CL_TRUE, 0, sizeof (tmp_host), tmp_host, 0, NULL, NULL);
-
-          if (CL_err != CL_SUCCESS) break;
-
-          CL_err = ocl->clEnqueueReadBuffer  (command_queue, tmp_device[c], CL_TRUE, MAX_ALLOC_CHECKS_SIZE - sizeof (tmp_host), sizeof (tmp_host), tmp_host, 0, NULL, NULL);
-
-          if (CL_err != CL_SUCCESS) break;
-
-          CL_err = ocl->clEnqueueWriteBuffer (command_queue, tmp_device[c], CL_TRUE, MAX_ALLOC_CHECKS_SIZE - sizeof (tmp_host), sizeof (tmp_host), tmp_host, 0, NULL, NULL);
-
-          if (CL_err != CL_SUCCESS) break;
-        }
-
-        device_param->device_available_mem = c * MAX_ALLOC_CHECKS_SIZE;
-
-        // clean up
-
-        for (c = 0; c < MAX_ALLOC_CHECKS_CNT; c++)
-        {
-          if (((c + 1 + 1) * MAX_ALLOC_CHECKS_SIZE) >= device_param->device_global_mem) break;
-
-          if (tmp_device[c] != NULL)
-          {
-            CL_rc = hc_clReleaseMemObject (hashcat_ctx, tmp_device[c]);
-
-            if (CL_rc == -1) return -1;
-          }
-        }
-
-        hcfree (tmp_device);
-      }
-
-      hc_clReleaseCommandQueue (hashcat_ctx, command_queue);
-
-      hc_clReleaseContext (hashcat_ctx, context);
-
-      // next please
-
-      devices_cnt++;
-    }
-  }
-
-  if (devices_active == 0)
-  {
-    event_log_error (hashcat_ctx, "No devices found/left.");
-
-    return -1;
-  }
-
-  // additional check to see if the user has chosen a device that is not within the range of available devices (i.e. larger than devices_cnt)
-
-  if (opencl_ctx->devices_filter != (u64) -1)
-  {
-    const u64 devices_cnt_mask = ~(((u64) -1 >> devices_cnt) << devices_cnt);
-
-    if (opencl_ctx->devices_filter > devices_cnt_mask)
-    {
-      event_log_error (hashcat_ctx, "An invalid device was specified using the --opencl-devices parameter.");
-      event_log_error (hashcat_ctx, "The specified device was higher than the number of available devices (%u).", devices_cnt);
-
-      return -1;
-    }
-  }
-
-  opencl_ctx->target_msec     = TARGET_MSEC_PROFILE[user_options->workload_profile - 1];
-
-  opencl_ctx->devices_cnt     = devices_cnt;
-  opencl_ctx->devices_active  = devices_active;
-
-  opencl_ctx->need_adl        = need_adl;
-  opencl_ctx->need_nvml       = need_nvml;
-  opencl_ctx->need_nvapi      = need_nvapi;
-  opencl_ctx->need_sysfs      = need_sysfs;
-
-  opencl_ctx->comptime        = comptime;
-
-  return 0;
-}
-
-void opencl_ctx_devices_destroy (hashcat_ctx_t *hashcat_ctx)
-{
-  opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
-
-  if (opencl_ctx->enabled == false) return;
-
-  for (u32 platform_id = 0; platform_id < opencl_ctx->platforms_cnt; platform_id++)
-  {
-    hcfree (opencl_ctx->platforms_vendor[platform_id]);
-    hcfree (opencl_ctx->platforms_name[platform_id]);
-    hcfree (opencl_ctx->platforms_version[platform_id]);
-  }
-
-  for (u32 device_id = 0; device_id < opencl_ctx->devices_cnt; device_id++)
-  {
-    hc_device_param_t *device_param = &opencl_ctx->devices_param[device_id];
-
-    if (device_param->skipped == true) continue;
-
-    hcfree (device_param->device_name);
-    hcfree (device_param->device_version);
-    hcfree (device_param->driver_version);
-    hcfree (device_param->device_opencl_version);
-    hcfree (device_param->device_vendor);
-  }
-
-  opencl_ctx->devices_cnt    = 0;
-  opencl_ctx->devices_active = 0;
-
-  opencl_ctx->need_adl    = false;
-  opencl_ctx->need_nvml   = false;
-  opencl_ctx->need_nvapi  = false;
-  opencl_ctx->need_sysfs  = false;
-}
-
-static bool is_same_device_type (const hc_device_param_t *src, const hc_device_param_t *dst)
-{
-  if (strcmp (src->device_name,    dst->device_name)    != 0) return false;
-  if (strcmp (src->device_vendor,  dst->device_vendor)  != 0) return false;
-  if (strcmp (src->device_version, dst->device_version) != 0) return false;
-  if (strcmp (src->driver_version, dst->driver_version) != 0) return false;
-
-  if (src->device_processors         != dst->device_processors)         return false;
-  if (src->device_maxclock_frequency != dst->device_maxclock_frequency) return false;
-  if (src->device_maxworkgroup_size  != dst->device_maxworkgroup_size)  return false;
-
-  // memory size can be different, depending on which gpu has a monitor connected
-  // if (src->device_maxmem_alloc       != dst->device_maxmem_alloc)       return false;
-  // if (src->device_global_mem         != dst->device_global_mem)         return false;
-
-  if (src->sm_major != dst->sm_major) return false;
-  if (src->sm_minor != dst->sm_minor) return false;
-
-  if (src->kernel_exec_timeout != dst->kernel_exec_timeout) return false;
-
-  return true;
-}
-
-void opencl_ctx_devices_sync_tuning (hashcat_ctx_t *hashcat_ctx)
-{
-  opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
-
-  if (opencl_ctx->enabled == false) return;
-
-  for (u32 device_id_src = 0; device_id_src < opencl_ctx->devices_cnt; device_id_src++)
-  {
-    hc_device_param_t *device_param_src = &opencl_ctx->devices_param[device_id_src];
-
-    if (device_param_src->skipped == true) continue;
-
-    if (device_param_src->skipped_warning == true) continue;
-
-    for (u32 device_id_dst = device_id_src; device_id_dst < opencl_ctx->devices_cnt; device_id_dst++)
-    {
-      hc_device_param_t *device_param_dst = &opencl_ctx->devices_param[device_id_dst];
-
-      if (device_param_dst->skipped == true) continue;
-
-      if (device_param_dst->skipped_warning == true) continue;
-
-      if (is_same_device_type (device_param_src, device_param_dst) == false) continue;
-
-      device_param_dst->kernel_accel   = device_param_src->kernel_accel;
-      device_param_dst->kernel_loops   = device_param_src->kernel_loops;
-      device_param_dst->kernel_threads = device_param_src->kernel_threads;
-
-      const u32 hardware_power = device_param_dst->device_processors * device_param_dst->kernel_threads;
-
-      device_param_dst->hardware_power = hardware_power;
-
-      const u32 kernel_power = device_param_dst->hardware_power * device_param_dst->kernel_accel;
-
-      device_param_dst->kernel_power = kernel_power;
-    }
-  }
-}
-
-void opencl_ctx_devices_update_power (hashcat_ctx_t *hashcat_ctx)
-{
-  opencl_ctx_t         *opencl_ctx          = hashcat_ctx->opencl_ctx;
-  status_ctx_t         *status_ctx          = hashcat_ctx->status_ctx;
-  user_options_extra_t *user_options_extra  = hashcat_ctx->user_options_extra;
-  user_options_t       *user_options        = hashcat_ctx->user_options;
-
-  if (opencl_ctx->enabled == false) return;
-
-  u32 kernel_power_all = 0;
-
-  for (u32 device_id = 0; device_id < opencl_ctx->devices_cnt; device_id++)
-  {
-    hc_device_param_t *device_param = &opencl_ctx->devices_param[device_id];
-
-    if (device_param->skipped == true) continue;
-
-    if (device_param->skipped_warning == true) continue;
-
-    kernel_power_all += device_param->kernel_power;
-  }
-
-  opencl_ctx->kernel_power_all = kernel_power_all;
-
-  /*
-   * Inform user about possible slow speeds
-   */
-
-  if ((user_options_extra->wordlist_mode == WL_MODE_FILE) || (user_options_extra->wordlist_mode == WL_MODE_MASK))
-  {
-    if (status_ctx->words_base < kernel_power_all)
-    {
-      if (user_options->quiet == false)
-      {
-        event_log_advice (hashcat_ctx, "The wordlist or mask that you are using is too small.");
-        event_log_advice (hashcat_ctx, "This means that hashcat cannot use the full parallel power of your device(s).");
-        event_log_advice (hashcat_ctx, "Unless you supply more work, your cracking speed will drop.");
-        event_log_advice (hashcat_ctx, "For tips on supplying more work, see: https://hashcat.net/faq/morework");
-        event_log_advice (hashcat_ctx, NULL);
-      }
-    }
-  }
-}
-
-void opencl_ctx_devices_kernel_loops (hashcat_ctx_t *hashcat_ctx)
-{
-  combinator_ctx_t     *combinator_ctx      = hashcat_ctx->combinator_ctx;
-  hashconfig_t         *hashconfig          = hashcat_ctx->hashconfig;
-  hashes_t             *hashes              = hashcat_ctx->hashes;
-  mask_ctx_t           *mask_ctx            = hashcat_ctx->mask_ctx;
-  opencl_ctx_t         *opencl_ctx          = hashcat_ctx->opencl_ctx;
-  straight_ctx_t       *straight_ctx        = hashcat_ctx->straight_ctx;
-  user_options_t       *user_options        = hashcat_ctx->user_options;
-  user_options_extra_t *user_options_extra  = hashcat_ctx->user_options_extra;
-
-  if (opencl_ctx->enabled == false) return;
-
-  for (u32 device_id = 0; device_id < opencl_ctx->devices_cnt; device_id++)
-  {
-    hc_device_param_t *device_param = &opencl_ctx->devices_param[device_id];
-
-    if (device_param->skipped == true) continue;
-
-    if (device_param->skipped_warning == true) continue;
-
-    device_param->kernel_loops_min = device_param->kernel_loops_min_sav;
-    device_param->kernel_loops_max = device_param->kernel_loops_max_sav;
-
-    if (device_param->kernel_loops_min < device_param->kernel_loops_max)
-    {
-      u32 innerloop_cnt = 0;
-
-      if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
-      {
-        if (user_options->slow_candidates == true)
-        {
-          innerloop_cnt = 1;
-        }
-        else
-        {
-          if      (user_options_extra->attack_kern == ATTACK_KERN_STRAIGHT)  innerloop_cnt = MIN (KERNEL_RULES, (u32) straight_ctx->kernel_rules_cnt);
-          else if (user_options_extra->attack_kern == ATTACK_KERN_COMBI)     innerloop_cnt = MIN (KERNEL_COMBS, (u32) combinator_ctx->combs_cnt);
-          else if (user_options_extra->attack_kern == ATTACK_KERN_BF)        innerloop_cnt = MIN (KERNEL_BFS,   (u32) mask_ctx->bfs_cnt);
-        }
-      }
-      else
-      {
-        innerloop_cnt = hashes->salts_buf[0].salt_iter;
-      }
-
-      if ((innerloop_cnt >= device_param->kernel_loops_min) &&
-          (innerloop_cnt <= device_param->kernel_loops_max))
-      {
-        device_param->kernel_loops_max = innerloop_cnt;
-      }
-    }
-  }
-}
-
-static int get_kernel_wgs (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, cl_kernel kernel, u32 *result)
-{
-  int CL_rc;
-
-  size_t work_group_size;
-
-  CL_rc = hc_clGetKernelWorkGroupInfo (hashcat_ctx, kernel, device_param->device, CL_KERNEL_WORK_GROUP_SIZE, sizeof (work_group_size), &work_group_size, NULL);
-
-  if (CL_rc == -1) return -1;
-
-  u32 kernel_threads = (u32) work_group_size;
-
-  size_t compile_work_group_size[3];
-
-  CL_rc = hc_clGetKernelWorkGroupInfo (hashcat_ctx, kernel, device_param->device, CL_KERNEL_COMPILE_WORK_GROUP_SIZE, sizeof (compile_work_group_size), &compile_work_group_size, NULL);
-
-  if (CL_rc == -1) return -1;
-
-  const size_t cwgs_total = compile_work_group_size[0] * compile_work_group_size[1] * compile_work_group_size[2];
-
-  if (cwgs_total > 0)
-  {
-    kernel_threads = MIN (kernel_threads, (u32) cwgs_total);
-  }
-
-  *result = kernel_threads;
-
-  return 0;
-}
-
-static int get_kernel_preferred_wgs_multiple (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, cl_kernel kernel, u32 *result)
-{
-  int CL_rc;
-
-  size_t preferred_work_group_size_multiple;
-
-  CL_rc = hc_clGetKernelWorkGroupInfo (hashcat_ctx, kernel, device_param->device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof (preferred_work_group_size_multiple), &preferred_work_group_size_multiple, NULL);
-
-  if (CL_rc == -1) return -1;
-
-  *result = (u32) preferred_work_group_size_multiple;
-
-  return 0;
-}
-
-static int get_kernel_local_mem_size (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, cl_kernel kernel, u64 *result)
-{
-  int CL_rc;
-
-  cl_ulong local_mem_size;
-
-  CL_rc = hc_clGetKernelWorkGroupInfo (hashcat_ctx, kernel, device_param->device, CL_KERNEL_LOCAL_MEM_SIZE, sizeof (local_mem_size), &local_mem_size, NULL);
-
-  if (CL_rc == -1) return -1;
-
-  *result = local_mem_size;
-
-  return 0;
-}
-
-static u32 get_kernel_threads (hashcat_ctx_t *hashcat_ctx, const hc_device_param_t *device_param)
-{
-  const hashconfig_t *hashconfig = hashcat_ctx->hashconfig;
-
-  // a module can force a fixed value
-
-  u32 kernel_threads_min = device_param->kernel_threads_min;
-  u32 kernel_threads_max = device_param->kernel_threads_max;
-
-  // for CPU we just do 1 ...
-
-  if (device_param->device_type & CL_DEVICE_TYPE_CPU)
-  {
-    if ((1 >= kernel_threads_min) && (1 <= kernel_threads_max))
-    {
-      kernel_threads_min = 1;
-      kernel_threads_max = 1;
-    }
-  }
-
-  // this is an upper limit, a good start, since our strategy is to reduce thread counts only
-
-  const u32 device_maxworkgroup_size = (u32) device_param->device_maxworkgroup_size;
-
-  if (device_maxworkgroup_size < kernel_threads_max)
-  {
-    kernel_threads_max = device_maxworkgroup_size;
-  }
-
-  u32 kernel_threads = kernel_threads_max;
-
-  // complicated kernel tend to confuse OpenCL runtime suggestions for maximum thread size
-  // let's workaround that by sticking to their device specific preferred thread size
-
-  if (hashconfig->opts_type & OPTS_TYPE_PREFERED_THREAD)
-  {
-    if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
-    {
-      if (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL)
-      {
-        if (device_param->kernel_preferred_wgs_multiple1)
-        {
-          const u32 kernel_preferred_wgs_multiple1 = device_param->kernel_preferred_wgs_multiple1;
-
-          if ((kernel_preferred_wgs_multiple1 >= kernel_threads_min) && (kernel_preferred_wgs_multiple1 <= kernel_threads_max))
-          {
-            kernel_threads = kernel_preferred_wgs_multiple1;
-          }
-        }
-      }
-      else
-      {
-        if (device_param->kernel_preferred_wgs_multiple4)
-        {
-          const u32 kernel_preferred_wgs_multiple4 = device_param->kernel_preferred_wgs_multiple4;
-
-          if ((kernel_preferred_wgs_multiple4 >= kernel_threads_min) && (kernel_preferred_wgs_multiple4 <= kernel_threads_max))
-          {
-            kernel_threads = kernel_preferred_wgs_multiple4;
-          }
-        }
-      }
-    }
-    else
-    {
-      if (device_param->kernel_preferred_wgs_multiple2)
-      {
-        const u32 kernel_preferred_wgs_multiple2 = device_param->kernel_preferred_wgs_multiple2;
-
-        if ((kernel_preferred_wgs_multiple2 >= kernel_threads_min) && (kernel_preferred_wgs_multiple2 <= kernel_threads_max))
-        {
-          kernel_threads = kernel_preferred_wgs_multiple2;
-        }
-      }
-    }
-  }
-  else
-  {
-    if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
-    {
-      if (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL)
-      {
-        if (device_param->kernel_preferred_wgs_multiple1)
-        {
-          const u32 kernel_preferred_wgs_multiple1 = device_param->kernel_preferred_wgs_multiple1;
-
-          if ((kernel_preferred_wgs_multiple1 >= kernel_threads_min) && (kernel_preferred_wgs_multiple1 <= kernel_threads_max))
-          {
-            kernel_threads = kernel_preferred_wgs_multiple1;
-          }
-        }
-      }
-      else
-      {
-        if (device_param->kernel_preferred_wgs_multiple4)
-        {
-          const u32 kernel_preferred_wgs_multiple4 = device_param->kernel_preferred_wgs_multiple4;
-
-          if ((kernel_preferred_wgs_multiple4 >= kernel_threads_min) && (kernel_preferred_wgs_multiple4 <= kernel_threads_max))
-          {
-            kernel_threads = kernel_preferred_wgs_multiple4;
-          }
-        }
-      }
-    }
-    else
-    {
-      if (device_param->kernel_preferred_wgs_multiple2)
-      {
-        const u32 kernel_preferred_wgs_multiple2 = device_param->kernel_preferred_wgs_multiple2;
-
-        if ((kernel_preferred_wgs_multiple2 >= kernel_threads_min) && (kernel_preferred_wgs_multiple2 <= kernel_threads_max))
-        {
-          kernel_threads = kernel_preferred_wgs_multiple2;
-        }
-      }
-    }
-  }
-
-  return kernel_threads;
-}
-
-int opencl_session_begin (hashcat_ctx_t *hashcat_ctx)
-{
-  const bitmap_ctx_t         *bitmap_ctx          = hashcat_ctx->bitmap_ctx;
-  const folder_config_t      *folder_config       = hashcat_ctx->folder_config;
-  const hashconfig_t         *hashconfig          = hashcat_ctx->hashconfig;
-  const hashes_t             *hashes              = hashcat_ctx->hashes;
-  const module_ctx_t         *module_ctx          = hashcat_ctx->module_ctx;
-        opencl_ctx_t         *opencl_ctx          = hashcat_ctx->opencl_ctx;
-  const straight_ctx_t       *straight_ctx        = hashcat_ctx->straight_ctx;
-  const user_options_extra_t *user_options_extra  = hashcat_ctx->user_options_extra;
-  const user_options_t       *user_options        = hashcat_ctx->user_options;
-
-  if (opencl_ctx->enabled == false) return 0;
-
-  u32 hardware_power_all = 0;
-
-  for (u32 device_id = 0; device_id < opencl_ctx->devices_cnt; device_id++)
-  {
-    int CL_rc = CL_SUCCESS;
-
-    /**
-     * host buffer
-     */
-
-    hc_device_param_t *device_param = &opencl_ctx->devices_param[device_id];
-
-    if (device_param->skipped == true) continue;
-
-    EVENT_DATA (EVENT_OPENCL_DEVICE_INIT_PRE, &device_id, sizeof (u32));
-
-    /**
-     * module depending checks
-     */
-
-    device_param->skipped_warning = false;
-
-    if (module_ctx->module_unstable_warning != MODULE_DEFAULT)
-    {
-      const bool unstable_warning = module_ctx->module_unstable_warning (hashconfig, user_options, user_options_extra, device_param);
-
-      if ((unstable_warning == true) && (user_options->force == false))
-      {
-        event_log_warning (hashcat_ctx, "* Device #%u: Skipping hash-mode %u - known OpenCL/Driver issue (not a hashcat issue)", device_id + 1, hashconfig->hash_mode);
-        event_log_warning (hashcat_ctx, "             You can use --force to override, but do not report related errors.");
-
-        device_param->skipped_warning = true;
-
-        continue;
-      }
-    }
-
-    // vector_width
-
-    cl_uint vector_width;
-
-    if (user_options->opencl_vector_width_chgd == false)
-    {
-      // tuning db
-
-      tuning_db_entry_t *tuningdb_entry;
-
-      if (user_options->slow_candidates == true)
-      {
-        tuningdb_entry = tuning_db_search (hashcat_ctx, device_param->device_name, device_param->device_type, 0, hashconfig->hash_mode);
-      }
-      else
-      {
-        tuningdb_entry = tuning_db_search (hashcat_ctx, device_param->device_name, device_param->device_type, user_options->attack_mode, hashconfig->hash_mode);
-      }
-
-      if (tuningdb_entry == NULL || tuningdb_entry->vector_width == -1)
-      {
-        if (hashconfig->opti_type & OPTI_TYPE_USES_BITS_64)
-        {
-          CL_rc = hc_clGetDeviceInfo (hashcat_ctx, device_param->device, CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG, sizeof (vector_width), &vector_width, NULL);
-
-          if (CL_rc == -1) return -1;
-        }
-        else
-        {
-          CL_rc = hc_clGetDeviceInfo (hashcat_ctx, device_param->device, CL_DEVICE_NATIVE_VECTOR_WIDTH_INT,  sizeof (vector_width), &vector_width, NULL);
-
-          if (CL_rc == -1) return -1;
-        }
-      }
-      else
-      {
-        vector_width = (cl_uint) tuningdb_entry->vector_width;
-      }
-    }
-    else
-    {
-      vector_width = user_options->opencl_vector_width;
-    }
-
-    // We can't have SIMD in kernels where we have an unknown final password length
-    // It also turns out that pure kernels (that have a higher register pressure)
-    // actually run faster on scalar GPU (like 1080) without SIMD
-
-    if ((hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL) == 0)
-    {
-      if (device_param->device_type & CL_DEVICE_TYPE_GPU)
-      {
-        vector_width = 1;
-      }
-    }
-
-    if (vector_width > 16) vector_width = 16;
-
-    device_param->vector_width = vector_width;
-
-    /**
-     * kernel accel and loops tuning db adjustment
-     */
-
-    device_param->kernel_accel_min   = hashconfig->kernel_accel_min;
-    device_param->kernel_accel_max   = hashconfig->kernel_accel_max;
-    device_param->kernel_loops_min   = hashconfig->kernel_loops_min;
-    device_param->kernel_loops_max   = hashconfig->kernel_loops_max;
-    device_param->kernel_threads_min = hashconfig->kernel_threads_min;
-    device_param->kernel_threads_max = hashconfig->kernel_threads_max;
-
-    tuning_db_entry_t *tuningdb_entry = NULL;
-
-    if (user_options->slow_candidates == true)
-    {
-      tuningdb_entry = tuning_db_search (hashcat_ctx, device_param->device_name, device_param->device_type, 0, hashconfig->hash_mode);
-    }
-    else
-    {
-      tuningdb_entry = tuning_db_search (hashcat_ctx, device_param->device_name, device_param->device_type, user_options->attack_mode, hashconfig->hash_mode);
-    }
-
-    // user commandline option override tuning db
-    // but both have to stay inside the boundaries of the module
-
-    if (user_options->kernel_accel_chgd == true)
-    {
-      const u32 _kernel_accel = user_options->kernel_accel;
-
-      if ((_kernel_accel >= device_param->kernel_accel_min) && (_kernel_accel <= device_param->kernel_accel_max))
-      {
-        device_param->kernel_accel_min = _kernel_accel;
-        device_param->kernel_accel_max = _kernel_accel;
-      }
-    }
-    else
-    {
-      if (tuningdb_entry != NULL)
-      {
-        const u32 _kernel_accel = tuningdb_entry->kernel_accel;
-
-        if (_kernel_accel)
-        {
-          if ((_kernel_accel >= device_param->kernel_accel_min) && (_kernel_accel <= device_param->kernel_accel_max))
-          {
-            device_param->kernel_accel_min = _kernel_accel;
-            device_param->kernel_accel_max = _kernel_accel;
-          }
-        }
-      }
-    }
-
-    if (user_options->kernel_loops_chgd == true)
-    {
-      const u32 _kernel_loops = user_options->kernel_loops;
-
-      if ((_kernel_loops >= device_param->kernel_loops_min) && (_kernel_loops <= device_param->kernel_loops_max))
-      {
-        device_param->kernel_loops_min = _kernel_loops;
-        device_param->kernel_loops_max = _kernel_loops;
-      }
-    }
-    else
-    {
-      if (tuningdb_entry != NULL)
-      {
-        u32 _kernel_loops = tuningdb_entry->kernel_loops;
-
-        if (_kernel_loops)
-        {
-          if (user_options->workload_profile == 1)
-          {
-            _kernel_loops = (_kernel_loops > 8) ? _kernel_loops / 8 : 1;
-          }
-          else if (user_options->workload_profile == 2)
-          {
-            _kernel_loops = (_kernel_loops > 4) ? _kernel_loops / 4 : 1;
-          }
-
-          if ((_kernel_loops >= device_param->kernel_loops_min) && (_kernel_loops <= device_param->kernel_loops_max))
-          {
-            device_param->kernel_loops_min = _kernel_loops;
-            device_param->kernel_loops_max = _kernel_loops;
-          }
-        }
-      }
-    }
-
-    // there's not thread column in tuning db, stick to commandline if defined
-
-    if (user_options->kernel_threads_chgd == true)
-    {
-      const u32 _kernel_threads = user_options->kernel_threads;
-
-      if ((_kernel_threads >= device_param->kernel_threads_min) && (_kernel_threads <= device_param->kernel_threads_max))
-      {
-        device_param->kernel_threads_min = _kernel_threads;
-        device_param->kernel_threads_max = _kernel_threads;
-      }
-    }
-
-    if (user_options->slow_candidates == true)
-    {
-    }
-    else
-    {
-      // we have some absolute limits for fast hashes (because of limit constant memory), make sure not to overstep
-
-      if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
-      {
-        if (user_options_extra->attack_kern == ATTACK_KERN_STRAIGHT)
-        {
-          device_param->kernel_loops_min = MIN (device_param->kernel_loops_min, KERNEL_RULES);
-          device_param->kernel_loops_max = MIN (device_param->kernel_loops_max, KERNEL_RULES);
-        }
-        else if (user_options_extra->attack_kern == ATTACK_KERN_COMBI)
-        {
-          device_param->kernel_loops_min = MIN (device_param->kernel_loops_min, KERNEL_COMBS);
-          device_param->kernel_loops_max = MIN (device_param->kernel_loops_max, KERNEL_COMBS);
-        }
-        else if (user_options_extra->attack_kern == ATTACK_KERN_BF)
-        {
-          device_param->kernel_loops_min = MIN (device_param->kernel_loops_min, KERNEL_BFS);
-          device_param->kernel_loops_max = MIN (device_param->kernel_loops_max, KERNEL_BFS);
-        }
-      }
-    }
-
-    device_param->kernel_loops_min_sav = device_param->kernel_loops_min;
-    device_param->kernel_loops_max_sav = device_param->kernel_loops_max;
-
-    /**
-     * device properties
-     */
-
-    const u32 device_processors = device_param->device_processors;
-
-    /**
-     * create context for each device
-     */
-
-    cl_context_properties properties[3];
-
-    properties[0] = CL_CONTEXT_PLATFORM;
-    properties[1] = (cl_context_properties) device_param->platform;
-    properties[2] = 0;
-
-    CL_rc = hc_clCreateContext (hashcat_ctx, properties, 1, &device_param->device, NULL, NULL, &device_param->context);
-
-    if (CL_rc == -1) return -1;
-
-    /**
-     * create command-queue
-     */
-
-    // not supported with NV
-    // device_param->command_queue = hc_clCreateCommandQueueWithProperties (hashcat_ctx, device_param->device, NULL);
-
-    CL_rc = hc_clCreateCommandQueue (hashcat_ctx, device_param->context, device_param->device, CL_QUEUE_PROFILING_ENABLE, &device_param->command_queue);
-
-    if (CL_rc == -1) return -1;
-
-    /**
-     * create input buffers on device : calculate size of fixed memory buffers
-     */
-
-    u64 size_root_css   = SP_PW_MAX *           sizeof (cs_t);
-    u64 size_markov_css = SP_PW_MAX * CHARSIZ * sizeof (cs_t);
-
-    device_param->size_root_css   = size_root_css;
-    device_param->size_markov_css = size_markov_css;
-
-    u64 size_results = sizeof (u32);
-
-    device_param->size_results = size_results;
-
-    u64 size_rules   = (u64) straight_ctx->kernel_rules_cnt * sizeof (kernel_rule_t);
-    u64 size_rules_c = (u64) KERNEL_RULES                   * sizeof (kernel_rule_t);
-
-    device_param->size_rules    = size_rules;
-    device_param->size_rules_c  = size_rules_c;
-
-    u64 size_plains  = (u64) hashes->digests_cnt * sizeof (plain_t);
-    u64 size_salts   = (u64) hashes->salts_cnt   * sizeof (salt_t);
-    u64 size_esalts  = (u64) hashes->digests_cnt * (u64) hashconfig->esalt_size;
-    u64 size_shown   = (u64) hashes->digests_cnt * sizeof (u32);
-    u64 size_digests = (u64) hashes->digests_cnt * (u64) hashconfig->dgst_size;
-
-    device_param->size_plains   = size_plains;
-    device_param->size_digests  = size_digests;
-    device_param->size_shown    = size_shown;
-    device_param->size_salts    = size_salts;
-    device_param->size_esalts   = size_esalts;
-
-    u64 size_combs = KERNEL_COMBS * sizeof (pw_t);
-    u64 size_bfs   = KERNEL_BFS   * sizeof (bf_t);
-    u64 size_tm    = 32           * sizeof (bs_word_t);
-
-    device_param->size_bfs      = size_bfs;
-    device_param->size_combs    = size_combs;
-    device_param->size_tm       = size_tm;
-
-    u64 size_st_digests = 1 * hashconfig->dgst_size;
-    u64 size_st_salts   = 1 * sizeof (salt_t);
-    u64 size_st_esalts  = 1 * hashconfig->esalt_size;
-
-    device_param->size_st_digests = size_st_digests;
-    device_param->size_st_salts   = size_st_salts;
-    device_param->size_st_esalts  = size_st_esalts;
-
-    u64 size_extra_buffer = 4;
-
-    if (module_ctx->module_extra_buffer_size != MODULE_DEFAULT)
-    {
-      const u64 extra_buffer_size = module_ctx->module_extra_buffer_size (hashconfig, user_options, user_options_extra, hashes, device_param);
-
-      if (extra_buffer_size == (u64) -1)
-      {
-        event_log_error (hashcat_ctx, "Invalid extra buffer size.");
-
-        return -1;
-      }
-
-      device_param->extra_buffer_size = extra_buffer_size;
-
-      size_extra_buffer = extra_buffer_size;
-    }
-
-    // kern type
-
-    u32 kern_type = (u32) hashconfig->kern_type;
-
-    if (module_ctx->module_kern_type_dynamic != MODULE_DEFAULT)
-    {
-      if (user_options->benchmark == true)
-      {
-      }
-      else
-      {
-        void        *digests_buf    = hashes->digests_buf;
-        salt_t      *salts_buf      = hashes->salts_buf;
-        void        *esalts_buf     = hashes->esalts_buf;
-        void        *hook_salts_buf = hashes->hook_salts_buf;
-        hashinfo_t **hash_info      = hashes->hash_info;
-
-        hashinfo_t *hash_info_ptr = NULL;
-
-        if (hash_info) hash_info_ptr = hash_info[0];
-
-        kern_type = (u32) module_ctx->module_kern_type_dynamic (hashconfig, digests_buf, salts_buf, esalts_buf, hook_salts_buf, hash_info_ptr);
-      }
-    }
-
-    // built options
-
-    const size_t build_options_sz = 4096;
-
-    char *build_options_buf = (char *) hcmalloc (build_options_sz);
-
-    int build_options_len = 0;
-
-    #if defined (_WIN)
-    build_options_len += snprintf (build_options_buf + build_options_len, build_options_sz - build_options_len, "-D KERNEL_STATIC -I OpenCL -I \"%s\" ", folder_config->cpath_real);
-    #else
-    build_options_len += snprintf (build_options_buf + build_options_len, build_options_sz - build_options_len, "-D KERNEL_STATIC -I OpenCL -I %s ", folder_config->cpath_real);
-    #endif
-
-    // we don't have sm_* on vendors not NV but it doesn't matter
-
-    #if defined (DEBUG)
-    build_options_len += snprintf (build_options_buf + build_options_len, build_options_sz - build_options_len, "-D LOCAL_MEM_TYPE=%u -D VENDOR_ID=%u -D CUDA_ARCH=%u -D HAS_VPERM=%u -D HAS_VADD3=%u -D HAS_VBFE=%u -D HAS_BFE=%u -D HAS_LOP3=%u -D HAS_MOV64=%u -D HAS_PRMT=%u -D VECT_SIZE=%u -D DEVICE_TYPE=%u -D DGST_R0=%u -D DGST_R1=%u -D DGST_R2=%u -D DGST_R3=%u -D DGST_ELEM=%u -D KERN_TYPE=%u -D _unroll ", device_param->device_local_mem_type, device_param->platform_vendor_id, (device_param->sm_major * 100) + device_param->sm_minor, device_param->has_vperm, device_param->has_vadd3, device_param->has_vbfe, device_param->has_bfe, device_param->has_lop3, device_param->has_mov64, device_param->has_prmt, device_param->vector_width, (u32) device_param->device_type, hashconfig->dgst_pos0, hashconfig->dgst_pos1, hashconfig->dgst_pos2, hashconfig->dgst_pos3, hashconfig->dgst_size / 4, kern_type);
-    #else
-    build_options_len += snprintf (build_options_buf + build_options_len, build_options_sz - build_options_len, "-D LOCAL_MEM_TYPE=%u -D VENDOR_ID=%u -D CUDA_ARCH=%u -D HAS_VPERM=%u -D HAS_VADD3=%u -D HAS_VBFE=%u -D HAS_BFE=%u -D HAS_LOP3=%u -D HAS_MOV64=%u -D HAS_PRMT=%u -D VECT_SIZE=%u -D DEVICE_TYPE=%u -D DGST_R0=%u -D DGST_R1=%u -D DGST_R2=%u -D DGST_R3=%u -D DGST_ELEM=%u -D KERN_TYPE=%u -D _unroll -w ", device_param->device_local_mem_type, device_param->platform_vendor_id, (device_param->sm_major * 100) + device_param->sm_minor, device_param->has_vperm, device_param->has_vadd3, device_param->has_vbfe, device_param->has_bfe, device_param->has_lop3, device_param->has_mov64, device_param->has_prmt, device_param->vector_width, (u32) device_param->device_type, hashconfig->dgst_pos0, hashconfig->dgst_pos1, hashconfig->dgst_pos2, hashconfig->dgst_pos3, hashconfig->dgst_size / 4, kern_type);
-    #endif
-
-    build_options_buf[build_options_len] = 0;
-
-    /*
-    if (device_param->device_type & CL_DEVICE_TYPE_CPU)
-    {
-      if (device_param->platform_vendor_id == VENDOR_ID_INTEL_SDK)
-      {
-        strncat (build_options_buf, " -cl-opt-disable", 16);
-      }
-    }
-    */
-
-    char *build_options_module_buf = (char *) hcmalloc (build_options_sz);
-
-    int build_options_module_len = 0;
-
-    build_options_module_len += snprintf (build_options_module_buf + build_options_module_len, build_options_sz - build_options_module_len, "%s ", build_options_buf);
-
-    if (module_ctx->module_jit_build_options != MODULE_DEFAULT)
-    {
-      char *jit_build_options = module_ctx->module_jit_build_options (hashconfig, user_options, user_options_extra, hashes, device_param);
-
-      if (jit_build_options != NULL)
-      {
-        build_options_module_len += snprintf (build_options_module_buf + build_options_module_len, build_options_sz - build_options_module_len, "%s", jit_build_options);
-      }
-    }
-
-    build_options_module_buf[build_options_module_len] = 0;
-
-    #if defined (DEBUG)
-    if (user_options->quiet == false) event_log_warning (hashcat_ctx, "* Device #%u: build_options '%s'", device_id + 1, build_options_buf);
-    if (user_options->quiet == false) event_log_warning (hashcat_ctx, "* Device #%u: build_options_module '%s'", device_id + 1, build_options_module_buf);
-    #endif
-
-    /**
-     * device_name_chksum
-     */
-
-    char *device_name_chksum        = (char *) hcmalloc (HCBUFSIZ_TINY);
-    char *device_name_chksum_amp_mp = (char *) hcmalloc (HCBUFSIZ_TINY);
-
-    const size_t dnclen = snprintf (device_name_chksum, HCBUFSIZ_TINY, "%d-%u-%s-%s-%s-%d-%u",
-      opencl_ctx->comptime,
-      device_param->platform_vendor_id,
-      device_param->device_name,
-      device_param->device_version,
-      device_param->driver_version,
-      device_param->vector_width,
-      hashconfig->kern_type);
-
-    const size_t dnclen_amp_mp = snprintf (device_name_chksum_amp_mp, HCBUFSIZ_TINY, "%d-%u-%s-%s-%s",
-      opencl_ctx->comptime,
-      device_param->platform_vendor_id,
-      device_param->device_name,
-      device_param->device_version,
-      device_param->driver_version);
-
-    md5_ctx_t md5_ctx;
-
-    md5_init (&md5_ctx);
-    md5_update (&md5_ctx, (u32 *) device_name_chksum, dnclen);
-    md5_final (&md5_ctx);
-
-    snprintf (device_name_chksum, HCBUFSIZ_TINY, "%08x", md5_ctx.h[0]);
-
-    md5_init (&md5_ctx);
-    md5_update (&md5_ctx, (u32 *) device_name_chksum_amp_mp, dnclen_amp_mp);
-    md5_final (&md5_ctx);
-
-    snprintf (device_name_chksum_amp_mp, HCBUFSIZ_TINY, "%08x", md5_ctx.h[0]);
-
-    /**
-     * kernel cache
-     */
-
-    bool cache_disable = false;
-
-    // Seems to be completely broken on Apple + (Intel?) CPU
-    // To reproduce set cache_disable to false and run benchmark -b
-
-    if (device_param->platform_vendor_id == VENDOR_ID_APPLE)
-    {
-      if (device_param->device_type & CL_DEVICE_TYPE_CPU)
-      {
-        cache_disable = true;
-      }
-    }
-
-    if (module_ctx->module_jit_cache_disable != MODULE_DEFAULT)
-    {
-      cache_disable = module_ctx->module_jit_cache_disable (hashconfig, user_options, user_options_extra, hashes, device_param);
-    }
-
-    /**
-     * main kernel
-     */
-
-    {
-      /**
-       * kernel source filename
-       */
-
-      char source_file[256] = { 0 };
-
-      generate_source_kernel_filename (user_options->slow_candidates, hashconfig->attack_exec, user_options_extra->attack_kern, kern_type, hashconfig->opti_type, folder_config->shared_dir, source_file);
-
-      if (hc_path_read (source_file) == false)
-      {
-        event_log_error (hashcat_ctx, "%s: %s", source_file, strerror (errno));
-
-        return -1;
-      }
-
-      /**
-       * kernel cached filename
-       */
-
-      char cached_file[256] = { 0 };
-
-      generate_cached_kernel_filename (user_options->slow_candidates, hashconfig->attack_exec, user_options_extra->attack_kern, kern_type, hashconfig->opti_type, folder_config->profile_dir, device_name_chksum, cached_file);
-
-      bool cached = true;
-
-      if (cache_disable == true)
-      {
-        cached = false;
-      }
-
-      if (hc_path_read (cached_file) == false)
-      {
-        cached = false;
-      }
-
-      if (hc_path_is_empty (cached_file) == true)
-      {
-        cached = false;
-      }
-
-      /**
-       * kernel compile or load
-       */
-
-      size_t kernel_lengths_buf = 0;
-
-      size_t *kernel_lengths = &kernel_lengths_buf;
-
-      char *kernel_sources_buf = NULL;
-
-      char **kernel_sources = &kernel_sources_buf;
-
-      if (cached == false)
-      {
-        #if defined (DEBUG)
-        if (user_options->quiet == false) event_log_warning (hashcat_ctx, "* Device #%u: Kernel %s not found in cache! Building may take a while...", device_id + 1, filename_from_filepath (cached_file));
-        #endif
-
-        const bool rc_read_kernel = read_kernel_binary (hashcat_ctx, source_file, kernel_lengths, kernel_sources, true);
-
-        if (rc_read_kernel == false) return -1;
-
-        CL_rc = hc_clCreateProgramWithSource (hashcat_ctx, device_param->context, 1, (const char **) kernel_sources, NULL, &device_param->program);
-
-        if (CL_rc == -1) return -1;
-
-        CL_rc = hc_clBuildProgram (hashcat_ctx, device_param->program, 1, &device_param->device, build_options_module_buf, NULL, NULL);
-
-        //if (CL_rc == -1) return -1;
-
-        size_t build_log_size = 0;
-
-        hc_clGetProgramBuildInfo (hashcat_ctx, device_param->program, device_param->device, CL_PROGRAM_BUILD_LOG, 0, NULL, &build_log_size);
-
-        //if (CL_rc == -1) return -1;
-
-        #if defined (DEBUG)
-        if ((build_log_size > 1) || (CL_rc == -1))
-        #else
-        if (CL_rc == -1)
-        #endif
-        {
-          char *build_log = (char *) hcmalloc (build_log_size + 1);
-
-          int CL_rc_build = hc_clGetProgramBuildInfo (hashcat_ctx, device_param->program, device_param->device, CL_PROGRAM_BUILD_LOG, build_log_size, build_log, NULL);
-
-          if (CL_rc_build == -1) return -1;
-
-          puts (build_log);
-
-          hcfree (build_log);
-        }
-
-        if (CL_rc == -1)
-        {
-          device_param->skipped_warning = true;
-
-          event_log_error (hashcat_ctx, "* Device #%u: Kernel %s build failed - proceeding without this device.", device_id + 1, source_file);
-
-          continue;
-        }
-
-        if (cache_disable == false)
-        {
-          size_t binary_size;
-
-          CL_rc = hc_clGetProgramInfo (hashcat_ctx, device_param->program, CL_PROGRAM_BINARY_SIZES, sizeof (size_t), &binary_size, NULL);
-
-          if (CL_rc == -1) return -1;
-
-          char *binary = (char *) hcmalloc (binary_size);
-
-          CL_rc = hc_clGetProgramInfo (hashcat_ctx, device_param->program, CL_PROGRAM_BINARIES, sizeof (char *), &binary, NULL);
-
-          if (CL_rc == -1) return -1;
-
-          const bool rc_write = write_kernel_binary (hashcat_ctx, cached_file, binary, binary_size);
-
-          if (rc_write == false) return -1;
-
-          hcfree (binary);
-        }
-      }
-      else
-      {
-        const bool rc_read_kernel = read_kernel_binary (hashcat_ctx, cached_file, kernel_lengths, kernel_sources, false);
-
-        if (rc_read_kernel == false) return -1;
-
-        CL_rc = hc_clCreateProgramWithBinary (hashcat_ctx, device_param->context, 1, &device_param->device, kernel_lengths, (const unsigned char **) kernel_sources, NULL, &device_param->program);
-
-        if (CL_rc == -1) return -1;
-
-        CL_rc = hc_clBuildProgram (hashcat_ctx, device_param->program, 1, &device_param->device, build_options_module_buf, NULL, NULL);
-
-        if (CL_rc == -1) return -1;
-      }
-
-      hcfree (kernel_sources[0]);
-    }
-
-    hcfree (build_options_module_buf);
-
-    /**
-     * word generator kernel
-     */
-
-    if (user_options->slow_candidates == true)
-    {
-    }
-    else
-    {
-      if (user_options->attack_mode != ATTACK_MODE_STRAIGHT)
-      {
-        /**
-         * kernel mp source filename
-         */
-
-        char source_file[256] = { 0 };
-
-        generate_source_kernel_mp_filename (hashconfig->opti_type, hashconfig->opts_type, folder_config->shared_dir, source_file);
-
-        if (hc_path_read (source_file) == false)
-        {
-          event_log_error (hashcat_ctx, "%s: %s", source_file, strerror (errno));
-
-          return -1;
-        }
-
-        /**
-         * kernel mp cached filename
-         */
-
-        char cached_file[256] = { 0 };
-
-        generate_cached_kernel_mp_filename (hashconfig->opti_type, hashconfig->opts_type, folder_config->profile_dir, device_name_chksum_amp_mp, cached_file);
-
-        bool cached = true;
-
-        if (cache_disable == true)
-        {
-          cached = false;
-        }
-
-        if (hc_path_read (cached_file) == false)
-        {
-          cached = false;
-        }
-
-        if (hc_path_is_empty (cached_file) == true)
-        {
-          cached = false;
-        }
-
-        /**
-         * kernel compile or load
-         */
-
-        size_t kernel_lengths_buf = 0;
-
-        size_t *kernel_lengths = &kernel_lengths_buf;
-
-        char *kernel_sources_buf = NULL;
-
-        char **kernel_sources = &kernel_sources_buf;
-
-        if (cached == false)
-        {
-          #if defined (DEBUG)
-          if (user_options->quiet == false) event_log_warning (hashcat_ctx, "* Device #%u: Kernel %s not found in cache! Building may take a while...", device_id + 1, filename_from_filepath (cached_file));
-          #endif
-
-          const bool rc_read_kernel = read_kernel_binary (hashcat_ctx, source_file, kernel_lengths, kernel_sources, true);
-
-          if (rc_read_kernel == false) return -1;
-
-          CL_rc = hc_clCreateProgramWithSource (hashcat_ctx, device_param->context, 1, (const char **) kernel_sources, NULL, &device_param->program_mp);
-
-          if (CL_rc == -1) return -1;
-
-          CL_rc = hc_clBuildProgram (hashcat_ctx, device_param->program_mp, 1, &device_param->device, build_options_buf, NULL, NULL);
-
-          //if (CL_rc == -1) return -1;
-
-          size_t build_log_size = 0;
-
-          hc_clGetProgramBuildInfo (hashcat_ctx, device_param->program_mp, device_param->device, CL_PROGRAM_BUILD_LOG, 0, NULL, &build_log_size);
-
-          //if (CL_rc == -1) return -1;
-
-          #if defined (DEBUG)
-          if ((build_log_size > 1) || (CL_rc == -1))
-          #else
-          if (CL_rc == -1)
-          #endif
-          {
-            char *build_log = (char *) hcmalloc (build_log_size + 1);
-
-            int CL_rc_build = hc_clGetProgramBuildInfo (hashcat_ctx, device_param->program_mp, device_param->device, CL_PROGRAM_BUILD_LOG, build_log_size, build_log, NULL);
-
-            if (CL_rc_build == -1) return -1;
-
-            puts (build_log);
-
-            hcfree (build_log);
-          }
-
-          if (CL_rc == -1)
-          {
-            device_param->skipped_warning = true;
-
-            event_log_error (hashcat_ctx, "* Device #%u: Kernel %s build failed - proceeding without this device.", device_id + 1, source_file);
-
-            continue;
-          }
-
-          if (cache_disable == false)
-          {
-            size_t binary_size;
-
-            CL_rc = hc_clGetProgramInfo (hashcat_ctx, device_param->program_mp, CL_PROGRAM_BINARY_SIZES, sizeof (size_t), &binary_size, NULL);
-
-            if (CL_rc == -1) return -1;
-
-            char *binary = (char *) hcmalloc (binary_size);
-
-            CL_rc = hc_clGetProgramInfo (hashcat_ctx, device_param->program_mp, CL_PROGRAM_BINARIES, sizeof (char *), &binary, NULL);
-
-            if (CL_rc == -1) return -1;
-
-            write_kernel_binary (hashcat_ctx, cached_file, binary, binary_size);
-
-            hcfree (binary);
-          }
-        }
-        else
-        {
-          const bool rc_read_kernel = read_kernel_binary (hashcat_ctx, cached_file, kernel_lengths, kernel_sources, false);
-
-          if (rc_read_kernel == false) return -1;
-
-          CL_rc = hc_clCreateProgramWithBinary (hashcat_ctx, device_param->context, 1, &device_param->device, kernel_lengths, (const unsigned char **) kernel_sources, NULL, &device_param->program_mp);
-
-          if (CL_rc == -1) return -1;
-
-          CL_rc = hc_clBuildProgram (hashcat_ctx, device_param->program_mp, 1, &device_param->device, build_options_buf, NULL, NULL);
-
-          if (CL_rc == -1) return -1;
-        }
-
-        hcfree (kernel_sources[0]);
-      }
-    }
-
-    /**
-     * amplifier kernel
-     */
-
-    if (user_options->slow_candidates == true)
-    {
-    }
-    else
-    {
-      if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
-      {
-
-      }
-      else
-      {
-        /**
-         * kernel amp source filename
-         */
-
-        char source_file[256] = { 0 };
-
-        generate_source_kernel_amp_filename (user_options_extra->attack_kern, folder_config->shared_dir, source_file);
-
-        if (hc_path_read (source_file) == false)
-        {
-          event_log_error (hashcat_ctx, "%s: %s", source_file, strerror (errno));
-
-          return -1;
-        }
-
-        /**
-         * kernel amp cached filename
-         */
-
-        char cached_file[256] = { 0 };
-
-        generate_cached_kernel_amp_filename (user_options_extra->attack_kern, folder_config->profile_dir, device_name_chksum_amp_mp, cached_file);
-
-        bool cached = true;
-
-        if (cache_disable == true)
-        {
-          cached = false;
-        }
-
-        if (hc_path_read (cached_file) == false)
-        {
-          cached = false;
-        }
-
-        if (hc_path_is_empty (cached_file) == true)
-        {
-          cached = false;
-        }
-
-        /**
-         * kernel compile or load
-         */
-
-        size_t kernel_lengths_buf = 0;
-
-        size_t *kernel_lengths = &kernel_lengths_buf;
-
-        char *kernel_sources_buf = NULL;
-
-        char **kernel_sources = &kernel_sources_buf;
-
-        if (cached == false)
-        {
-          #if defined (DEBUG)
-          if (user_options->quiet == false) event_log_warning (hashcat_ctx, "* Device #%u: Kernel %s not found in cache! Building may take a while...", device_id + 1, filename_from_filepath (cached_file));
-          #endif
-
-          const bool rc_read_kernel = read_kernel_binary (hashcat_ctx, source_file, kernel_lengths, kernel_sources, true);
-
-          if (rc_read_kernel == false) return -1;
-
-          CL_rc = hc_clCreateProgramWithSource (hashcat_ctx, device_param->context, 1, (const char **) kernel_sources, NULL, &device_param->program_amp);
-
-          if (CL_rc == -1) return -1;
-
-          CL_rc = hc_clBuildProgram (hashcat_ctx, device_param->program_amp, 1, &device_param->device, build_options_buf, NULL, NULL);
-
-          //if (CL_rc == -1) return -1;
-
-          size_t build_log_size = 0;
-
-          hc_clGetProgramBuildInfo (hashcat_ctx, device_param->program_amp, device_param->device, CL_PROGRAM_BUILD_LOG, 0, NULL, &build_log_size);
-
-          //if (CL_rc == -1) return -1;
-
-          #if defined (DEBUG)
-          if ((build_log_size > 1) || (CL_rc == -1))
-          #else
-          if (CL_rc == -1)
-          #endif
-          {
-            char *build_log = (char *) hcmalloc (build_log_size + 1);
-
-            int CL_rc_build_info = hc_clGetProgramBuildInfo (hashcat_ctx, device_param->program_amp, device_param->device, CL_PROGRAM_BUILD_LOG, build_log_size, build_log, NULL);
-
-            if (CL_rc_build_info == -1) return -1;
-
-            puts (build_log);
-
-            hcfree (build_log);
-          }
-
-          if (CL_rc == -1)
-          {
-            device_param->skipped_warning = true;
-
-            event_log_error (hashcat_ctx, "* Device #%u: Kernel %s build failed - proceeding without this device.", device_id + 1, source_file);
-
-            continue;
-          }
-
-          if (cache_disable == false)
-          {
-            size_t binary_size;
-
-            CL_rc = hc_clGetProgramInfo (hashcat_ctx, device_param->program_amp, CL_PROGRAM_BINARY_SIZES, sizeof (size_t), &binary_size, NULL);
-
-            if (CL_rc == -1) return -1;
-
-            char *binary = (char *) hcmalloc (binary_size);
-
-            CL_rc = hc_clGetProgramInfo (hashcat_ctx, device_param->program_amp, CL_PROGRAM_BINARIES, sizeof (char *), &binary, NULL);
-
-            if (CL_rc == -1) return -1;
-
-            write_kernel_binary (hashcat_ctx, cached_file, binary, binary_size);
-
-            hcfree (binary);
-          }
-        }
-        else
-        {
-          const bool rc_read_kernel = read_kernel_binary (hashcat_ctx, cached_file, kernel_lengths, kernel_sources, false);
-
-          if (rc_read_kernel == false) return -1;
-
-          CL_rc = hc_clCreateProgramWithBinary (hashcat_ctx, device_param->context, 1, &device_param->device, kernel_lengths, (const unsigned char **) kernel_sources, NULL, &device_param->program_amp);
-
-          if (CL_rc == -1) return -1;
-
-          CL_rc = hc_clBuildProgram (hashcat_ctx, device_param->program_amp, 1, &device_param->device, build_options_buf, NULL, NULL);
-
-          if (CL_rc == -1) return -1;
-        }
-
-        hcfree (kernel_sources[0]);
-
-        hcfree (build_options_buf);
-      }
-    }
-
-    hcfree (device_name_chksum);
-    hcfree (device_name_chksum_amp_mp);
-
-    // some algorithm collide too fast, make that impossible
-
-    if (user_options->benchmark == true)
-    {
-      ((u32 *) hashes->digests_buf)[0] = -1u;
-      ((u32 *) hashes->digests_buf)[1] = -1u;
-      ((u32 *) hashes->digests_buf)[2] = -1u;
-      ((u32 *) hashes->digests_buf)[3] = -1u;
-    }
-
-    /**
-     * global buffers
-     */
-
-    CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->context, CL_MEM_READ_ONLY,   bitmap_ctx->bitmap_size, NULL, &device_param->d_bitmap_s1_a);    if (CL_rc == -1) return -1;
-    CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->context, CL_MEM_READ_ONLY,   bitmap_ctx->bitmap_size, NULL, &device_param->d_bitmap_s1_b);    if (CL_rc == -1) return -1;
-    CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->context, CL_MEM_READ_ONLY,   bitmap_ctx->bitmap_size, NULL, &device_param->d_bitmap_s1_c);    if (CL_rc == -1) return -1;
-    CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->context, CL_MEM_READ_ONLY,   bitmap_ctx->bitmap_size, NULL, &device_param->d_bitmap_s1_d);    if (CL_rc == -1) return -1;
-    CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->context, CL_MEM_READ_ONLY,   bitmap_ctx->bitmap_size, NULL, &device_param->d_bitmap_s2_a);    if (CL_rc == -1) return -1;
-    CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->context, CL_MEM_READ_ONLY,   bitmap_ctx->bitmap_size, NULL, &device_param->d_bitmap_s2_b);    if (CL_rc == -1) return -1;
-    CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->context, CL_MEM_READ_ONLY,   bitmap_ctx->bitmap_size, NULL, &device_param->d_bitmap_s2_c);    if (CL_rc == -1) return -1;
-    CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->context, CL_MEM_READ_ONLY,   bitmap_ctx->bitmap_size, NULL, &device_param->d_bitmap_s2_d);    if (CL_rc == -1) return -1;
-    CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->context, CL_MEM_READ_WRITE,  size_plains,             NULL, &device_param->d_plain_bufs);     if (CL_rc == -1) return -1;
-    CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->context, CL_MEM_READ_ONLY,   size_digests,            NULL, &device_param->d_digests_buf);    if (CL_rc == -1) return -1;
-    CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->context, CL_MEM_READ_WRITE,  size_shown,              NULL, &device_param->d_digests_shown);  if (CL_rc == -1) return -1;
-    CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->context, CL_MEM_READ_ONLY,   size_salts,              NULL, &device_param->d_salt_bufs);      if (CL_rc == -1) return -1;
-    CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->context, CL_MEM_READ_WRITE,  size_results,            NULL, &device_param->d_result);         if (CL_rc == -1) return -1;
-    CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->context, CL_MEM_READ_WRITE,  size_extra_buffer / 4,   NULL, &device_param->d_extra0_buf);     if (CL_rc == -1) return -1;
-    CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->context, CL_MEM_READ_WRITE,  size_extra_buffer / 4,   NULL, &device_param->d_extra1_buf);     if (CL_rc == -1) return -1;
-    CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->context, CL_MEM_READ_WRITE,  size_extra_buffer / 4,   NULL, &device_param->d_extra2_buf);     if (CL_rc == -1) return -1;
-    CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->context, CL_MEM_READ_WRITE,  size_extra_buffer / 4,   NULL, &device_param->d_extra3_buf);     if (CL_rc == -1) return -1;
-    CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->context, CL_MEM_READ_ONLY,   size_st_digests,         NULL, &device_param->d_st_digests_buf); if (CL_rc == -1) return -1;
-    CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->context, CL_MEM_READ_ONLY,   size_st_salts,           NULL, &device_param->d_st_salts_buf);   if (CL_rc == -1) return -1;
-
-    CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->command_queue, device_param->d_bitmap_s1_a,     CL_TRUE, 0, bitmap_ctx->bitmap_size, bitmap_ctx->bitmap_s1_a, 0, NULL, NULL); if (CL_rc == -1) return -1;
-    CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->command_queue, device_param->d_bitmap_s1_b,     CL_TRUE, 0, bitmap_ctx->bitmap_size, bitmap_ctx->bitmap_s1_b, 0, NULL, NULL); if (CL_rc == -1) return -1;
-    CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->command_queue, device_param->d_bitmap_s1_c,     CL_TRUE, 0, bitmap_ctx->bitmap_size, bitmap_ctx->bitmap_s1_c, 0, NULL, NULL); if (CL_rc == -1) return -1;
-    CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->command_queue, device_param->d_bitmap_s1_d,     CL_TRUE, 0, bitmap_ctx->bitmap_size, bitmap_ctx->bitmap_s1_d, 0, NULL, NULL); if (CL_rc == -1) return -1;
-    CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->command_queue, device_param->d_bitmap_s2_a,     CL_TRUE, 0, bitmap_ctx->bitmap_size, bitmap_ctx->bitmap_s2_a, 0, NULL, NULL); if (CL_rc == -1) return -1;
-    CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->command_queue, device_param->d_bitmap_s2_b,     CL_TRUE, 0, bitmap_ctx->bitmap_size, bitmap_ctx->bitmap_s2_b, 0, NULL, NULL); if (CL_rc == -1) return -1;
-    CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->command_queue, device_param->d_bitmap_s2_c,     CL_TRUE, 0, bitmap_ctx->bitmap_size, bitmap_ctx->bitmap_s2_c, 0, NULL, NULL); if (CL_rc == -1) return -1;
-    CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->command_queue, device_param->d_bitmap_s2_d,     CL_TRUE, 0, bitmap_ctx->bitmap_size, bitmap_ctx->bitmap_s2_d, 0, NULL, NULL); if (CL_rc == -1) return -1;
-    CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->command_queue, device_param->d_digests_buf,     CL_TRUE, 0, size_digests,            hashes->digests_buf,     0, NULL, NULL); if (CL_rc == -1) return -1;
-    CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->command_queue, device_param->d_salt_bufs,       CL_TRUE, 0, size_salts,              hashes->salts_buf,       0, NULL, NULL); if (CL_rc == -1) return -1;
-
-    /**
-     * special buffers
-     */
-
-    if (user_options->slow_candidates == true)
-    {
-      CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->context, CL_MEM_READ_ONLY, size_rules_c, NULL, &device_param->d_rules_c); if (CL_rc == -1) return -1;
-    }
-    else
-    {
-      if (user_options_extra->attack_kern == ATTACK_KERN_STRAIGHT)
-      {
-        CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->context, CL_MEM_READ_ONLY, size_rules,   NULL, &device_param->d_rules);   if (CL_rc == -1) return -1;
-        CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->context, CL_MEM_READ_ONLY, size_rules_c, NULL, &device_param->d_rules_c); if (CL_rc == -1) return -1;
-
-        CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->command_queue, device_param->d_rules, CL_TRUE, 0, size_rules, straight_ctx->kernel_rules_buf, 0, NULL, NULL); if (CL_rc == -1) return -1;
-      }
-      else if (user_options_extra->attack_kern == ATTACK_KERN_COMBI)
-      {
-        CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->context, CL_MEM_READ_ONLY, size_combs,      NULL, &device_param->d_combs);          if (CL_rc == -1) return -1;
-        CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->context, CL_MEM_READ_ONLY, size_combs,      NULL, &device_param->d_combs_c);        if (CL_rc == -1) return -1;
-        CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->context, CL_MEM_READ_ONLY, size_root_css,   NULL, &device_param->d_root_css_buf);   if (CL_rc == -1) return -1;
-        CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->context, CL_MEM_READ_ONLY, size_markov_css, NULL, &device_param->d_markov_css_buf); if (CL_rc == -1) return -1;
-      }
-      else if (user_options_extra->attack_kern == ATTACK_KERN_BF)
-      {
-        CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->context, CL_MEM_READ_ONLY, size_bfs,        NULL, &device_param->d_bfs);            if (CL_rc == -1) return -1;
-        CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->context, CL_MEM_READ_ONLY, size_bfs,        NULL, &device_param->d_bfs_c);          if (CL_rc == -1) return -1;
-        CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->context, CL_MEM_READ_ONLY, size_tm,         NULL, &device_param->d_tm_c);           if (CL_rc == -1) return -1;
-        CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->context, CL_MEM_READ_ONLY, size_root_css,   NULL, &device_param->d_root_css_buf);   if (CL_rc == -1) return -1;
-        CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->context, CL_MEM_READ_ONLY, size_markov_css, NULL, &device_param->d_markov_css_buf); if (CL_rc == -1) return -1;
-      }
-    }
-
-    if (size_esalts)
-    {
-      CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->context, CL_MEM_READ_ONLY, size_esalts, NULL, &device_param->d_esalt_bufs);
-
-      if (CL_rc == -1) return -1;
-
-      CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->command_queue, device_param->d_esalt_bufs, CL_TRUE, 0, size_esalts, hashes->esalts_buf, 0, NULL, NULL);
-
-      if (CL_rc == -1) return -1;
-    }
-
-    if (hashconfig->st_hash != NULL)
-    {
-      CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->command_queue, device_param->d_st_digests_buf,  CL_TRUE, 0, size_st_digests,         hashes->st_digests_buf,  0, NULL, NULL); if (CL_rc == -1) return -1;
-      CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->command_queue, device_param->d_st_salts_buf,    CL_TRUE, 0, size_st_salts,           hashes->st_salts_buf,    0, NULL, NULL); if (CL_rc == -1) return -1;
-
-      if (size_esalts)
-      {
-        CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->context, CL_MEM_READ_ONLY, size_st_esalts, NULL, &device_param->d_st_esalts_buf);
-
-        if (CL_rc == -1) return -1;
-
-        CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->command_queue, device_param->d_st_esalts_buf, CL_TRUE, 0, size_st_esalts, hashes->st_esalts_buf, 0, NULL, NULL);
-
-        if (CL_rc == -1) return -1;
-      }
-    }
-
-    /**
-     * kernel args
-     */
-
-    device_param->kernel_params_buf32[24] = bitmap_ctx->bitmap_mask;
-    device_param->kernel_params_buf32[25] = bitmap_ctx->bitmap_shift1;
-    device_param->kernel_params_buf32[26] = bitmap_ctx->bitmap_shift2;
-    device_param->kernel_params_buf32[27] = 0; // salt_pos
-    device_param->kernel_params_buf32[28] = 0; // loop_pos
-    device_param->kernel_params_buf32[29] = 0; // loop_cnt
-    device_param->kernel_params_buf32[30] = 0; // kernel_rules_cnt
-    device_param->kernel_params_buf32[31] = 0; // digests_cnt
-    device_param->kernel_params_buf32[32] = 0; // digests_offset
-    device_param->kernel_params_buf32[33] = 0; // combs_mode
-    device_param->kernel_params_buf64[34] = 0; // gid_max
-
-    device_param->kernel_params[ 0] = NULL; // &device_param->d_pws_buf;
-    device_param->kernel_params[ 1] = &device_param->d_rules_c;
-    device_param->kernel_params[ 2] = &device_param->d_combs_c;
-    device_param->kernel_params[ 3] = &device_param->d_bfs_c;
-    device_param->kernel_params[ 4] = NULL; // &device_param->d_tmps;
-    device_param->kernel_params[ 5] = NULL; // &device_param->d_hooks;
-    device_param->kernel_params[ 6] = &device_param->d_bitmap_s1_a;
-    device_param->kernel_params[ 7] = &device_param->d_bitmap_s1_b;
-    device_param->kernel_params[ 8] = &device_param->d_bitmap_s1_c;
-    device_param->kernel_params[ 9] = &device_param->d_bitmap_s1_d;
-    device_param->kernel_params[10] = &device_param->d_bitmap_s2_a;
-    device_param->kernel_params[11] = &device_param->d_bitmap_s2_b;
-    device_param->kernel_params[12] = &device_param->d_bitmap_s2_c;
-    device_param->kernel_params[13] = &device_param->d_bitmap_s2_d;
-    device_param->kernel_params[14] = &device_param->d_plain_bufs;
-    device_param->kernel_params[15] = &device_param->d_digests_buf;
-    device_param->kernel_params[16] = &device_param->d_digests_shown;
-    device_param->kernel_params[17] = &device_param->d_salt_bufs;
-    device_param->kernel_params[18] = &device_param->d_esalt_bufs;
-    device_param->kernel_params[19] = &device_param->d_result;
-    device_param->kernel_params[20] = &device_param->d_extra0_buf;
-    device_param->kernel_params[21] = &device_param->d_extra1_buf;
-    device_param->kernel_params[22] = &device_param->d_extra2_buf;
-    device_param->kernel_params[23] = &device_param->d_extra3_buf;
-    device_param->kernel_params[24] = &device_param->kernel_params_buf32[24];
-    device_param->kernel_params[25] = &device_param->kernel_params_buf32[25];
-    device_param->kernel_params[26] = &device_param->kernel_params_buf32[26];
-    device_param->kernel_params[27] = &device_param->kernel_params_buf32[27];
-    device_param->kernel_params[28] = &device_param->kernel_params_buf32[28];
-    device_param->kernel_params[29] = &device_param->kernel_params_buf32[29];
-    device_param->kernel_params[30] = &device_param->kernel_params_buf32[30];
-    device_param->kernel_params[31] = &device_param->kernel_params_buf32[31];
-    device_param->kernel_params[32] = &device_param->kernel_params_buf32[32];
-    device_param->kernel_params[33] = &device_param->kernel_params_buf32[33];
-    device_param->kernel_params[34] = &device_param->kernel_params_buf64[34];
-
-    if (user_options->slow_candidates == true)
-    {
-    }
-    else
-    {
-      device_param->kernel_params_mp_buf64[3] = 0;
-      device_param->kernel_params_mp_buf32[4] = 0;
-      device_param->kernel_params_mp_buf32[5] = 0;
-      device_param->kernel_params_mp_buf32[6] = 0;
-      device_param->kernel_params_mp_buf32[7] = 0;
-      device_param->kernel_params_mp_buf64[8] = 0;
-
-      if (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL)
-      {
-        device_param->kernel_params_mp[0] = &device_param->d_combs;
-      }
-      else
-      {
-        if (user_options->attack_mode == ATTACK_MODE_HYBRID1)
-        {
-          device_param->kernel_params_mp[0] = &device_param->d_combs;
-        }
-        else
-        {
-          device_param->kernel_params_mp[0] = NULL; // (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
-                                                    // ? &device_param->d_pws_buf
-                                                    // : &device_param->d_pws_amp_buf;
-        }
-      }
-
-      device_param->kernel_params_mp[1] = &device_param->d_root_css_buf;
-      device_param->kernel_params_mp[2] = &device_param->d_markov_css_buf;
-      device_param->kernel_params_mp[3] = &device_param->kernel_params_mp_buf64[3];
-      device_param->kernel_params_mp[4] = &device_param->kernel_params_mp_buf32[4];
-      device_param->kernel_params_mp[5] = &device_param->kernel_params_mp_buf32[5];
-      device_param->kernel_params_mp[6] = &device_param->kernel_params_mp_buf32[6];
-      device_param->kernel_params_mp[7] = &device_param->kernel_params_mp_buf32[7];
-      device_param->kernel_params_mp[8] = &device_param->kernel_params_mp_buf64[8];
-
-      device_param->kernel_params_mp_l_buf64[3] = 0;
-      device_param->kernel_params_mp_l_buf32[4] = 0;
-      device_param->kernel_params_mp_l_buf32[5] = 0;
-      device_param->kernel_params_mp_l_buf32[6] = 0;
-      device_param->kernel_params_mp_l_buf32[7] = 0;
-      device_param->kernel_params_mp_l_buf32[8] = 0;
-      device_param->kernel_params_mp_l_buf64[9] = 0;
-
-      device_param->kernel_params_mp_l[0] = NULL; // (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
-                                                  // ? &device_param->d_pws_buf
-                                                  // : &device_param->d_pws_amp_buf;
-      device_param->kernel_params_mp_l[1] = &device_param->d_root_css_buf;
-      device_param->kernel_params_mp_l[2] = &device_param->d_markov_css_buf;
-      device_param->kernel_params_mp_l[3] = &device_param->kernel_params_mp_l_buf64[3];
-      device_param->kernel_params_mp_l[4] = &device_param->kernel_params_mp_l_buf32[4];
-      device_param->kernel_params_mp_l[5] = &device_param->kernel_params_mp_l_buf32[5];
-      device_param->kernel_params_mp_l[6] = &device_param->kernel_params_mp_l_buf32[6];
-      device_param->kernel_params_mp_l[7] = &device_param->kernel_params_mp_l_buf32[7];
-      device_param->kernel_params_mp_l[8] = &device_param->kernel_params_mp_l_buf32[8];
-      device_param->kernel_params_mp_l[9] = &device_param->kernel_params_mp_l_buf64[9];
-
-      device_param->kernel_params_mp_r_buf64[3] = 0;
-      device_param->kernel_params_mp_r_buf32[4] = 0;
-      device_param->kernel_params_mp_r_buf32[5] = 0;
-      device_param->kernel_params_mp_r_buf32[6] = 0;
-      device_param->kernel_params_mp_r_buf32[7] = 0;
-      device_param->kernel_params_mp_r_buf64[8] = 0;
-
-      device_param->kernel_params_mp_r[0] = &device_param->d_bfs;
-      device_param->kernel_params_mp_r[1] = &device_param->d_root_css_buf;
-      device_param->kernel_params_mp_r[2] = &device_param->d_markov_css_buf;
-      device_param->kernel_params_mp_r[3] = &device_param->kernel_params_mp_r_buf64[3];
-      device_param->kernel_params_mp_r[4] = &device_param->kernel_params_mp_r_buf32[4];
-      device_param->kernel_params_mp_r[5] = &device_param->kernel_params_mp_r_buf32[5];
-      device_param->kernel_params_mp_r[6] = &device_param->kernel_params_mp_r_buf32[6];
-      device_param->kernel_params_mp_r[7] = &device_param->kernel_params_mp_r_buf32[7];
-      device_param->kernel_params_mp_r[8] = &device_param->kernel_params_mp_r_buf64[8];
-
-      device_param->kernel_params_amp_buf32[5] = 0; // combs_mode
-      device_param->kernel_params_amp_buf64[6] = 0; // gid_max
-
-      device_param->kernel_params_amp[0] = NULL; // &device_param->d_pws_buf;
-      device_param->kernel_params_amp[1] = NULL; // &device_param->d_pws_amp_buf;
-      device_param->kernel_params_amp[2] = &device_param->d_rules_c;
-      device_param->kernel_params_amp[3] = &device_param->d_combs_c;
-      device_param->kernel_params_amp[4] = &device_param->d_bfs_c;
-      device_param->kernel_params_amp[5] = &device_param->kernel_params_amp_buf32[5];
-      device_param->kernel_params_amp[6] = &device_param->kernel_params_amp_buf64[6];
-
-      device_param->kernel_params_tm[0] = &device_param->d_bfs_c;
-      device_param->kernel_params_tm[1] = &device_param->d_tm_c;
-    }
-
-    device_param->kernel_params_memset_buf32[1] = 0; // value
-    device_param->kernel_params_memset_buf64[2] = 0; // gid_max
-
-    device_param->kernel_params_memset[0] = NULL;
-    device_param->kernel_params_memset[1] = &device_param->kernel_params_memset_buf32[1];
-    device_param->kernel_params_memset[2] = &device_param->kernel_params_memset_buf64[2];
-
-    device_param->kernel_params_atinit_buf64[1] = 0; // gid_max
-
-    device_param->kernel_params_atinit[0] = NULL;
-    device_param->kernel_params_atinit[1] = &device_param->kernel_params_atinit_buf64[1];
-
-    device_param->kernel_params_decompress_buf64[3] = 0; // gid_max
-
-    device_param->kernel_params_decompress[0] = NULL; // &device_param->d_pws_idx;
-    device_param->kernel_params_decompress[1] = NULL; // &device_param->d_pws_comp_buf;
-    device_param->kernel_params_decompress[2] = NULL; // (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
-                                                      // ? &device_param->d_pws_buf
-                                                      // : &device_param->d_pws_amp_buf;
-    device_param->kernel_params_decompress[3] = &device_param->kernel_params_decompress_buf64[3];
-
-    /**
-     * kernel name
-     */
-
-    char kernel_name[64] = { 0 };
-
-    if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
-    {
-      if (hashconfig->opti_type & OPTI_TYPE_SINGLE_HASH)
-      {
-        if (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL)
-        {
-          // kernel1
-
-          snprintf (kernel_name, sizeof (kernel_name), "m%05u_s%02d", kern_type, 4);
-
-          CL_rc = hc_clCreateKernel (hashcat_ctx, device_param->program, kernel_name, &device_param->kernel1);
-
-          if (CL_rc == -1) return -1;
-
-          CL_rc = get_kernel_wgs (hashcat_ctx, device_param, device_param->kernel1, &device_param->kernel_wgs1);
-
-          if (CL_rc == -1) return -1;
-
-          CL_rc = get_kernel_local_mem_size (hashcat_ctx, device_param, device_param->kernel1, &device_param->kernel_local_mem_size1);
-
-          if (CL_rc == -1) return -1;
-
-          CL_rc = get_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->kernel1, &device_param->kernel_preferred_wgs_multiple1);
-
-          if (CL_rc == -1) return -1;
-
-          // kernel2
-
-          snprintf (kernel_name, sizeof (kernel_name), "m%05u_s%02d", kern_type, 8);
-
-          CL_rc = hc_clCreateKernel (hashcat_ctx, device_param->program, kernel_name, &device_param->kernel2);
-
-          if (CL_rc == -1) return -1;
-
-          CL_rc = get_kernel_wgs (hashcat_ctx, device_param, device_param->kernel2, &device_param->kernel_wgs2);
-
-          if (CL_rc == -1) return -1;
-
-          CL_rc = get_kernel_local_mem_size (hashcat_ctx, device_param, device_param->kernel2, &device_param->kernel_local_mem_size2);
-
-          if (CL_rc == -1) return -1;
-
-          CL_rc = get_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->kernel2, &device_param->kernel_preferred_wgs_multiple2);
-
-          if (CL_rc == -1) return -1;
-
-          // kernel3
-
-          snprintf (kernel_name, sizeof (kernel_name), "m%05u_s%02d", kern_type, 16);
-
-          CL_rc = hc_clCreateKernel (hashcat_ctx, device_param->program, kernel_name, &device_param->kernel3);
-
-          if (CL_rc == -1) return -1;
-
-          CL_rc = get_kernel_wgs (hashcat_ctx, device_param, device_param->kernel3, &device_param->kernel_wgs3);
-
-          if (CL_rc == -1) return -1;
-
-          CL_rc = get_kernel_local_mem_size (hashcat_ctx, device_param, device_param->kernel3, &device_param->kernel_local_mem_size3);
-
-          if (CL_rc == -1) return -1;
-
-          CL_rc = get_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->kernel3, &device_param->kernel_preferred_wgs_multiple3);
-
-          if (CL_rc == -1) return -1;
-        }
-        else
-        {
-          snprintf (kernel_name, sizeof (kernel_name), "m%05u_sxx", kern_type);
-
-          CL_rc = hc_clCreateKernel (hashcat_ctx, device_param->program, kernel_name, &device_param->kernel4);
-
-          if (CL_rc == -1) return -1;
-
-          CL_rc = get_kernel_wgs (hashcat_ctx, device_param, device_param->kernel4, &device_param->kernel_wgs4);
-
-          if (CL_rc == -1) return -1;
-
-          CL_rc = get_kernel_local_mem_size (hashcat_ctx, device_param, device_param->kernel4, &device_param->kernel_local_mem_size4);
-
-          if (CL_rc == -1) return -1;
-
-          CL_rc = get_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->kernel4, &device_param->kernel_preferred_wgs_multiple4);
-
-          if (CL_rc == -1) return -1;
-        }
-      }
-      else
-      {
-        if (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL)
-        {
-          // kernel1
-
-          snprintf (kernel_name, sizeof (kernel_name), "m%05u_m%02d", kern_type, 4);
-
-          CL_rc = hc_clCreateKernel (hashcat_ctx, device_param->program, kernel_name, &device_param->kernel1);
-
-          if (CL_rc == -1) return -1;
-
-          CL_rc = get_kernel_wgs (hashcat_ctx, device_param, device_param->kernel1, &device_param->kernel_wgs1);
-
-          if (CL_rc == -1) return -1;
-
-          CL_rc = get_kernel_local_mem_size (hashcat_ctx, device_param, device_param->kernel1, &device_param->kernel_local_mem_size1);
-
-          if (CL_rc == -1) return -1;
-
-          CL_rc = get_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->kernel1, &device_param->kernel_preferred_wgs_multiple1);
-
-          if (CL_rc == -1) return -1;
-
-          // kernel2
-
-          snprintf (kernel_name, sizeof (kernel_name), "m%05u_m%02d", kern_type, 8);
-
-          CL_rc = hc_clCreateKernel (hashcat_ctx, device_param->program, kernel_name, &device_param->kernel2);
-
-          if (CL_rc == -1) return -1;
-
-          CL_rc = get_kernel_wgs (hashcat_ctx, device_param, device_param->kernel2, &device_param->kernel_wgs2);
-
-          if (CL_rc == -1) return -1;
-
-          CL_rc = get_kernel_local_mem_size (hashcat_ctx, device_param, device_param->kernel2, &device_param->kernel_local_mem_size2);
-
-          if (CL_rc == -1) return -1;
-
-          CL_rc = get_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->kernel2, &device_param->kernel_preferred_wgs_multiple2);
-
-          if (CL_rc == -1) return -1;
-
-          // kernel3
-
-          snprintf (kernel_name, sizeof (kernel_name), "m%05u_m%02d", kern_type, 16);
-
-          CL_rc = hc_clCreateKernel (hashcat_ctx, device_param->program, kernel_name, &device_param->kernel3);
-
-          if (CL_rc == -1) return -1;
-
-          CL_rc = get_kernel_wgs (hashcat_ctx, device_param, device_param->kernel3, &device_param->kernel_wgs3);
-
-          if (CL_rc == -1) return -1;
-
-          CL_rc = get_kernel_local_mem_size (hashcat_ctx, device_param, device_param->kernel3, &device_param->kernel_local_mem_size3);
-
-          if (CL_rc == -1) return -1;
-
-          CL_rc = get_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->kernel3, &device_param->kernel_preferred_wgs_multiple3);
-
-          if (CL_rc == -1) return -1;
-        }
-        else
-        {
-          snprintf (kernel_name, sizeof (kernel_name), "m%05u_mxx", kern_type);
-
-          CL_rc = hc_clCreateKernel (hashcat_ctx, device_param->program, kernel_name, &device_param->kernel4);
-
-          if (CL_rc == -1) return -1;
-
-          CL_rc = get_kernel_wgs (hashcat_ctx, device_param, device_param->kernel4, &device_param->kernel_wgs4);
-
-          if (CL_rc == -1) return -1;
-
-          CL_rc = get_kernel_local_mem_size (hashcat_ctx, device_param, device_param->kernel4, &device_param->kernel_local_mem_size4);
-
-          if (CL_rc == -1) return -1;
-
-          CL_rc = get_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->kernel4, &device_param->kernel_preferred_wgs_multiple4);
-
-          if (CL_rc == -1) return -1;
-        }
-      }
-
-      if (user_options->slow_candidates == true)
-      {
-      }
-      else
-      {
-        if (user_options->attack_mode == ATTACK_MODE_BF)
-        {
-          if (hashconfig->opts_type & OPTS_TYPE_PT_BITSLICE)
-          {
-            snprintf (kernel_name, sizeof (kernel_name), "m%05u_tm", kern_type);
-
-            CL_rc = hc_clCreateKernel (hashcat_ctx, device_param->program, kernel_name, &device_param->kernel_tm);
-
-            if (CL_rc == -1) return -1;
-
-            CL_rc = get_kernel_wgs (hashcat_ctx, device_param, device_param->kernel_tm, &device_param->kernel_wgs_tm);
-
-            if (CL_rc == -1) return -1;
-
-            CL_rc = get_kernel_local_mem_size (hashcat_ctx, device_param, device_param->kernel_tm, &device_param->kernel_local_mem_size_tm);
-
-            if (CL_rc == -1) return -1;
-
-            CL_rc = get_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->kernel_tm, &device_param->kernel_preferred_wgs_multiple_tm);
-
-            if (CL_rc == -1) return -1;
-          }
-        }
-      }
-    }
-    else
-    {
-      // kernel1
-
-      snprintf (kernel_name, sizeof (kernel_name), "m%05u_init", kern_type);
-
-      CL_rc = hc_clCreateKernel (hashcat_ctx, device_param->program, kernel_name, &device_param->kernel1);
-
-      if (CL_rc == -1) return -1;
-
-      CL_rc = get_kernel_wgs (hashcat_ctx, device_param, device_param->kernel1, &device_param->kernel_wgs1);
-
-      if (CL_rc == -1) return -1;
-
-      CL_rc = get_kernel_local_mem_size (hashcat_ctx, device_param, device_param->kernel1, &device_param->kernel_local_mem_size1);
-
-      if (CL_rc == -1) return -1;
-
-      CL_rc = get_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->kernel1, &device_param->kernel_preferred_wgs_multiple1);
-
-      if (CL_rc == -1) return -1;
-
-      // kernel2
-
-      snprintf (kernel_name, sizeof (kernel_name), "m%05u_loop", kern_type);
-
-      CL_rc = hc_clCreateKernel (hashcat_ctx, device_param->program, kernel_name, &device_param->kernel2);
-
-      if (CL_rc == -1) return -1;
-
-      CL_rc = get_kernel_wgs (hashcat_ctx, device_param, device_param->kernel2, &device_param->kernel_wgs2);
-
-      if (CL_rc == -1) return -1;
-
-      CL_rc = get_kernel_local_mem_size (hashcat_ctx, device_param, device_param->kernel2, &device_param->kernel_local_mem_size2);
-
-      if (CL_rc == -1) return -1;
-
-      CL_rc = get_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->kernel2, &device_param->kernel_preferred_wgs_multiple2);
-
-      if (CL_rc == -1) return -1;
-
-      // kernel3
-
-      snprintf (kernel_name, sizeof (kernel_name), "m%05u_comp", kern_type);
-
-      CL_rc = hc_clCreateKernel (hashcat_ctx, device_param->program, kernel_name, &device_param->kernel3);
-
-      if (CL_rc == -1) return -1;
-
-      CL_rc = get_kernel_wgs (hashcat_ctx, device_param, device_param->kernel3, &device_param->kernel_wgs3);
-
-      if (CL_rc == -1) return -1;
-
-      CL_rc = get_kernel_local_mem_size (hashcat_ctx, device_param, device_param->kernel3, &device_param->kernel_local_mem_size3);
-
-      if (CL_rc == -1) return -1;
-
-      CL_rc = get_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->kernel3, &device_param->kernel_preferred_wgs_multiple3);
-
-      if (CL_rc == -1) return -1;
-
-      // kernel12
-
-      if (hashconfig->opts_type & OPTS_TYPE_HOOK12)
-      {
-        snprintf (kernel_name, sizeof (kernel_name), "m%05u_hook12", kern_type);
-
-        CL_rc = hc_clCreateKernel (hashcat_ctx, device_param->program, kernel_name, &device_param->kernel12);
-
-        if (CL_rc == -1) return -1;
-
-        CL_rc = get_kernel_wgs (hashcat_ctx, device_param, device_param->kernel12, &device_param->kernel_wgs12);
-
-        if (CL_rc == -1) return -1;
-
-        CL_rc = get_kernel_local_mem_size (hashcat_ctx, device_param, device_param->kernel12, &device_param->kernel_local_mem_size12);
-
-        if (CL_rc == -1) return -1;
-
-        CL_rc = get_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->kernel12, &device_param->kernel_preferred_wgs_multiple12);
-
-        if (CL_rc == -1) return -1;
-      }
-
-      // kernel23
-
-      if (hashconfig->opts_type & OPTS_TYPE_HOOK23)
-      {
-        snprintf (kernel_name, sizeof (kernel_name), "m%05u_hook23", kern_type);
-
-        CL_rc = hc_clCreateKernel (hashcat_ctx, device_param->program, kernel_name, &device_param->kernel23);
-
-        if (CL_rc == -1) return -1;
-
-        CL_rc = get_kernel_wgs (hashcat_ctx, device_param, device_param->kernel23, &device_param->kernel_wgs23);
-
-        if (CL_rc == -1) return -1;
-
-        CL_rc = get_kernel_local_mem_size (hashcat_ctx, device_param, device_param->kernel23, &device_param->kernel_local_mem_size23);
-
-        if (CL_rc == -1) return -1;
-
-        CL_rc = get_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->kernel23, &device_param->kernel_preferred_wgs_multiple23);
-
-        if (CL_rc == -1) return -1;
-      }
-
-      // init2
-
-      if (hashconfig->opts_type & OPTS_TYPE_INIT2)
-      {
-        snprintf (kernel_name, sizeof (kernel_name), "m%05u_init2", kern_type);
-
-        CL_rc = hc_clCreateKernel (hashcat_ctx, device_param->program, kernel_name, &device_param->kernel_init2);
-
-        if (CL_rc == -1) return -1;
-
-        CL_rc = get_kernel_wgs (hashcat_ctx, device_param, device_param->kernel_init2, &device_param->kernel_wgs_init2);
-
-        if (CL_rc == -1) return -1;
-
-        CL_rc = get_kernel_local_mem_size (hashcat_ctx, device_param, device_param->kernel_init2, &device_param->kernel_local_mem_size_init2);
-
-        if (CL_rc == -1) return -1;
-
-        CL_rc = get_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->kernel_init2, &device_param->kernel_preferred_wgs_multiple_init2);
-
-        if (CL_rc == -1) return -1;
-      }
-
-      // loop2
-
-      if (hashconfig->opts_type & OPTS_TYPE_LOOP2)
-      {
-        snprintf (kernel_name, sizeof (kernel_name), "m%05u_loop2", kern_type);
-
-        CL_rc = hc_clCreateKernel (hashcat_ctx, device_param->program, kernel_name, &device_param->kernel_loop2);
-
-        if (CL_rc == -1) return -1;
-
-        CL_rc = get_kernel_wgs (hashcat_ctx, device_param, device_param->kernel_loop2, &device_param->kernel_wgs_loop2);
-
-        if (CL_rc == -1) return -1;
-
-        CL_rc = get_kernel_local_mem_size (hashcat_ctx, device_param, device_param->kernel_loop2, &device_param->kernel_local_mem_size_loop2);
-
-        if (CL_rc == -1) return -1;
-
-        CL_rc = get_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->kernel_loop2, &device_param->kernel_preferred_wgs_multiple_loop2);
-
-        if (CL_rc == -1) return -1;
-      }
-
-      // aux1
-
-      if (hashconfig->opts_type & OPTS_TYPE_AUX1)
-      {
-        snprintf (kernel_name, sizeof (kernel_name), "m%05u_aux1", kern_type);
-
-        CL_rc = hc_clCreateKernel (hashcat_ctx, device_param->program, kernel_name, &device_param->kernel_aux1);
-
-        if (CL_rc == -1) return -1;
-
-        CL_rc = get_kernel_wgs (hashcat_ctx, device_param, device_param->kernel_aux1, &device_param->kernel_wgs_aux1);
-
-        if (CL_rc == -1) return -1;
-
-        CL_rc = get_kernel_local_mem_size (hashcat_ctx, device_param, device_param->kernel_aux1, &device_param->kernel_local_mem_size_aux1);
-
-        if (CL_rc == -1) return -1;
-
-        CL_rc = get_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->kernel_aux1, &device_param->kernel_preferred_wgs_multiple_aux1);
-
-        if (CL_rc == -1) return -1;
-      }
-
-      // aux2
-
-      if (hashconfig->opts_type & OPTS_TYPE_AUX2)
-      {
-        snprintf (kernel_name, sizeof (kernel_name), "m%05u_aux2", kern_type);
-
-        CL_rc = hc_clCreateKernel (hashcat_ctx, device_param->program, kernel_name, &device_param->kernel_aux2);
-
-        if (CL_rc == -1) return -1;
-
-        CL_rc = get_kernel_wgs (hashcat_ctx, device_param, device_param->kernel_aux2, &device_param->kernel_wgs_aux2);
-
-        if (CL_rc == -1) return -1;
-
-        CL_rc = get_kernel_local_mem_size (hashcat_ctx, device_param, device_param->kernel_aux2, &device_param->kernel_local_mem_size_aux2);
-
-        if (CL_rc == -1) return -1;
-
-        CL_rc = get_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->kernel_aux2, &device_param->kernel_preferred_wgs_multiple_aux2);
-
-        if (CL_rc == -1) return -1;
-      }
-
-      // aux3
-
-      if (hashconfig->opts_type & OPTS_TYPE_AUX3)
-      {
-        snprintf (kernel_name, sizeof (kernel_name), "m%05u_aux3", kern_type);
-
-        CL_rc = hc_clCreateKernel (hashcat_ctx, device_param->program, kernel_name, &device_param->kernel_aux3);
-
-        if (CL_rc == -1) return -1;
-
-        CL_rc = get_kernel_wgs (hashcat_ctx, device_param, device_param->kernel_aux3, &device_param->kernel_wgs_aux3);
-
-        if (CL_rc == -1) return -1;
-
-        CL_rc = get_kernel_local_mem_size (hashcat_ctx, device_param, device_param->kernel_aux3, &device_param->kernel_local_mem_size_aux3);
-
-        if (CL_rc == -1) return -1;
-
-        CL_rc = get_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->kernel_aux3, &device_param->kernel_preferred_wgs_multiple_aux3);
-
-        if (CL_rc == -1) return -1;
-      }
-
-      // aux4
-
-      if (hashconfig->opts_type & OPTS_TYPE_AUX4)
-      {
-        snprintf (kernel_name, sizeof (kernel_name), "m%05u_aux4", kern_type);
-
-        CL_rc = hc_clCreateKernel (hashcat_ctx, device_param->program, kernel_name, &device_param->kernel_aux4);
-
-        if (CL_rc == -1) return -1;
-
-        CL_rc = get_kernel_wgs (hashcat_ctx, device_param, device_param->kernel_aux4, &device_param->kernel_wgs_aux4);
-
-        if (CL_rc == -1) return -1;
-
-        CL_rc = get_kernel_local_mem_size (hashcat_ctx, device_param, device_param->kernel_aux4, &device_param->kernel_local_mem_size_aux4);
-
-        if (CL_rc == -1) return -1;
-
-        CL_rc = get_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->kernel_aux4, &device_param->kernel_preferred_wgs_multiple_aux4);
-
-        if (CL_rc == -1) return -1;
-      }
-    }
-
-    // GPU memset
-
-    CL_rc = hc_clCreateKernel (hashcat_ctx, device_param->program, "gpu_memset", &device_param->kernel_memset);
-
-    if (CL_rc == -1) return -1;
-
-    CL_rc = get_kernel_wgs (hashcat_ctx, device_param, device_param->kernel_memset, &device_param->kernel_wgs_memset);
-
-    if (CL_rc == -1) return -1;
-
-    CL_rc = get_kernel_local_mem_size (hashcat_ctx, device_param, device_param->kernel_memset, &device_param->kernel_local_mem_size_memset);
-
-    if (CL_rc == -1) return -1;
-
-    CL_rc = get_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->kernel_memset, &device_param->kernel_preferred_wgs_multiple_memset);
-
-    if (CL_rc == -1) return -1;
-
-    CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->kernel_memset, 0, sizeof (cl_mem),   device_param->kernel_params_memset[0]); if (CL_rc == -1) return -1;
-    CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->kernel_memset, 1, sizeof (cl_uint),  device_param->kernel_params_memset[1]); if (CL_rc == -1) return -1;
-    CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->kernel_memset, 2, sizeof (cl_ulong), device_param->kernel_params_memset[2]); if (CL_rc == -1) return -1;
-
-    // GPU autotune init
-
-    CL_rc = hc_clCreateKernel (hashcat_ctx, device_param->program, "gpu_atinit", &device_param->kernel_atinit);
-
-    if (CL_rc == -1) return -1;
-
-    CL_rc = get_kernel_wgs (hashcat_ctx, device_param, device_param->kernel_atinit, &device_param->kernel_wgs_atinit);
-
-    if (CL_rc == -1) return -1;
-
-    CL_rc = get_kernel_local_mem_size (hashcat_ctx, device_param, device_param->kernel_atinit, &device_param->kernel_local_mem_size_atinit);
-
-    if (CL_rc == -1) return -1;
-
-    CL_rc = get_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->kernel_atinit, &device_param->kernel_preferred_wgs_multiple_atinit);
-
-    if (CL_rc == -1) return -1;
-
-    CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->kernel_atinit, 0, sizeof (cl_mem),   device_param->kernel_params_atinit[0]); if (CL_rc == -1) return -1;
-    CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->kernel_atinit, 1, sizeof (cl_ulong), device_param->kernel_params_atinit[1]); if (CL_rc == -1) return -1;
-
-    // GPU decompress
-
-    CL_rc = hc_clCreateKernel (hashcat_ctx, device_param->program, "gpu_decompress", &device_param->kernel_decompress);
-
-    if (CL_rc == -1) return -1;
-
-    CL_rc = get_kernel_wgs (hashcat_ctx, device_param, device_param->kernel_decompress, &device_param->kernel_wgs_decompress);
-
-    if (CL_rc == -1) return -1;
-
-    CL_rc = get_kernel_local_mem_size (hashcat_ctx, device_param, device_param->kernel_decompress, &device_param->kernel_local_mem_size_decompress);
-
-    if (CL_rc == -1) return -1;
-
-    CL_rc = get_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->kernel_decompress, &device_param->kernel_preferred_wgs_multiple_decompress);
-
-    if (CL_rc == -1) return -1;
-
-    CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->kernel_decompress, 0, sizeof (cl_mem),   device_param->kernel_params_decompress[0]); if (CL_rc == -1) return -1;
-    CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->kernel_decompress, 1, sizeof (cl_mem),   device_param->kernel_params_decompress[1]); if (CL_rc == -1) return -1;
-    CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->kernel_decompress, 2, sizeof (cl_mem),   device_param->kernel_params_decompress[2]); if (CL_rc == -1) return -1;
-    CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->kernel_decompress, 3, sizeof (cl_ulong), device_param->kernel_params_decompress[3]); if (CL_rc == -1) return -1;
-
-    // MP start
-
-    if (user_options->slow_candidates == true)
-    {
-    }
-    else
-    {
-      if (user_options->attack_mode == ATTACK_MODE_BF)
-      {
-        // mp_l
-
-        CL_rc = hc_clCreateKernel (hashcat_ctx, device_param->program_mp, "l_markov", &device_param->kernel_mp_l);
-
-        if (CL_rc == -1) return -1;
-
-        CL_rc = get_kernel_wgs (hashcat_ctx, device_param, device_param->kernel_mp_l, &device_param->kernel_wgs_mp_l);
-
-        if (CL_rc == -1) return -1;
-
-        CL_rc = get_kernel_local_mem_size (hashcat_ctx, device_param, device_param->kernel_mp_l, &device_param->kernel_local_mem_size_mp_l);
-
-        if (CL_rc == -1) return -1;
-
-        CL_rc = get_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->kernel_mp_l, &device_param->kernel_preferred_wgs_multiple_mp_l);
-
-        if (CL_rc == -1) return -1;
-
-        // mp_r
-
-        CL_rc = hc_clCreateKernel (hashcat_ctx, device_param->program_mp, "r_markov", &device_param->kernel_mp_r);
-
-        if (CL_rc == -1) return -1;
-
-        CL_rc = get_kernel_wgs (hashcat_ctx, device_param, device_param->kernel_mp_r, &device_param->kernel_wgs_mp_r);
-
-        if (CL_rc == -1) return -1;
-
-        CL_rc = get_kernel_local_mem_size (hashcat_ctx, device_param, device_param->kernel_mp_r, &device_param->kernel_local_mem_size_mp_r);
-
-        if (CL_rc == -1) return -1;
-
-        CL_rc = get_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->kernel_mp_r, &device_param->kernel_preferred_wgs_multiple_mp_r);
-
-        if (CL_rc == -1) return -1;
-
-        if (hashconfig->opts_type & OPTS_TYPE_PT_BITSLICE)
-        {
-          CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->kernel_tm, 0, sizeof (cl_mem), device_param->kernel_params_tm[0]); if (CL_rc == -1) return -1;
-          CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->kernel_tm, 1, sizeof (cl_mem), device_param->kernel_params_tm[1]); if (CL_rc == -1) return -1;
-        }
-      }
-      else if (user_options->attack_mode == ATTACK_MODE_HYBRID1)
-      {
-        CL_rc = hc_clCreateKernel (hashcat_ctx, device_param->program_mp, "C_markov", &device_param->kernel_mp);
-
-        if (CL_rc == -1) return -1;
-
-        CL_rc = get_kernel_wgs (hashcat_ctx, device_param, device_param->kernel_mp, &device_param->kernel_wgs_mp);
-
-        if (CL_rc == -1) return -1;
-
-        CL_rc = get_kernel_local_mem_size (hashcat_ctx, device_param, device_param->kernel_mp, &device_param->kernel_local_mem_size_mp);
-
-        if (CL_rc == -1) return -1;
-
-        CL_rc = get_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->kernel_mp, &device_param->kernel_preferred_wgs_multiple_mp);
-
-        if (CL_rc == -1) return -1;
-      }
-      else if (user_options->attack_mode == ATTACK_MODE_HYBRID2)
-      {
-        CL_rc = hc_clCreateKernel (hashcat_ctx, device_param->program_mp, "C_markov", &device_param->kernel_mp);
-
-        if (CL_rc == -1) return -1;
-
-        CL_rc = get_kernel_wgs (hashcat_ctx, device_param, device_param->kernel_mp, &device_param->kernel_wgs_mp);
-
-        if (CL_rc == -1) return -1;
-
-        CL_rc = get_kernel_local_mem_size (hashcat_ctx, device_param, device_param->kernel_mp, &device_param->kernel_local_mem_size_mp);
-
-        if (CL_rc == -1) return -1;
-
-        CL_rc = get_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->kernel_mp, &device_param->kernel_preferred_wgs_multiple_mp);
-
-        if (CL_rc == -1) return -1;
-      }
-    }
-
-    if (user_options->slow_candidates == true)
-    {
-    }
-    else
-    {
-      if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
-      {
-        // nothing to do
-      }
-      else
-      {
-        CL_rc = hc_clCreateKernel (hashcat_ctx, device_param->program_amp, "amp", &device_param->kernel_amp);
-
-        if (CL_rc == -1) return -1;
-
-        CL_rc = get_kernel_wgs (hashcat_ctx, device_param, device_param->kernel_amp, &device_param->kernel_wgs_amp);
-
-        if (CL_rc == -1) return -1;
-
-        CL_rc = get_kernel_local_mem_size (hashcat_ctx, device_param, device_param->kernel_amp, &device_param->kernel_local_mem_size_amp);
-
-        if (CL_rc == -1) return -1;
-
-        CL_rc = get_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->kernel_amp, &device_param->kernel_preferred_wgs_multiple_amp);
-
-        if (CL_rc == -1) return -1;
-      }
-
-      if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
-      {
-        // nothing to do
-      }
-      else
-      {
-        for (u32 i = 0; i < 5; i++)
-        {
-          CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->kernel_amp, i, sizeof (cl_mem), device_param->kernel_params_amp[i]);
-
-          if (CL_rc == -1) return -1;
-        }
-
-        for (u32 i = 5; i < 6; i++)
-        {
-          CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->kernel_amp, i, sizeof (cl_uint), device_param->kernel_params_amp[i]);
-
-          if (CL_rc == -1) return -1;
-        }
-
-        for (u32 i = 6; i < 7; i++)
-        {
-          CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->kernel_amp, i, sizeof (cl_ulong), device_param->kernel_params_amp[i]);
-
-          if (CL_rc == -1) return -1;
-        }
-      }
-    }
-
-    // zero some data buffers
-
-    CL_rc = run_kernel_bzero (hashcat_ctx, device_param, device_param->d_plain_bufs,    device_param->size_plains);   if (CL_rc == -1) return -1;
-    CL_rc = run_kernel_bzero (hashcat_ctx, device_param, device_param->d_digests_shown, device_param->size_shown);    if (CL_rc == -1) return -1;
-    CL_rc = run_kernel_bzero (hashcat_ctx, device_param, device_param->d_result,        device_param->size_results);  if (CL_rc == -1) return -1;
-
-    /**
-     * special buffers
-     */
-
-    if (user_options->slow_candidates == true)
-    {
-      CL_rc = run_kernel_bzero (hashcat_ctx, device_param, device_param->d_rules_c, size_rules_c); if (CL_rc == -1) return -1;
-    }
-    else
-    {
-      if (user_options_extra->attack_kern == ATTACK_KERN_STRAIGHT)
-      {
-        CL_rc = run_kernel_bzero (hashcat_ctx, device_param, device_param->d_rules_c, size_rules_c); if (CL_rc == -1) return -1;
-      }
-      else if (user_options_extra->attack_kern == ATTACK_KERN_COMBI)
-      {
-        CL_rc = run_kernel_bzero (hashcat_ctx, device_param, device_param->d_combs,          size_combs);       if (CL_rc == -1) return -1;
-        CL_rc = run_kernel_bzero (hashcat_ctx, device_param, device_param->d_combs_c,        size_combs);       if (CL_rc == -1) return -1;
-        CL_rc = run_kernel_bzero (hashcat_ctx, device_param, device_param->d_root_css_buf,   size_root_css);    if (CL_rc == -1) return -1;
-        CL_rc = run_kernel_bzero (hashcat_ctx, device_param, device_param->d_markov_css_buf, size_markov_css);  if (CL_rc == -1) return -1;
-      }
-      else if (user_options_extra->attack_kern == ATTACK_KERN_BF)
-      {
-        CL_rc = run_kernel_bzero (hashcat_ctx, device_param, device_param->d_bfs,            size_bfs);         if (CL_rc == -1) return -1;
-        CL_rc = run_kernel_bzero (hashcat_ctx, device_param, device_param->d_bfs_c,          size_bfs);         if (CL_rc == -1) return -1;
-        CL_rc = run_kernel_bzero (hashcat_ctx, device_param, device_param->d_tm_c,           size_tm);          if (CL_rc == -1) return -1;
-        CL_rc = run_kernel_bzero (hashcat_ctx, device_param, device_param->d_root_css_buf,   size_root_css);    if (CL_rc == -1) return -1;
-        CL_rc = run_kernel_bzero (hashcat_ctx, device_param, device_param->d_markov_css_buf, size_markov_css);  if (CL_rc == -1) return -1;
-      }
-    }
-
-    if (user_options->slow_candidates == true)
-    {
-    }
-    else
-    {
-      if ((user_options->attack_mode == ATTACK_MODE_HYBRID1) || (user_options->attack_mode == ATTACK_MODE_HYBRID2))
-      {
-        /**
-         * prepare mp
-         */
-
-        if (user_options->attack_mode == ATTACK_MODE_HYBRID1)
-        {
-          device_param->kernel_params_mp_buf32[5] = 0;
-          device_param->kernel_params_mp_buf32[6] = 0;
-          device_param->kernel_params_mp_buf32[7] = 0;
-
-          if (hashconfig->opts_type & OPTS_TYPE_PT_ADD01)     device_param->kernel_params_mp_buf32[5] = full01;
-          if (hashconfig->opts_type & OPTS_TYPE_PT_ADD06)     device_param->kernel_params_mp_buf32[5] = full06;
-          if (hashconfig->opts_type & OPTS_TYPE_PT_ADD80)     device_param->kernel_params_mp_buf32[5] = full80;
-          if (hashconfig->opts_type & OPTS_TYPE_PT_ADDBITS14) device_param->kernel_params_mp_buf32[6] = 1;
-          if (hashconfig->opts_type & OPTS_TYPE_PT_ADDBITS15) device_param->kernel_params_mp_buf32[7] = 1;
-        }
-        else if (user_options->attack_mode == ATTACK_MODE_HYBRID2)
-        {
-          device_param->kernel_params_mp_buf32[5] = 0;
-          device_param->kernel_params_mp_buf32[6] = 0;
-          device_param->kernel_params_mp_buf32[7] = 0;
-        }
-
-        for (u32 i = 0; i < 3; i++) { CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->kernel_mp, i, sizeof (cl_mem), device_param->kernel_params_mp[i]); if (CL_rc == -1) return -1; }
-      }
-      else if (user_options->attack_mode == ATTACK_MODE_BF)
-      {
-        /**
-         * prepare mp_r and mp_l
-         */
-
-        device_param->kernel_params_mp_l_buf32[6] = 0;
-        device_param->kernel_params_mp_l_buf32[7] = 0;
-        device_param->kernel_params_mp_l_buf32[8] = 0;
-
-        if (hashconfig->opts_type & OPTS_TYPE_PT_ADD01)     device_param->kernel_params_mp_l_buf32[6] = full01;
-        if (hashconfig->opts_type & OPTS_TYPE_PT_ADD06)     device_param->kernel_params_mp_l_buf32[6] = full06;
-        if (hashconfig->opts_type & OPTS_TYPE_PT_ADD80)     device_param->kernel_params_mp_l_buf32[6] = full80;
-        if (hashconfig->opts_type & OPTS_TYPE_PT_ADDBITS14) device_param->kernel_params_mp_l_buf32[7] = 1;
-        if (hashconfig->opts_type & OPTS_TYPE_PT_ADDBITS15) device_param->kernel_params_mp_l_buf32[8] = 1;
-
-        for (u32 i = 0; i < 3; i++) { CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->kernel_mp_l, i, sizeof (cl_mem), device_param->kernel_params_mp_l[i]); if (CL_rc == -1) return -1; }
-        for (u32 i = 0; i < 3; i++) { CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->kernel_mp_r, i, sizeof (cl_mem), device_param->kernel_params_mp_r[i]); if (CL_rc == -1) return -1; }
-      }
-    }
-
-    /**
-     * now everything that depends on threads and accel, basically dynamic workload
-     */
-
-    u32 kernel_threads = get_kernel_threads (hashcat_ctx, device_param);
-
-    // this is required because inside the kernels there is this:
-    // __local pw_t s_pws[64];
-
-    if (user_options->attack_mode == ATTACK_MODE_STRAIGHT)
-    {
-      if (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL)
-      {
-        // not required
-      }
-      else
-      {
-        kernel_threads = MIN (kernel_threads, 64);
-      }
-    }
-
-    device_param->kernel_threads = kernel_threads;
-
-    device_param->hardware_power = device_processors * kernel_threads;
-
-    u32 kernel_accel_min = device_param->kernel_accel_min;
-    u32 kernel_accel_max = device_param->kernel_accel_max;
-
-    // find out if we would request too much memory on memory blocks which are based on kernel_accel
-
-    u64 size_pws      = 4;
-    u64 size_pws_amp  = 4;
-    u64 size_pws_comp = 4;
-    u64 size_pws_idx  = 4;
-    u64 size_pws_pre  = 4;
-    u64 size_pws_base = 4;
-    u64 size_tmps     = 4;
-    u64 size_hooks    = 4;
-    #ifdef WITH_BRAIN
-    u64 size_brain_link_in  = 4;
-    u64 size_brain_link_out = 4;
-    #endif
-
-    // instead of a thread limit we can also use a memory limit.
-    // this value should represent a reasonable amount of memory a host system has per GPU.
-    // note we're allocating 3 blocks of that size.
-
-    #define PWS_SPACE (1024 * 1024 * 1024)
-
-    // sometimes device_available_mem and device_maxmem_alloc reported back from the opencl runtime are a bit inaccurate.
-    // let's add some extra space just to be sure.
-
-    #define EXTRA_SPACE (64 * 1024 * 1024)
-
-    while (kernel_accel_max >= kernel_accel_min)
-    {
-      const u64 kernel_power_max = device_param->hardware_power * kernel_accel_max;
-
-      // size_pws
-
-      size_pws = (u64) kernel_power_max * sizeof (pw_t);
-
-      size_pws_amp = (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL) ? 1 : size_pws;
-
-      // size_pws_comp
-
-      size_pws_comp = (u64) kernel_power_max * (sizeof (u32) * 64);
-
-      // size_pws_idx
-
-      size_pws_idx = (u64) (kernel_power_max + 1) * sizeof (pw_idx_t);
-
-      // size_tmps
-
-      size_tmps = (u64) kernel_power_max * (hashconfig->tmp_size + hashconfig->extra_tmp_size);
-
-      // size_hooks
-
-      size_hooks = (u64) kernel_power_max * hashconfig->hook_size;
-
-      #ifdef WITH_BRAIN
-      // size_brains
-
-      size_brain_link_in  = (u64) kernel_power_max * 1;
-      size_brain_link_out = (u64) kernel_power_max * 8;
-      #endif
-
-      if (user_options->slow_candidates == true)
-      {
-        // size_pws_pre
-
-        size_pws_pre = (u64) kernel_power_max * sizeof (pw_pre_t);
-
-        // size_pws_base
-
-        size_pws_base = (u64) kernel_power_max * sizeof (pw_pre_t);
-      }
-
-      // now check if all device-memory sizes which depend on the kernel_accel_max amplifier are within its boundaries
-      // if not, decrease amplifier and try again
-
-      int memory_limit_hit = 0;
-
-      if (size_pws > PWS_SPACE) memory_limit_hit = 1;
-
-      if ((size_pws   + EXTRA_SPACE) > device_param->device_maxmem_alloc) memory_limit_hit = 1;
-      if ((size_tmps  + EXTRA_SPACE) > device_param->device_maxmem_alloc) memory_limit_hit = 1;
-      if ((size_hooks + EXTRA_SPACE) > device_param->device_maxmem_alloc) memory_limit_hit = 1;
-
-      const u64 size_total
-        = bitmap_ctx->bitmap_size
-        + bitmap_ctx->bitmap_size
-        + bitmap_ctx->bitmap_size
-        + bitmap_ctx->bitmap_size
-        + bitmap_ctx->bitmap_size
-        + bitmap_ctx->bitmap_size
-        + bitmap_ctx->bitmap_size
-        + bitmap_ctx->bitmap_size
-        + size_bfs
-        + size_combs
-        + size_digests
-        + size_esalts
-        + size_hooks
-        + size_markov_css
-        + size_plains
-        + size_pws
-        + size_pws_amp
-        + size_pws_comp
-        + size_pws_idx
-        + size_results
-        + size_root_css
-        + size_rules
-        + size_rules_c
-        + size_salts
-        + size_extra_buffer
-        + size_shown
-        + size_tm
-        + size_tmps
-        + size_st_digests
-        + size_st_salts
-        + size_st_esalts;
-
-      if ((size_total + EXTRA_SPACE) > device_param->device_available_mem) memory_limit_hit = 1;
-
-      const u64 size_total_host
-        = size_pws_comp
-        + size_pws_idx
-        + size_hooks
-        #ifdef WITH_BRAIN
-        + size_brain_link_in
-        + size_brain_link_out
-        #endif
-        + size_pws_pre
-        + size_pws_base;
-
-      if ((size_total_host + EXTRA_SPACE) > device_param->device_maxmem_alloc) memory_limit_hit = 1;
-
-      #if defined (__x86_x64__)
-      const u64 MAX_HOST_MEMORY = 16ull * 1024ull * 1024ull * 1024ull; // don't be too memory hungry
-      #else
-      const u64 MAX_HOST_MEMORY =  2ull * 1024ull * 1024ull * 1024ull; // windows 7 starter limits to 2gb instead of 4gb
-      #endif
-
-      // we assume all devices have the same specs here, which is wrong, it's a start
-      if ((size_total_host * opencl_ctx->devices_cnt) > MAX_HOST_MEMORY) memory_limit_hit = 1;
-
-      if (memory_limit_hit == 1)
-      {
-        kernel_accel_max--;
-
-        continue;
-      }
-
-      break;
-    }
-
-    if (kernel_accel_max < kernel_accel_min)
-    {
-      event_log_error (hashcat_ctx, "* Device #%u: Not enough allocatable device memory for this attack.", device_id + 1);
-
-      return -1;
-    }
-
-    device_param->kernel_accel_min = kernel_accel_min;
-    device_param->kernel_accel_max = kernel_accel_max;
-
-    device_param->size_pws      = size_pws;
-    device_param->size_pws_amp  = size_pws_amp;
-    device_param->size_pws_comp = size_pws_comp;
-    device_param->size_pws_idx  = size_pws_idx;
-    device_param->size_pws_pre  = size_pws_pre;
-    device_param->size_pws_base = size_pws_base;
-    device_param->size_tmps     = size_tmps;
-    device_param->size_hooks    = size_hooks;
-    #ifdef WITH_BRAIN
-    device_param->size_brain_link_in  = size_brain_link_in;
-    device_param->size_brain_link_out = size_brain_link_out;
-    #endif
-
-    CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->context, CL_MEM_READ_WRITE,  size_pws,      NULL, &device_param->d_pws_buf);      if (CL_rc == -1) return -1;
-    CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->context, CL_MEM_READ_WRITE,  size_pws_amp,  NULL, &device_param->d_pws_amp_buf);  if (CL_rc == -1) return -1;
-    CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->context, CL_MEM_READ_ONLY,   size_pws_comp, NULL, &device_param->d_pws_comp_buf); if (CL_rc == -1) return -1;
-    CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->context, CL_MEM_READ_ONLY,   size_pws_idx,  NULL, &device_param->d_pws_idx);      if (CL_rc == -1) return -1;
-    CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->context, CL_MEM_READ_WRITE,  size_tmps,     NULL, &device_param->d_tmps);         if (CL_rc == -1) return -1;
-    CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->context, CL_MEM_READ_WRITE,  size_hooks,    NULL, &device_param->d_hooks);        if (CL_rc == -1) return -1;
-
-    CL_rc = run_kernel_bzero (hashcat_ctx, device_param, device_param->d_pws_buf,       device_param->size_pws);      if (CL_rc == -1) return -1;
-    CL_rc = run_kernel_bzero (hashcat_ctx, device_param, device_param->d_pws_amp_buf,   device_param->size_pws_amp);  if (CL_rc == -1) return -1;
-    CL_rc = run_kernel_bzero (hashcat_ctx, device_param, device_param->d_pws_comp_buf,  device_param->size_pws_comp); if (CL_rc == -1) return -1;
-    CL_rc = run_kernel_bzero (hashcat_ctx, device_param, device_param->d_pws_idx,       device_param->size_pws_idx);  if (CL_rc == -1) return -1;
-    CL_rc = run_kernel_bzero (hashcat_ctx, device_param, device_param->d_tmps,          device_param->size_tmps);     if (CL_rc == -1) return -1;
-    CL_rc = run_kernel_bzero (hashcat_ctx, device_param, device_param->d_hooks,         device_param->size_hooks);    if (CL_rc == -1) return -1;
-
-    /**
-     * main host data
-     */
-
-    u32 *pws_comp = (u32 *) hcmalloc (size_pws_comp);
-
-    device_param->pws_comp = pws_comp;
-
-    pw_idx_t *pws_idx = (pw_idx_t *) hcmalloc (size_pws_idx);
-
-    device_param->pws_idx = pws_idx;
-
-    pw_t *combs_buf = (pw_t *) hccalloc (KERNEL_COMBS, sizeof (pw_t));
-
-    device_param->combs_buf = combs_buf;
-
-    void *hooks_buf = hcmalloc (size_hooks);
-
-    device_param->hooks_buf = hooks_buf;
-
-    char *scratch_buf = (char *) hcmalloc (HCBUFSIZ_LARGE);
-
-    device_param->scratch_buf = scratch_buf;
-
-    #ifdef WITH_BRAIN
-
-    u8 *brain_link_in_buf = (u8 *) hcmalloc (size_brain_link_in);
-
-    device_param->brain_link_in_buf = brain_link_in_buf;
-
-    u32 *brain_link_out_buf = (u32 *) hcmalloc (size_brain_link_out);
-
-    device_param->brain_link_out_buf = brain_link_out_buf;
-    #endif
-
-    pw_pre_t *pws_pre_buf = (pw_pre_t *) hcmalloc (size_pws_pre);
-
-    device_param->pws_pre_buf = pws_pre_buf;
-
-    pw_pre_t *pws_base_buf = (pw_pre_t *) hcmalloc (size_pws_base);
-
-    device_param->pws_base_buf = pws_base_buf;
-
-    /**
-     * kernel args
-     */
-
-    device_param->kernel_params[ 0] = &device_param->d_pws_buf;
-    device_param->kernel_params[ 4] = &device_param->d_tmps;
-    device_param->kernel_params[ 5] = &device_param->d_hooks;
-
-    if (user_options->slow_candidates == true)
-    {
-    }
-    else
-    {
-      if (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL)
-      {
-        // nothing to do
-      }
-      else
-      {
-        if (user_options->attack_mode == ATTACK_MODE_HYBRID2)
-        {
-          device_param->kernel_params_mp[0] = (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
-                                            ? &device_param->d_pws_buf
-                                            : &device_param->d_pws_amp_buf;
-
-          CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->kernel_mp, 0, sizeof (cl_mem), device_param->kernel_params_mp[0]); if (CL_rc == -1) return -1;
-        }
-      }
-
-      if (user_options->attack_mode == ATTACK_MODE_BF)
-      {
-        device_param->kernel_params_mp_l[0] = (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
-                                            ? &device_param->d_pws_buf
-                                            : &device_param->d_pws_amp_buf;
-
-        CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->kernel_mp_l, 0, sizeof (cl_mem), device_param->kernel_params_mp_l[0]); if (CL_rc == -1) return -1;
-      }
-
-      if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
-      {
-        // nothing to do
-      }
-      else
-      {
-        device_param->kernel_params_amp[0] = &device_param->d_pws_buf;
-        device_param->kernel_params_amp[1] = &device_param->d_pws_amp_buf;
-
-        CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->kernel_amp, 0, sizeof (cl_mem), device_param->kernel_params_amp[0]); if (CL_rc == -1) return -1;
-        CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->kernel_amp, 1, sizeof (cl_mem), device_param->kernel_params_amp[1]); if (CL_rc == -1) return -1;
-      }
-    }
-
-    device_param->kernel_params_decompress[0] = &device_param->d_pws_idx;
-    device_param->kernel_params_decompress[1] = &device_param->d_pws_comp_buf;
-    device_param->kernel_params_decompress[2] = (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
-                                              ? &device_param->d_pws_buf
-                                              : &device_param->d_pws_amp_buf;
-
-    CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->kernel_decompress, 0, sizeof (cl_mem), device_param->kernel_params_decompress[0]); if (CL_rc == -1) return -1;
-    CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->kernel_decompress, 1, sizeof (cl_mem), device_param->kernel_params_decompress[1]); if (CL_rc == -1) return -1;
-    CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->kernel_decompress, 2, sizeof (cl_mem), device_param->kernel_params_decompress[2]); if (CL_rc == -1) return -1;
-
-    hardware_power_all += device_param->hardware_power;
-
-    EVENT_DATA (EVENT_OPENCL_DEVICE_INIT_POST, &device_id, sizeof (u32));
-  }
-
-  if (user_options->benchmark == false)
-  {
-    if (hardware_power_all == 0) return -1;
-  }
-
-  opencl_ctx->hardware_power_all = hardware_power_all;
-
-  return 0;
-}
-
-void opencl_session_destroy (hashcat_ctx_t *hashcat_ctx)
-{
-  opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
-
-  if (opencl_ctx->enabled == false) return;
-
-  for (u32 device_id = 0; device_id < opencl_ctx->devices_cnt; device_id++)
-  {
-    hc_device_param_t *device_param = &opencl_ctx->devices_param[device_id];
-
-    if (device_param->skipped == true) continue;
-
-    hcfree (device_param->pws_comp);
-    hcfree (device_param->pws_idx);
-    hcfree (device_param->pws_pre_buf);
-    hcfree (device_param->pws_base_buf);
-    hcfree (device_param->combs_buf);
-    hcfree (device_param->hooks_buf);
-    hcfree (device_param->scratch_buf);
-    #ifdef WITH_BRAIN
-    hcfree (device_param->brain_link_in_buf);
-    hcfree (device_param->brain_link_out_buf);
-    #endif
-
-    if (device_param->d_pws_buf)        hc_clReleaseMemObject (hashcat_ctx, device_param->d_pws_buf);
-    if (device_param->d_pws_amp_buf)    hc_clReleaseMemObject (hashcat_ctx, device_param->d_pws_amp_buf);
-    if (device_param->d_pws_comp_buf)   hc_clReleaseMemObject (hashcat_ctx, device_param->d_pws_comp_buf);
-    if (device_param->d_pws_idx)        hc_clReleaseMemObject (hashcat_ctx, device_param->d_pws_idx);
-    if (device_param->d_rules)          hc_clReleaseMemObject (hashcat_ctx, device_param->d_rules);
-    if (device_param->d_rules_c)        hc_clReleaseMemObject (hashcat_ctx, device_param->d_rules_c);
-    if (device_param->d_combs)          hc_clReleaseMemObject (hashcat_ctx, device_param->d_combs);
-    if (device_param->d_combs_c)        hc_clReleaseMemObject (hashcat_ctx, device_param->d_combs_c);
-    if (device_param->d_bfs)            hc_clReleaseMemObject (hashcat_ctx, device_param->d_bfs);
-    if (device_param->d_bfs_c)          hc_clReleaseMemObject (hashcat_ctx, device_param->d_bfs_c);
-    if (device_param->d_bitmap_s1_a)    hc_clReleaseMemObject (hashcat_ctx, device_param->d_bitmap_s1_a);
-    if (device_param->d_bitmap_s1_b)    hc_clReleaseMemObject (hashcat_ctx, device_param->d_bitmap_s1_b);
-    if (device_param->d_bitmap_s1_c)    hc_clReleaseMemObject (hashcat_ctx, device_param->d_bitmap_s1_c);
-    if (device_param->d_bitmap_s1_d)    hc_clReleaseMemObject (hashcat_ctx, device_param->d_bitmap_s1_d);
-    if (device_param->d_bitmap_s2_a)    hc_clReleaseMemObject (hashcat_ctx, device_param->d_bitmap_s2_a);
-    if (device_param->d_bitmap_s2_b)    hc_clReleaseMemObject (hashcat_ctx, device_param->d_bitmap_s2_b);
-    if (device_param->d_bitmap_s2_c)    hc_clReleaseMemObject (hashcat_ctx, device_param->d_bitmap_s2_c);
-    if (device_param->d_bitmap_s2_d)    hc_clReleaseMemObject (hashcat_ctx, device_param->d_bitmap_s2_d);
-    if (device_param->d_plain_bufs)     hc_clReleaseMemObject (hashcat_ctx, device_param->d_plain_bufs);
-    if (device_param->d_digests_buf)    hc_clReleaseMemObject (hashcat_ctx, device_param->d_digests_buf);
-    if (device_param->d_digests_shown)  hc_clReleaseMemObject (hashcat_ctx, device_param->d_digests_shown);
-    if (device_param->d_salt_bufs)      hc_clReleaseMemObject (hashcat_ctx, device_param->d_salt_bufs);
-    if (device_param->d_esalt_bufs)     hc_clReleaseMemObject (hashcat_ctx, device_param->d_esalt_bufs);
-    if (device_param->d_tmps)           hc_clReleaseMemObject (hashcat_ctx, device_param->d_tmps);
-    if (device_param->d_hooks)          hc_clReleaseMemObject (hashcat_ctx, device_param->d_hooks);
-    if (device_param->d_result)         hc_clReleaseMemObject (hashcat_ctx, device_param->d_result);
-    if (device_param->d_extra0_buf)     hc_clReleaseMemObject (hashcat_ctx, device_param->d_extra0_buf);
-    if (device_param->d_extra1_buf)     hc_clReleaseMemObject (hashcat_ctx, device_param->d_extra1_buf);
-    if (device_param->d_extra2_buf)     hc_clReleaseMemObject (hashcat_ctx, device_param->d_extra2_buf);
-    if (device_param->d_extra3_buf)     hc_clReleaseMemObject (hashcat_ctx, device_param->d_extra3_buf);
-    if (device_param->d_root_css_buf)   hc_clReleaseMemObject (hashcat_ctx, device_param->d_root_css_buf);
-    if (device_param->d_markov_css_buf) hc_clReleaseMemObject (hashcat_ctx, device_param->d_markov_css_buf);
-    if (device_param->d_tm_c)           hc_clReleaseMemObject (hashcat_ctx, device_param->d_tm_c);
-    if (device_param->d_st_digests_buf) hc_clReleaseMemObject (hashcat_ctx, device_param->d_st_digests_buf);
-    if (device_param->d_st_salts_buf)   hc_clReleaseMemObject (hashcat_ctx, device_param->d_st_salts_buf);
-    if (device_param->d_st_esalts_buf)  hc_clReleaseMemObject (hashcat_ctx, device_param->d_st_esalts_buf);
-
-    if (device_param->kernel1)          hc_clReleaseKernel (hashcat_ctx, device_param->kernel1);
-    if (device_param->kernel12)         hc_clReleaseKernel (hashcat_ctx, device_param->kernel12);
-    if (device_param->kernel2)          hc_clReleaseKernel (hashcat_ctx, device_param->kernel2);
-    if (device_param->kernel23)         hc_clReleaseKernel (hashcat_ctx, device_param->kernel23);
-    if (device_param->kernel3)          hc_clReleaseKernel (hashcat_ctx, device_param->kernel3);
-    if (device_param->kernel4)          hc_clReleaseKernel (hashcat_ctx, device_param->kernel4);
-    if (device_param->kernel_init2)     hc_clReleaseKernel (hashcat_ctx, device_param->kernel_init2);
-    if (device_param->kernel_loop2)     hc_clReleaseKernel (hashcat_ctx, device_param->kernel_loop2);
-    if (device_param->kernel_mp)        hc_clReleaseKernel (hashcat_ctx, device_param->kernel_mp);
-    if (device_param->kernel_mp_l)      hc_clReleaseKernel (hashcat_ctx, device_param->kernel_mp_l);
-    if (device_param->kernel_mp_r)      hc_clReleaseKernel (hashcat_ctx, device_param->kernel_mp_r);
-    if (device_param->kernel_tm)        hc_clReleaseKernel (hashcat_ctx, device_param->kernel_tm);
-    if (device_param->kernel_amp)       hc_clReleaseKernel (hashcat_ctx, device_param->kernel_amp);
-    if (device_param->kernel_memset)    hc_clReleaseKernel (hashcat_ctx, device_param->kernel_memset);
-    if (device_param->kernel_atinit)    hc_clReleaseKernel (hashcat_ctx, device_param->kernel_atinit);
-    if (device_param->kernel_decompress)hc_clReleaseKernel (hashcat_ctx, device_param->kernel_decompress);
-    if (device_param->kernel_aux1)      hc_clReleaseKernel (hashcat_ctx, device_param->kernel_aux1);
-    if (device_param->kernel_aux2)      hc_clReleaseKernel (hashcat_ctx, device_param->kernel_aux2);
-    if (device_param->kernel_aux3)      hc_clReleaseKernel (hashcat_ctx, device_param->kernel_aux3);
-    if (device_param->kernel_aux4)      hc_clReleaseKernel (hashcat_ctx, device_param->kernel_aux4);
-
-    if (device_param->program)          hc_clReleaseProgram (hashcat_ctx, device_param->program);
-    if (device_param->program_mp)       hc_clReleaseProgram (hashcat_ctx, device_param->program_mp);
-    if (device_param->program_amp)      hc_clReleaseProgram (hashcat_ctx, device_param->program_amp);
-
-    if (device_param->command_queue)    hc_clReleaseCommandQueue (hashcat_ctx, device_param->command_queue);
-
-    if (device_param->context)          hc_clReleaseContext (hashcat_ctx, device_param->context);
-
-    device_param->pws_comp            = NULL;
-    device_param->pws_idx             = NULL;
-    device_param->pws_pre_buf         = NULL;
-    device_param->pws_base_buf        = NULL;
-    device_param->combs_buf           = NULL;
-    device_param->hooks_buf           = NULL;
-    device_param->scratch_buf         = NULL;
-    #ifdef WITH_BRAIN
-    device_param->brain_link_in_buf   = NULL;
-    device_param->brain_link_out_buf  = NULL;
-    #endif
-
-    device_param->d_pws_buf           = NULL;
-    device_param->d_pws_amp_buf       = NULL;
-    device_param->d_pws_comp_buf      = NULL;
-    device_param->d_pws_idx           = NULL;
-    device_param->d_rules             = NULL;
-    device_param->d_rules_c           = NULL;
-    device_param->d_combs             = NULL;
-    device_param->d_combs_c           = NULL;
-    device_param->d_bfs               = NULL;
-    device_param->d_bfs_c             = NULL;
-    device_param->d_bitmap_s1_a       = NULL;
-    device_param->d_bitmap_s1_b       = NULL;
-    device_param->d_bitmap_s1_c       = NULL;
-    device_param->d_bitmap_s1_d       = NULL;
-    device_param->d_bitmap_s2_a       = NULL;
-    device_param->d_bitmap_s2_b       = NULL;
-    device_param->d_bitmap_s2_c       = NULL;
-    device_param->d_bitmap_s2_d       = NULL;
-    device_param->d_plain_bufs        = NULL;
-    device_param->d_digests_buf       = NULL;
-    device_param->d_digests_shown     = NULL;
-    device_param->d_salt_bufs         = NULL;
-    device_param->d_esalt_bufs        = NULL;
-    device_param->d_tmps              = NULL;
-    device_param->d_hooks             = NULL;
-    device_param->d_result            = NULL;
-    device_param->d_extra0_buf        = NULL;
-    device_param->d_extra1_buf        = NULL;
-    device_param->d_extra2_buf        = NULL;
-    device_param->d_extra3_buf        = NULL;
-    device_param->d_root_css_buf      = NULL;
-    device_param->d_markov_css_buf    = NULL;
-    device_param->d_tm_c              = NULL;
-    device_param->d_st_digests_buf    = NULL;
-    device_param->d_st_salts_buf      = NULL;
-    device_param->d_st_esalts_buf     = NULL;
-    device_param->kernel1             = NULL;
-    device_param->kernel12            = NULL;
-    device_param->kernel2             = NULL;
-    device_param->kernel23            = NULL;
-    device_param->kernel3             = NULL;
-    device_param->kernel4             = NULL;
-    device_param->kernel_init2        = NULL;
-    device_param->kernel_loop2        = NULL;
-    device_param->kernel_mp           = NULL;
-    device_param->kernel_mp_l         = NULL;
-    device_param->kernel_mp_r         = NULL;
-    device_param->kernel_tm           = NULL;
-    device_param->kernel_amp          = NULL;
-    device_param->kernel_memset       = NULL;
-    device_param->kernel_atinit       = NULL;
-    device_param->kernel_decompress   = NULL;
-    device_param->kernel_aux1         = NULL;
-    device_param->kernel_aux2         = NULL;
-    device_param->kernel_aux3         = NULL;
-    device_param->kernel_aux4         = NULL;
-    device_param->program             = NULL;
-    device_param->program_mp          = NULL;
-    device_param->program_amp         = NULL;
-    device_param->command_queue       = NULL;
-    device_param->context             = NULL;
-  }
-}
-
-void opencl_session_reset (hashcat_ctx_t *hashcat_ctx)
-{
-  opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
-
-  if (opencl_ctx->enabled == false) return;
-
-  for (u32 device_id = 0; device_id < opencl_ctx->devices_cnt; device_id++)
-  {
-    hc_device_param_t *device_param = &opencl_ctx->devices_param[device_id];
-
-    if (device_param->skipped == true) continue;
-
-    device_param->speed_pos = 0;
-
-    memset (device_param->speed_cnt,  0, SPEED_CACHE * sizeof (u64));
-    memset (device_param->speed_msec, 0, SPEED_CACHE * sizeof (double));
-
-    device_param->speed_only_finish = false;
-
-    device_param->exec_pos = 0;
-
-    memset (device_param->exec_msec, 0, EXEC_CACHE * sizeof (double));
-
-    device_param->outerloop_msec = 0;
-    device_param->outerloop_pos  = 0;
-    device_param->outerloop_left = 0;
-    device_param->innerloop_pos  = 0;
-    device_param->innerloop_left = 0;
-
-    // some more resets:
-
-    if (device_param->pws_comp) memset (device_param->pws_comp, 0, device_param->size_pws_comp);
-    if (device_param->pws_idx)  memset (device_param->pws_idx,  0, device_param->size_pws_idx);
-
-    device_param->pws_cnt = 0;
-
-    device_param->words_off  = 0;
-    device_param->words_done = 0;
-
-    #if defined (_WIN)
-    device_param->timer_speed.QuadPart = 0;
-    #else
-    device_param->timer_speed.tv_sec = 0;
-    #endif
-  }
-
-  opencl_ctx->kernel_power_all   = 0;
-  opencl_ctx->kernel_power_final = 0;
-}
-
-int opencl_session_update_combinator (hashcat_ctx_t *hashcat_ctx)
-{
-  combinator_ctx_t *combinator_ctx = hashcat_ctx->combinator_ctx;
-  hashconfig_t     *hashconfig     = hashcat_ctx->hashconfig;
-  opencl_ctx_t     *opencl_ctx     = hashcat_ctx->opencl_ctx;
-  user_options_t   *user_options   = hashcat_ctx->user_options;
-
-  if (opencl_ctx->enabled == false) return 0;
-
-  for (u32 device_id = 0; device_id < opencl_ctx->devices_cnt; device_id++)
-  {
-    hc_device_param_t *device_param = &opencl_ctx->devices_param[device_id];
-
-    if (device_param->skipped == true) continue;
-
-    if (device_param->skipped_warning == true) continue;
-
-    // kernel_params
-
-    device_param->kernel_params_buf32[33] = combinator_ctx->combs_mode;
-
-    /*
-    int CL_rc;
-
-    CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->kernel1, 33, sizeof (cl_uint), device_param->kernel_params[33]); if (CL_rc == -1) return -1;
-    CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->kernel2, 33, sizeof (cl_uint), device_param->kernel_params[33]); if (CL_rc == -1) return -1;
-    CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->kernel3, 33, sizeof (cl_uint), device_param->kernel_params[33]); if (CL_rc == -1) return -1;
-    CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->kernel4, 33, sizeof (cl_uint), device_param->kernel_params[33]); if (CL_rc == -1) return -1;
-
-    if (hashconfig->opts_type & OPTS_TYPE_HOOK12) { CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->kernel12,     33, sizeof (cl_uint), device_param->kernel_params[33]); if (CL_rc == -1) return -1; }
-    if (hashconfig->opts_type & OPTS_TYPE_HOOK23) { CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->kernel23,     33, sizeof (cl_uint), device_param->kernel_params[33]); if (CL_rc == -1) return -1; }
-    if (hashconfig->opts_type & OPTS_TYPE_INIT2)  { CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->kernel_init2, 33, sizeof (cl_uint), device_param->kernel_params[33]); if (CL_rc == -1) return -1; }
-    if (hashconfig->opts_type & OPTS_TYPE_LOOP2)  { CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->kernel_loop2, 33, sizeof (cl_uint), device_param->kernel_params[33]); if (CL_rc == -1) return -1; }
-    */
-
-    // kernel_params_amp
-
-    if (user_options->slow_candidates == true)
-    {
-    }
-    else
-    {
-      device_param->kernel_params_amp_buf32[5] = combinator_ctx->combs_mode;
-
-      if (hashconfig->attack_exec == ATTACK_EXEC_OUTSIDE_KERNEL)
-      {
-        int CL_rc;
-
-        CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->kernel_amp, 5, sizeof (cl_uint), device_param->kernel_params_amp[5]);
-
-        if (CL_rc == -1) return -1;
-      }
-    }
-  }
-
-  return 0;
-}
-
-int opencl_session_update_mp (hashcat_ctx_t *hashcat_ctx)
-{
-  mask_ctx_t     *mask_ctx     = hashcat_ctx->mask_ctx;
-  opencl_ctx_t   *opencl_ctx   = hashcat_ctx->opencl_ctx;
-  user_options_t *user_options = hashcat_ctx->user_options;
-
-  if (opencl_ctx->enabled == false) return 0;
-
-  if (user_options->slow_candidates == true) return 0;
-
-  for (u32 device_id = 0; device_id < opencl_ctx->devices_cnt; device_id++)
-  {
-    hc_device_param_t *device_param = &opencl_ctx->devices_param[device_id];
-
-    if (device_param->skipped == true) continue;
-
-    if (device_param->skipped_warning == true) continue;
-
-    device_param->kernel_params_mp_buf64[3] = 0;
-    device_param->kernel_params_mp_buf32[4] = mask_ctx->css_cnt;
-
-    int CL_rc = CL_SUCCESS;
-
-    for (u32 i = 3; i < 4; i++) { CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->kernel_mp, i, sizeof (cl_ulong), device_param->kernel_params_mp[i]); if (CL_rc == -1) return -1; }
-    for (u32 i = 4; i < 8; i++) { CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->kernel_mp, i, sizeof (cl_uint), device_param->kernel_params_mp[i]); if (CL_rc == -1) return -1; }
-
-    CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->command_queue, device_param->d_root_css_buf,   CL_TRUE, 0, device_param->size_root_css,   mask_ctx->root_css_buf,   0, NULL, NULL); if (CL_rc == -1) return -1;
-    CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->command_queue, device_param->d_markov_css_buf, CL_TRUE, 0, device_param->size_markov_css, mask_ctx->markov_css_buf, 0, NULL, NULL); if (CL_rc == -1) return -1;
-  }
-
-  return 0;
-}
-
-int opencl_session_update_mp_rl (hashcat_ctx_t *hashcat_ctx, const u32 css_cnt_l, const u32 css_cnt_r)
-{
-  mask_ctx_t     *mask_ctx     = hashcat_ctx->mask_ctx;
-  opencl_ctx_t   *opencl_ctx   = hashcat_ctx->opencl_ctx;
-  user_options_t *user_options = hashcat_ctx->user_options;
-
-  if (opencl_ctx->enabled == false) return 0;
-
-  if (user_options->slow_candidates == true) return 0;
-
-  for (u32 device_id = 0; device_id < opencl_ctx->devices_cnt; device_id++)
-  {
-    hc_device_param_t *device_param = &opencl_ctx->devices_param[device_id];
-
-    if (device_param->skipped == true) continue;
-
-    if (device_param->skipped_warning == true) continue;
-
-    device_param->kernel_params_mp_l_buf64[3] = 0;
-    device_param->kernel_params_mp_l_buf32[4] = css_cnt_l;
-    device_param->kernel_params_mp_l_buf32[5] = css_cnt_r;
-
-    device_param->kernel_params_mp_r_buf64[3] = 0;
-    device_param->kernel_params_mp_r_buf32[4] = css_cnt_r;
-
-    int CL_rc = CL_SUCCESS;
-
-    for (u32 i = 3; i < 4; i++) { CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->kernel_mp_l, i, sizeof (cl_ulong), device_param->kernel_params_mp_l[i]); if (CL_rc == -1) return -1; }
-    for (u32 i = 4; i < 8; i++) { CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->kernel_mp_l, i, sizeof (cl_uint),  device_param->kernel_params_mp_l[i]); if (CL_rc == -1) return -1; }
-    for (u32 i = 9; i < 9; i++) { CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->kernel_mp_l, i, sizeof (cl_ulong), device_param->kernel_params_mp_l[i]); if (CL_rc == -1) return -1; }
-
-    for (u32 i = 3; i < 4; i++) { CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->kernel_mp_r, i, sizeof (cl_ulong), device_param->kernel_params_mp_r[i]); if (CL_rc == -1) return -1; }
-    for (u32 i = 4; i < 7; i++) { CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->kernel_mp_r, i, sizeof (cl_uint),  device_param->kernel_params_mp_r[i]); if (CL_rc == -1) return -1; }
-    for (u32 i = 8; i < 8; i++) { CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->kernel_mp_r, i, sizeof (cl_ulong), device_param->kernel_params_mp_r[i]); if (CL_rc == -1) return -1; }
-
-    CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->command_queue, device_param->d_root_css_buf,   CL_TRUE, 0, device_param->size_root_css,   mask_ctx->root_css_buf,   0, NULL, NULL); if (CL_rc == -1) return -1;
-    CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->command_queue, device_param->d_markov_css_buf, CL_TRUE, 0, device_param->size_markov_css, mask_ctx->markov_css_buf, 0, NULL, NULL); if (CL_rc == -1) return -1;
-  }
-
-  return 0;
-}
diff --git a/src/outfile.c b/src/outfile.c
index a7abbce00..06e5c80ed 100644
--- a/src/outfile.c
+++ b/src/outfile.c
@@ -11,7 +11,7 @@
 #include "rp.h"
 #include "emu_inc_rp.h"
 #include "emu_inc_rp_optimized.h"
-#include "opencl.h"
+#include "backend.h"
 #include "shared.h"
 #include "locking.h"
 #include "outfile.h"
diff --git a/src/outfile_check.c b/src/outfile_check.c
index 4a63e77c8..db15049c0 100644
--- a/src/outfile_check.c
+++ b/src/outfile_check.c
@@ -363,7 +363,7 @@ int outcheck_ctx_init (hashcat_ctx_t *hashcat_ctx)
   if (user_options->example_hashes == true) return 0;
   if (user_options->speed_only     == true) return 0;
   if (user_options->progress_only  == true) return 0;
-  if (user_options->opencl_info    == true) return 0;
+  if (user_options->backend_info   == true) return 0;
 
   if (hashconfig->outfile_check_disable == true) return 0;
 
diff --git a/src/potfile.c b/src/potfile.c
index b3015e04e..9516d4ae5 100644
--- a/src/potfile.c
+++ b/src/potfile.c
@@ -103,7 +103,7 @@ int potfile_init (hashcat_ctx_t *hashcat_ctx)
   if (user_options->benchmark       == true) return 0;
   if (user_options->example_hashes  == true) return 0;
   if (user_options->keyspace        == true) return 0;
-  if (user_options->opencl_info     == true) return 0;
+  if (user_options->backend_info    == true) return 0;
   if (user_options->stdout_flag     == true) return 0;
   if (user_options->speed_only      == true) return 0;
   if (user_options->progress_only   == true) return 0;
diff --git a/src/restore.c b/src/restore.c
index 042555433..8f61cc6ae 100644
--- a/src/restore.c
+++ b/src/restore.c
@@ -304,7 +304,7 @@ int restore_ctx_init (hashcat_ctx_t *hashcat_ctx, int argc, char **argv)
   if (user_options->example_hashes  == true) return 0;
   if (user_options->keyspace        == true) return 0;
   if (user_options->left            == true) return 0;
-  if (user_options->opencl_info     == true) return 0;
+  if (user_options->backend_info    == true) return 0;
   if (user_options->show            == true) return 0;
   if (user_options->stdout_flag     == true) return 0;
   if (user_options->speed_only      == true) return 0;
diff --git a/src/selftest.c b/src/selftest.c
index 89a527298..144b71611 100644
--- a/src/selftest.c
+++ b/src/selftest.c
@@ -8,7 +8,7 @@
 #include "event.h"
 #include "bitops.h"
 #include "convert.h"
-#include "opencl.h"
+#include "backend.h"
 #include "thread.h"
 #include "selftest.h"
 
@@ -21,17 +21,26 @@ static int selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
   user_options_t       *user_options       = hashcat_ctx->user_options;
   user_options_extra_t *user_options_extra = hashcat_ctx->user_options_extra;
 
-  cl_int CL_err;
-
+  int CU_rc;
   int CL_rc;
 
   if (hashconfig->st_hash == NULL) return 0;
 
   // init : replace hashes with selftest hash
 
-  device_param->kernel_params[15] = &device_param->d_st_digests_buf;
-  device_param->kernel_params[17] = &device_param->d_st_salts_buf;
-  device_param->kernel_params[18] = &device_param->d_st_esalts_buf;
+  if (device_param->is_cuda == true)
+  {
+    device_param->kernel_params[15] = &device_param->cuda_d_st_digests_buf;
+    device_param->kernel_params[17] = &device_param->cuda_d_st_salts_buf;
+    device_param->kernel_params[18] = &device_param->cuda_d_st_esalts_buf;
+  }
+
+  if (device_param->is_opencl == true)
+  {
+    device_param->kernel_params[15] = &device_param->opencl_d_st_digests_buf;
+    device_param->kernel_params[17] = &device_param->opencl_d_st_salts_buf;
+    device_param->kernel_params[18] = &device_param->opencl_d_st_esalts_buf;
+  }
 
   device_param->kernel_params_buf32[31] = 1;
   device_param->kernel_params_buf32[32] = 0;
@@ -57,9 +66,19 @@ static int selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
 
     pw.pw_len = (u32) pw_len;
 
-    CL_err = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->command_queue, device_param->d_pws_buf, CL_TRUE, 0, 1 * sizeof (pw_t), &pw, 0, NULL, NULL);
+    if (device_param->is_cuda == true)
+    {
+      CU_rc = hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_pws_buf, &pw, 1 * sizeof (pw_t));
 
-    if (CL_err != CL_SUCCESS) return -1;
+      if (CU_rc == -1) return -1;
+    }
+
+    if (device_param->is_opencl == true)
+    {
+      CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_buf, CL_TRUE, 0, 1 * sizeof (pw_t), &pw, 0, NULL, NULL);
+
+      if (CL_rc == -1) return -1;
+    }
   }
   else
   {
@@ -84,9 +103,19 @@ static int selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
           uppercase ((u8 *) pw_ptr, pw.pw_len);
         }
 
-        CL_err = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->command_queue, device_param->d_pws_buf, CL_TRUE, 0, 1 * sizeof (pw_t), &pw, 0, NULL, NULL);
+        if (device_param->is_cuda == true)
+        {
+          CU_rc = hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_pws_buf, &pw, 1 * sizeof (pw_t));
 
-        if (CL_err != CL_SUCCESS) return -1;
+          if (CU_rc == -1) return -1;
+        }
+
+        if (device_param->is_opencl == true)
+        {
+          CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_buf, CL_TRUE, 0, 1 * sizeof (pw_t), &pw, 0, NULL, NULL);
+
+          if (CL_rc == -1) return -1;
+        }
       }
       else if (user_options_extra->attack_kern == ATTACK_KERN_COMBI)
       {
@@ -136,13 +165,27 @@ static int selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
           comb_ptr[comb.pw_len] = 0x80;
         }
 
-        CL_err = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->command_queue, device_param->d_combs_c, CL_TRUE, 0, 1 * sizeof (pw_t), &comb, 0, NULL, NULL);
+        if (device_param->is_cuda == true)
+        {
+          CU_rc = hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_combs_c, &comb, 1 * sizeof (pw_t));
 
-        if (CL_err != CL_SUCCESS) return -1;
+          if (CU_rc == -1) return -1;
 
-        CL_err = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->command_queue, device_param->d_pws_buf, CL_TRUE, 0, 1 * sizeof (pw_t), &pw, 0, NULL, NULL);
+          CU_rc = hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_pws_buf, &pw, 1 * sizeof (pw_t));
 
-        if (CL_err != CL_SUCCESS) return -1;
+          if (CU_rc == -1) return -1;
+        }
+
+        if (device_param->is_opencl == true)
+        {
+          CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_combs_c, CL_TRUE, 0, 1 * sizeof (pw_t), &comb, 0, NULL, NULL);
+
+          if (CL_rc == -1) return -1;
+
+          CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_buf, CL_TRUE, 0, 1 * sizeof (pw_t), &pw, 0, NULL, NULL);
+
+          if (CL_rc == -1) return -1;
+        }
       }
       else if (user_options_extra->attack_kern == ATTACK_KERN_BF)
       {
@@ -165,9 +208,19 @@ static int selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
 
           pw.pw_len = (u32) pw_len;
 
-          CL_err = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->command_queue, device_param->d_pws_buf, CL_TRUE, 0, 1 * sizeof (pw_t), &pw, 0, NULL, NULL);
+          if (device_param->is_cuda == true)
+          {
+            CU_rc = hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_pws_buf, &pw, 1 * sizeof (pw_t));
 
-          if (CL_err != CL_SUCCESS) return -1;
+            if (CU_rc == -1) return -1;
+          }
+
+          if (device_param->is_opencl == true)
+          {
+            CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_buf, CL_TRUE, 0, 1 * sizeof (pw_t), &pw, 0, NULL, NULL);
+
+            if (CL_rc == -1) return -1;
+          }
         }
         else
         {
@@ -208,9 +261,19 @@ static int selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
             bf.i = byte_swap_32 (bf.i);
           }
 
-          CL_err = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->command_queue, device_param->d_bfs_c, CL_TRUE, 0, 1 * sizeof (bf_t), &bf, 0, NULL, NULL);
+          if (device_param->is_cuda == true)
+          {
+            CU_rc = hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_bfs_c, &bf, 1 * sizeof (bf_t));
 
-          if (CL_err != CL_SUCCESS) return -1;
+            if (CU_rc == -1) return -1;
+          }
+
+          if (device_param->is_opencl == true)
+          {
+            CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_bfs_c, CL_TRUE, 0, 1 * sizeof (bf_t), &bf, 0, NULL, NULL);
+
+            if (CL_rc == -1) return -1;
+          }
 
           pw_t pw; memset (&pw, 0, sizeof (pw));
 
@@ -296,9 +359,19 @@ static int selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
             for (int i = 0; i < 14; i++) pw.i[i] = byte_swap_32 (pw.i[i]);
           }
 
-          CL_err = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->command_queue, device_param->d_pws_buf, CL_TRUE, 0, 1 * sizeof (pw_t), &pw, 0, NULL, NULL);
+          if (device_param->is_cuda == true)
+          {
+            CU_rc = hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_pws_buf, &pw, 1 * sizeof (pw_t));
 
-          if (CL_err != CL_SUCCESS) return -1;
+            if (CU_rc == -1) return -1;
+          }
+
+          if (device_param->is_opencl == true)
+          {
+            CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_buf, CL_TRUE, 0, 1 * sizeof (pw_t), &pw, 0, NULL, NULL);
+
+            if (CL_rc == -1) return -1;
+          }
 
           highest_pw_len = pw.pw_len;
         }
@@ -316,9 +389,19 @@ static int selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
 
       pw.pw_len = (u32) pw_len;
 
-      CL_err = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->command_queue, device_param->d_pws_buf, CL_TRUE, 0, 1 * sizeof (pw_t), &pw, 0, NULL, NULL);
+      if (device_param->is_cuda == true)
+      {
+        CU_rc = hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_pws_buf, &pw, 1 * sizeof (pw_t));
 
-      if (CL_err != CL_SUCCESS) return -1;
+        if (CU_rc == -1) return -1;
+      }
+
+      if (device_param->is_opencl == true)
+      {
+        CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_buf, CL_TRUE, 0, 1 * sizeof (pw_t), &pw, 0, NULL, NULL);
+
+        if (CL_rc == -1) return -1;
+      }
     }
   }
 
@@ -372,15 +455,35 @@ static int selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
 
       if (CL_rc == -1) return -1;
 
-      CL_rc = hc_clEnqueueReadBuffer (hashcat_ctx, device_param->command_queue, device_param->d_hooks, CL_TRUE, 0, device_param->size_hooks, device_param->hooks_buf, 0, NULL, NULL);
+      if (device_param->is_cuda == true)
+      {
+        CU_rc = hc_cuMemcpyDtoH (hashcat_ctx, device_param->hooks_buf, device_param->cuda_d_hooks, device_param->size_hooks);
 
-      if (CL_rc == -1) return -1;
+        if (CU_rc == -1) return -1;
+      }
+
+      if (device_param->is_opencl == true)
+      {
+        CL_rc = hc_clEnqueueReadBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_hooks, CL_TRUE, 0, device_param->size_hooks, device_param->hooks_buf, 0, NULL, NULL);
+
+        if (CL_rc == -1) return -1;
+      }
 
       module_ctx->module_hook12 (device_param, hashes->st_hook_salts_buf, 0, 1);
 
-      CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->command_queue, device_param->d_hooks, CL_TRUE, 0, device_param->size_hooks, device_param->hooks_buf, 0, NULL, NULL);
+      if (device_param->is_cuda == true)
+      {
+        CU_rc = hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_hooks, device_param->hooks_buf, device_param->size_hooks);
 
-      if (CL_rc == -1) return -1;
+        if (CU_rc == -1) return -1;
+      }
+
+      if (device_param->is_opencl == true)
+      {
+        CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_hooks, CL_TRUE, 0, device_param->size_hooks, device_param->hooks_buf, 0, NULL, NULL);
+
+        if (CL_rc == -1) return -1;
+      }
     }
 
     const u32 salt_pos = 0;
@@ -411,15 +514,35 @@ static int selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
 
       if (CL_rc == -1) return -1;
 
-      CL_rc = hc_clEnqueueReadBuffer (hashcat_ctx, device_param->command_queue, device_param->d_hooks, CL_TRUE, 0, device_param->size_hooks, device_param->hooks_buf, 0, NULL, NULL);
+      if (device_param->is_cuda == true)
+      {
+        CU_rc = hc_cuMemcpyDtoH (hashcat_ctx, device_param->hooks_buf, device_param->cuda_d_hooks, device_param->size_hooks);
 
-      if (CL_rc == -1) return -1;
+        if (CU_rc == -1) return -1;
+      }
+
+      if (device_param->is_opencl == true)
+      {
+        CL_rc = hc_clEnqueueReadBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_hooks, CL_TRUE, 0, device_param->size_hooks, device_param->hooks_buf, 0, NULL, NULL);
+
+        if (CL_rc == -1) return -1;
+      }
 
       module_ctx->module_hook23 (device_param, hashes->st_hook_salts_buf, 0, 1);
 
-      CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->command_queue, device_param->d_hooks, CL_TRUE, 0, device_param->size_hooks, device_param->hooks_buf, 0, NULL, NULL);
+      if (device_param->is_cuda == true)
+      {
+        CU_rc = hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_hooks, device_param->hooks_buf, device_param->size_hooks);
 
-      if (CL_rc == -1) return -1;
+        if (CU_rc == -1) return -1;
+      }
+
+      if (device_param->is_opencl == true)
+      {
+        CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_hooks, CL_TRUE, 0, device_param->size_hooks, device_param->hooks_buf, 0, NULL, NULL);
+
+        if (CL_rc == -1) return -1;
+      }
     }
 
     if (hashconfig->opts_type & OPTS_TYPE_INIT2)
@@ -492,9 +615,19 @@ static int selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
 
   u32 num_cracked;
 
-  CL_err = hc_clEnqueueReadBuffer (hashcat_ctx, device_param->command_queue, device_param->d_result, CL_TRUE, 0, sizeof (u32), &num_cracked, 0, NULL, NULL);
+  if (device_param->is_cuda == true)
+  {
+    CU_rc = hc_cuMemcpyDtoH (hashcat_ctx, &num_cracked, device_param->cuda_d_result, sizeof (u32));
 
-  if (CL_err != CL_SUCCESS) return -1;
+    if (CU_rc == -1) return -1;
+  }
+
+  if (device_param->is_opencl == true)
+  {
+    CL_rc = hc_clEnqueueReadBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_result, CL_TRUE, 0, sizeof (u32), &num_cracked, 0, NULL, NULL);
+
+    if (CL_rc == -1) return -1;
+  }
 
   // finish : cleanup and restore
 
@@ -507,42 +640,99 @@ static int selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
   device_param->kernel_params_buf32[33] = 0;
   device_param->kernel_params_buf64[34] = 0;
 
-  device_param->kernel_params[15] = &device_param->d_digests_buf;
-  device_param->kernel_params[17] = &device_param->d_salt_bufs;
-  device_param->kernel_params[18] = &device_param->d_esalt_bufs;
+  if (device_param->is_cuda == true)
+  {
+    device_param->kernel_params[15] = &device_param->cuda_d_digests_buf;
+    device_param->kernel_params[17] = &device_param->cuda_d_salt_bufs;
+    device_param->kernel_params[18] = &device_param->cuda_d_esalt_bufs;
 
-  CL_rc = run_kernel_bzero (hashcat_ctx, device_param, device_param->d_pws_buf,       device_param->size_pws);      if (CL_rc == -1) return -1;
-  CL_rc = run_kernel_bzero (hashcat_ctx, device_param, device_param->d_tmps,          device_param->size_tmps);     if (CL_rc == -1) return -1;
-  CL_rc = run_kernel_bzero (hashcat_ctx, device_param, device_param->d_hooks,         device_param->size_hooks);    if (CL_rc == -1) return -1;
-  CL_rc = run_kernel_bzero (hashcat_ctx, device_param, device_param->d_plain_bufs,    device_param->size_plains);   if (CL_rc == -1) return -1;
-  CL_rc = run_kernel_bzero (hashcat_ctx, device_param, device_param->d_digests_shown, device_param->size_shown);    if (CL_rc == -1) return -1;
-  CL_rc = run_kernel_bzero (hashcat_ctx, device_param, device_param->d_result,        device_param->size_results);  if (CL_rc == -1) return -1;
+    CU_rc = run_cuda_kernel_bzero   (hashcat_ctx, device_param, device_param->cuda_d_pws_buf,         device_param->size_pws);      if (CU_rc == -1) return -1;
+    CU_rc = run_cuda_kernel_bzero   (hashcat_ctx, device_param, device_param->cuda_d_tmps,            device_param->size_tmps);     if (CU_rc == -1) return -1;
+    CU_rc = run_cuda_kernel_bzero   (hashcat_ctx, device_param, device_param->cuda_d_hooks,           device_param->size_hooks);    if (CU_rc == -1) return -1;
+    CU_rc = run_cuda_kernel_bzero   (hashcat_ctx, device_param, device_param->cuda_d_plain_bufs,      device_param->size_plains);   if (CU_rc == -1) return -1;
+    CU_rc = run_cuda_kernel_bzero   (hashcat_ctx, device_param, device_param->cuda_d_digests_shown,   device_param->size_shown);    if (CU_rc == -1) return -1;
+    CU_rc = run_cuda_kernel_bzero   (hashcat_ctx, device_param, device_param->cuda_d_result,          device_param->size_results);  if (CU_rc == -1) return -1;
+  }
+
+  if (device_param->is_opencl == true)
+  {
+    device_param->kernel_params[15] = &device_param->opencl_d_digests_buf;
+    device_param->kernel_params[17] = &device_param->opencl_d_salt_bufs;
+    device_param->kernel_params[18] = &device_param->opencl_d_esalt_bufs;
+
+    CL_rc = run_opencl_kernel_bzero (hashcat_ctx, device_param, device_param->opencl_d_pws_buf,       device_param->size_pws);      if (CL_rc == -1) return -1;
+    CL_rc = run_opencl_kernel_bzero (hashcat_ctx, device_param, device_param->opencl_d_tmps,          device_param->size_tmps);     if (CL_rc == -1) return -1;
+    CL_rc = run_opencl_kernel_bzero (hashcat_ctx, device_param, device_param->opencl_d_hooks,         device_param->size_hooks);    if (CL_rc == -1) return -1;
+    CL_rc = run_opencl_kernel_bzero (hashcat_ctx, device_param, device_param->opencl_d_plain_bufs,    device_param->size_plains);   if (CL_rc == -1) return -1;
+    CL_rc = run_opencl_kernel_bzero (hashcat_ctx, device_param, device_param->opencl_d_digests_shown, device_param->size_shown);    if (CL_rc == -1) return -1;
+    CL_rc = run_opencl_kernel_bzero (hashcat_ctx, device_param, device_param->opencl_d_result,        device_param->size_results);  if (CL_rc == -1) return -1;
+  }
 
   if (user_options->slow_candidates == true)
   {
-    CL_rc = run_kernel_bzero (hashcat_ctx, device_param, device_param->d_rules_c, device_param->size_rules_c);
+    if (device_param->is_cuda == true)
+    {
+      CU_rc = run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_rules_c, device_param->size_rules_c);
 
-    if (CL_rc == -1) return -1;
+      if (CU_rc == -1) return -1;
+    }
+
+    if (device_param->is_opencl == true)
+    {
+      CL_rc = run_opencl_kernel_bzero (hashcat_ctx, device_param, device_param->opencl_d_rules_c, device_param->size_rules_c);
+
+      if (CL_rc == -1) return -1;
+    }
   }
   else
   {
     if (user_options_extra->attack_kern == ATTACK_KERN_STRAIGHT)
     {
-      CL_rc = run_kernel_bzero (hashcat_ctx, device_param, device_param->d_rules_c, device_param->size_rules_c);
+      if (device_param->is_cuda == true)
+      {
+        CU_rc = run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_rules_c, device_param->size_rules_c);
 
-      if (CL_rc == -1) return -1;
+        if (CU_rc == -1) return -1;
+      }
+
+      if (device_param->is_opencl == true)
+      {
+        CL_rc = run_opencl_kernel_bzero (hashcat_ctx, device_param, device_param->opencl_d_rules_c, device_param->size_rules_c);
+
+        if (CL_rc == -1) return -1;
+      }
     }
     else if (user_options_extra->attack_kern == ATTACK_KERN_COMBI)
     {
-      CL_rc = run_kernel_bzero (hashcat_ctx, device_param, device_param->d_combs_c, device_param->size_combs);
+      if (device_param->is_cuda == true)
+      {
+        CU_rc = run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_combs_c, device_param->size_combs);
 
-      if (CL_rc == -1) return -1;
+        if (CU_rc == -1) return -1;
+      }
+
+      if (device_param->is_opencl == true)
+      {
+        CL_rc = run_opencl_kernel_bzero (hashcat_ctx, device_param, device_param->opencl_d_combs_c, device_param->size_combs);
+
+        if (CL_rc == -1) return -1;
+      }
     }
     else if (user_options_extra->attack_kern == ATTACK_KERN_BF)
     {
-      CL_rc = run_kernel_bzero (hashcat_ctx, device_param, device_param->d_bfs_c, device_param->size_bfs);
+      if (device_param->is_cuda == true)
+      {
+        CU_rc = run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_bfs_c, device_param->size_bfs);
 
-      if (CL_rc == -1) return -1;
+        if (CU_rc == -1) return -1;
+      }
+
+      if (device_param->is_opencl == true)
+      {
+        CL_rc = run_opencl_kernel_bzero (hashcat_ctx, device_param, device_param->opencl_d_bfs_c, device_param->size_bfs);
+
+        if (CL_rc == -1) return -1;
+      }
     }
   }
 
@@ -572,20 +762,27 @@ HC_API_CALL void *thread_selftest (void *p)
 
   hashcat_ctx_t *hashcat_ctx = thread_param->hashcat_ctx;
 
-  opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
 
-  if (opencl_ctx->enabled == false) return NULL;
+  if (backend_ctx->enabled == false) return NULL;
 
   user_options_t *user_options = hashcat_ctx->user_options;
 
   if (user_options->self_test_disable == true) return NULL;
 
-  hc_device_param_t *device_param = opencl_ctx->devices_param + thread_param->tid;
+  hc_device_param_t *device_param = backend_ctx->devices_param + thread_param->tid;
 
   if (device_param->skipped == true) return NULL;
 
   if (device_param->skipped_warning == true) return NULL;
 
+  if (device_param->is_cuda == true)
+  {
+    const int rc_cuCtxSetCurrent = hc_cuCtxSetCurrent (hashcat_ctx, device_param->cuda_context);
+
+    if (rc_cuCtxSetCurrent == -1) return NULL;
+  }
+
   const int rc_selftest = selftest (hashcat_ctx, device_param);
 
   if (user_options->benchmark == true)
diff --git a/src/status.c b/src/status.c
index ed4cb9281..5594c2607 100644
--- a/src/status.c
+++ b/src/status.c
@@ -200,32 +200,32 @@ double get_avg_exec_time (hc_device_param_t *device_param, const int last_num_en
 
 int status_get_device_info_cnt (const hashcat_ctx_t *hashcat_ctx)
 {
-  const opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
+  const backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
 
-  return opencl_ctx->devices_cnt;
+  return backend_ctx->backend_devices_cnt;
 }
 
 int status_get_device_info_active (const hashcat_ctx_t *hashcat_ctx)
 {
-  const opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
+  const backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
 
-  return opencl_ctx->devices_active;
+  return backend_ctx->backend_devices_active;
 }
 
-bool status_get_skipped_dev (const hashcat_ctx_t *hashcat_ctx, const int device_id)
+bool status_get_skipped_dev (const hashcat_ctx_t *hashcat_ctx, const int backend_devices_idx)
 {
-  const opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
+  const backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
 
-  hc_device_param_t *device_param = &opencl_ctx->devices_param[device_id];
+  hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
 
   return device_param->skipped;
 }
 
-bool status_get_skipped_warning_dev (const hashcat_ctx_t *hashcat_ctx, const int device_id)
+bool status_get_skipped_warning_dev (const hashcat_ctx_t *hashcat_ctx, const int backend_devices_idx)
 {
-  const opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
+  const backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
 
-  hc_device_param_t *device_param = &opencl_ctx->devices_param[device_id];
+  hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
 
   return device_param->skipped_warning;
 }
@@ -833,16 +833,16 @@ int status_get_guess_mask_length (const hashcat_ctx_t *hashcat_ctx)
   return mp_get_length (mask_ctx->mask);
 }
 
-char *status_get_guess_candidates_dev (const hashcat_ctx_t *hashcat_ctx, const int device_id)
+char *status_get_guess_candidates_dev (const hashcat_ctx_t *hashcat_ctx, const int backend_devices_idx)
 {
   const hashconfig_t         *hashconfig         = hashcat_ctx->hashconfig;
-  const opencl_ctx_t         *opencl_ctx         = hashcat_ctx->opencl_ctx;
+  const backend_ctx_t        *backend_ctx        = hashcat_ctx->backend_ctx;
   const status_ctx_t         *status_ctx         = hashcat_ctx->status_ctx;
   const user_options_extra_t *user_options_extra = hashcat_ctx->user_options_extra;
 
   if (status_ctx->accessible == false) return NULL;
 
-  hc_device_param_t *device_param = &opencl_ctx->devices_param[device_id];
+  hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
 
   char *display = (char *) hcmalloc (HCBUFSIZ_TINY);
 
@@ -1410,26 +1410,26 @@ u64 status_get_progress_end_relative_skip (const hashcat_ctx_t *hashcat_ctx)
 
 double status_get_hashes_msec_all (const hashcat_ctx_t *hashcat_ctx)
 {
-  const opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
+  const backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
 
   double hashes_all_msec = 0;
 
-  for (u32 device_id = 0; device_id < opencl_ctx->devices_cnt; device_id++)
+  for (int backend_devices_idx = 0; backend_devices_idx < backend_ctx->backend_devices_cnt; backend_devices_idx++)
   {
-    hashes_all_msec += status_get_hashes_msec_dev (hashcat_ctx, device_id);
+    hashes_all_msec += status_get_hashes_msec_dev (hashcat_ctx, backend_devices_idx);
   }
 
   return hashes_all_msec;
 }
 
-double status_get_hashes_msec_dev (const hashcat_ctx_t *hashcat_ctx, const int device_id)
+double status_get_hashes_msec_dev (const hashcat_ctx_t *hashcat_ctx, const int backend_devices_idx)
 {
-  const opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
+  const backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
 
   u64    speed_cnt  = 0;
   double speed_msec = 0;
 
-  hc_device_param_t *device_param = &opencl_ctx->devices_param[device_id];
+  hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
 
   if ((device_param->skipped == false) && (device_param->skipped_warning == false))
   {
@@ -1455,16 +1455,16 @@ double status_get_hashes_msec_dev (const hashcat_ctx_t *hashcat_ctx, const int d
   return hashes_dev_msec;
 }
 
-double status_get_hashes_msec_dev_benchmark (const hashcat_ctx_t *hashcat_ctx, const int device_id)
+double status_get_hashes_msec_dev_benchmark (const hashcat_ctx_t *hashcat_ctx, const int backend_devices_idx)
 {
   // this function increases accuracy for benchmark modes
 
-  const opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
+  const backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
 
   u64    speed_cnt  = 0;
   double speed_msec = 0;
 
-  hc_device_param_t *device_param = &opencl_ctx->devices_param[device_id];
+  hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
 
   if ((device_param->skipped == false) && (device_param->skipped_warning == false))
   {
@@ -1486,23 +1486,23 @@ double status_get_hashes_msec_dev_benchmark (const hashcat_ctx_t *hashcat_ctx, c
 
 double status_get_exec_msec_all (const hashcat_ctx_t *hashcat_ctx)
 {
-  const opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
+  const backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
 
   double exec_all_msec = 0;
 
-  for (u32 device_id = 0; device_id < opencl_ctx->devices_cnt; device_id++)
+  for (int backend_devices_idx = 0; backend_devices_idx < backend_ctx->backend_devices_cnt; backend_devices_idx++)
   {
-    exec_all_msec += status_get_exec_msec_dev (hashcat_ctx, device_id);
+    exec_all_msec += status_get_exec_msec_dev (hashcat_ctx, backend_devices_idx);
   }
 
   return exec_all_msec;
 }
 
-double status_get_exec_msec_dev (const hashcat_ctx_t *hashcat_ctx, const int device_id)
+double status_get_exec_msec_dev (const hashcat_ctx_t *hashcat_ctx, const int backend_devices_idx)
 {
-  const opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
+  const backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
 
-  hc_device_param_t *device_param = &opencl_ctx->devices_param[device_id];
+  hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
 
   double exec_dev_msec = 0;
 
@@ -1525,9 +1525,9 @@ char *status_get_speed_sec_all (const hashcat_ctx_t *hashcat_ctx)
   return display;
 }
 
-char *status_get_speed_sec_dev (const hashcat_ctx_t *hashcat_ctx, const int device_id)
+char *status_get_speed_sec_dev (const hashcat_ctx_t *hashcat_ctx, const int backend_devices_idx)
 {
-  const double hashes_msec_dev = status_get_hashes_msec_dev (hashcat_ctx, device_id);
+  const double hashes_msec_dev = status_get_hashes_msec_dev (hashcat_ctx, backend_devices_idx);
 
   char *display = (char *) hcmalloc (HCBUFSIZ_TINY);
 
@@ -1698,11 +1698,11 @@ char *status_get_cpt (const hashcat_ctx_t *hashcat_ctx)
   return cpt;
 }
 
-int status_get_salt_pos_dev (const hashcat_ctx_t *hashcat_ctx, const int device_id)
+int status_get_salt_pos_dev (const hashcat_ctx_t *hashcat_ctx, const int backend_devices_idx)
 {
-  const opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
+  const backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
 
-  hc_device_param_t *device_param = &opencl_ctx->devices_param[device_id];
+  hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
 
   int salt_pos = 0;
 
@@ -1714,11 +1714,11 @@ int status_get_salt_pos_dev (const hashcat_ctx_t *hashcat_ctx, const int device_
   return salt_pos;
 }
 
-int status_get_innerloop_pos_dev (const hashcat_ctx_t *hashcat_ctx, const int device_id)
+int status_get_innerloop_pos_dev (const hashcat_ctx_t *hashcat_ctx, const int backend_devices_idx)
 {
-  const opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
+  const backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
 
-  hc_device_param_t *device_param = &opencl_ctx->devices_param[device_id];
+  hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
 
   int innerloop_pos = 0;
 
@@ -1730,11 +1730,11 @@ int status_get_innerloop_pos_dev (const hashcat_ctx_t *hashcat_ctx, const int de
   return innerloop_pos;
 }
 
-int status_get_innerloop_left_dev (const hashcat_ctx_t *hashcat_ctx, const int device_id)
+int status_get_innerloop_left_dev (const hashcat_ctx_t *hashcat_ctx, const int backend_devices_idx)
 {
-  const opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
+  const backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
 
-  hc_device_param_t *device_param = &opencl_ctx->devices_param[device_id];
+  hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
 
   int innerloop_left = 0;
 
@@ -1746,11 +1746,11 @@ int status_get_innerloop_left_dev (const hashcat_ctx_t *hashcat_ctx, const int d
   return innerloop_left;
 }
 
-int status_get_iteration_pos_dev (const hashcat_ctx_t *hashcat_ctx, const int device_id)
+int status_get_iteration_pos_dev (const hashcat_ctx_t *hashcat_ctx, const int backend_devices_idx)
 {
-  const opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
+  const backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
 
-  hc_device_param_t *device_param = &opencl_ctx->devices_param[device_id];
+  hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
 
   int iteration_pos = 0;
 
@@ -1762,11 +1762,11 @@ int status_get_iteration_pos_dev (const hashcat_ctx_t *hashcat_ctx, const int de
   return iteration_pos;
 }
 
-int status_get_iteration_left_dev (const hashcat_ctx_t *hashcat_ctx, const int device_id)
+int status_get_iteration_left_dev (const hashcat_ctx_t *hashcat_ctx, const int backend_devices_idx)
 {
-  const opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
+  const backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
 
-  hc_device_param_t *device_param = &opencl_ctx->devices_param[device_id];
+  hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
 
   int iteration_left = 0;
 
@@ -1779,11 +1779,11 @@ int status_get_iteration_left_dev (const hashcat_ctx_t *hashcat_ctx, const int d
 }
 
 #ifdef WITH_BRAIN
-int status_get_brain_link_client_id_dev (const hashcat_ctx_t *hashcat_ctx, const int device_id)
+int status_get_brain_link_client_id_dev (const hashcat_ctx_t *hashcat_ctx, const int backend_devices_idx)
 {
-  const opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
+  const backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
 
-  hc_device_param_t *device_param = &opencl_ctx->devices_param[device_id];
+  hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
 
   int brain_client_id = -1;
 
@@ -1795,11 +1795,11 @@ int status_get_brain_link_client_id_dev (const hashcat_ctx_t *hashcat_ctx, const
   return brain_client_id;
 }
 
-int status_get_brain_link_status_dev (const hashcat_ctx_t *hashcat_ctx, const int device_id)
+int status_get_brain_link_status_dev (const hashcat_ctx_t *hashcat_ctx, const int backend_devices_idx)
 {
-  const opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
+  const backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
 
-  hc_device_param_t *device_param = &opencl_ctx->devices_param[device_id];
+  hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
 
   int brain_link_status_dev = 0;
 
@@ -1813,11 +1813,11 @@ int status_get_brain_link_status_dev (const hashcat_ctx_t *hashcat_ctx, const in
   return brain_link_status_dev;
 }
 
-char *status_get_brain_link_recv_bytes_dev (const hashcat_ctx_t *hashcat_ctx, const int device_id)
+char *status_get_brain_link_recv_bytes_dev (const hashcat_ctx_t *hashcat_ctx, const int backend_devices_idx)
 {
-  const opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
+  const backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
 
-  hc_device_param_t *device_param = &opencl_ctx->devices_param[device_id];
+  hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
 
   u64 brain_link_recv_bytes = 0;
 
@@ -1833,11 +1833,11 @@ char *status_get_brain_link_recv_bytes_dev (const hashcat_ctx_t *hashcat_ctx, co
   return display;
 }
 
-char *status_get_brain_link_send_bytes_dev (const hashcat_ctx_t *hashcat_ctx, const int device_id)
+char *status_get_brain_link_send_bytes_dev (const hashcat_ctx_t *hashcat_ctx, const int backend_devices_idx)
 {
-  const opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
+  const backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
 
-  hc_device_param_t *device_param = &opencl_ctx->devices_param[device_id];
+  hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
 
   u64 brain_link_send_bytes = 0;
 
@@ -1853,11 +1853,11 @@ char *status_get_brain_link_send_bytes_dev (const hashcat_ctx_t *hashcat_ctx, co
   return display;
 }
 
-char *status_get_brain_link_recv_bytes_sec_dev (const hashcat_ctx_t *hashcat_ctx, const int device_id)
+char *status_get_brain_link_recv_bytes_sec_dev (const hashcat_ctx_t *hashcat_ctx, const int backend_devices_idx)
 {
-  const opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
+  const backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
 
-  hc_device_param_t *device_param = &opencl_ctx->devices_param[device_id];
+  hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
 
   u64 brain_link_recv_bytes = 0;
 
@@ -1880,11 +1880,11 @@ char *status_get_brain_link_recv_bytes_sec_dev (const hashcat_ctx_t *hashcat_ctx
   return display;
 }
 
-char *status_get_brain_link_send_bytes_sec_dev (const hashcat_ctx_t *hashcat_ctx, const int device_id)
+char *status_get_brain_link_send_bytes_sec_dev (const hashcat_ctx_t *hashcat_ctx, const int backend_devices_idx)
 {
-  const opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
+  const backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
 
-  hc_device_param_t *device_param = &opencl_ctx->devices_param[device_id];
+  hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
 
   u64 brain_link_send_bytes = 0;
 
@@ -1908,11 +1908,11 @@ char *status_get_brain_link_send_bytes_sec_dev (const hashcat_ctx_t *hashcat_ctx
 }
 #endif
 
-char *status_get_hwmon_dev (const hashcat_ctx_t *hashcat_ctx, const int device_id)
+char *status_get_hwmon_dev (const hashcat_ctx_t *hashcat_ctx, const int backend_devices_idx)
 {
-  const opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
+  const backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
 
-  hc_device_param_t *device_param = &opencl_ctx->devices_param[device_id];
+  hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
 
   char *output_buf = (char *) hcmalloc (HCBUFSIZ_TINY);
 
@@ -1926,12 +1926,12 @@ char *status_get_hwmon_dev (const hashcat_ctx_t *hashcat_ctx, const int device_i
 
   hc_thread_mutex_lock (status_ctx->mux_hwmon);
 
-  const int num_temperature = hm_get_temperature_with_device_id ((hashcat_ctx_t *) hashcat_ctx, device_id);
-  const int num_fanspeed    = hm_get_fanspeed_with_device_id    ((hashcat_ctx_t *) hashcat_ctx, device_id);
-  const int num_utilization = hm_get_utilization_with_device_id ((hashcat_ctx_t *) hashcat_ctx, device_id);
-  const int num_corespeed   = hm_get_corespeed_with_device_id   ((hashcat_ctx_t *) hashcat_ctx, device_id);
-  const int num_memoryspeed = hm_get_memoryspeed_with_device_id ((hashcat_ctx_t *) hashcat_ctx, device_id);
-  const int num_buslanes    = hm_get_buslanes_with_device_id    ((hashcat_ctx_t *) hashcat_ctx, device_id);
+  const int num_temperature = hm_get_temperature_with_devices_idx ((hashcat_ctx_t *) hashcat_ctx, backend_devices_idx);
+  const int num_fanspeed    = hm_get_fanspeed_with_devices_idx    ((hashcat_ctx_t *) hashcat_ctx, backend_devices_idx);
+  const int num_utilization = hm_get_utilization_with_devices_idx ((hashcat_ctx_t *) hashcat_ctx, backend_devices_idx);
+  const int num_corespeed   = hm_get_corespeed_with_devices_idx   ((hashcat_ctx_t *) hashcat_ctx, backend_devices_idx);
+  const int num_memoryspeed = hm_get_memoryspeed_with_devices_idx ((hashcat_ctx_t *) hashcat_ctx, backend_devices_idx);
+  const int num_buslanes    = hm_get_buslanes_with_devices_idx    ((hashcat_ctx_t *) hashcat_ctx, backend_devices_idx);
 
   int output_len = 0;
 
@@ -1981,11 +1981,11 @@ char *status_get_hwmon_dev (const hashcat_ctx_t *hashcat_ctx, const int device_i
   return output_buf;
 }
 
-int status_get_corespeed_dev (const hashcat_ctx_t *hashcat_ctx, const int device_id)
+int status_get_corespeed_dev (const hashcat_ctx_t *hashcat_ctx, const int backend_devices_idx)
 {
-  const opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
+  const backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
 
-  hc_device_param_t *device_param = &opencl_ctx->devices_param[device_id];
+  hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
 
   if (device_param->skipped == true) return -1;
 
@@ -1995,18 +1995,18 @@ int status_get_corespeed_dev (const hashcat_ctx_t *hashcat_ctx, const int device
 
   hc_thread_mutex_lock (status_ctx->mux_hwmon);
 
-  const int num_corespeed = hm_get_corespeed_with_device_id ((hashcat_ctx_t *) hashcat_ctx, device_id);
+  const int num_corespeed = hm_get_corespeed_with_devices_idx ((hashcat_ctx_t *) hashcat_ctx, backend_devices_idx);
 
   hc_thread_mutex_unlock (status_ctx->mux_hwmon);
 
   return num_corespeed;
 }
 
-int status_get_memoryspeed_dev (const hashcat_ctx_t *hashcat_ctx, const int device_id)
+int status_get_memoryspeed_dev (const hashcat_ctx_t *hashcat_ctx, const int backend_devices_idx)
 {
-  const opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
+  const backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
 
-  hc_device_param_t *device_param = &opencl_ctx->devices_param[device_id];
+  hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
 
   if (device_param->skipped == true) return -1;
 
@@ -2016,18 +2016,18 @@ int status_get_memoryspeed_dev (const hashcat_ctx_t *hashcat_ctx, const int devi
 
   hc_thread_mutex_lock (status_ctx->mux_hwmon);
 
-  const int num_memoryspeed = hm_get_memoryspeed_with_device_id ((hashcat_ctx_t *) hashcat_ctx, device_id);
+  const int num_memoryspeed = hm_get_memoryspeed_with_devices_idx ((hashcat_ctx_t *) hashcat_ctx, backend_devices_idx);
 
   hc_thread_mutex_unlock (status_ctx->mux_hwmon);
 
   return num_memoryspeed;
 }
 
-u64 status_get_progress_dev (const hashcat_ctx_t *hashcat_ctx, const int device_id)
+u64 status_get_progress_dev (const hashcat_ctx_t *hashcat_ctx, const int backend_devices_idx)
 {
-  const opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
+  const backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
 
-  hc_device_param_t *device_param = &opencl_ctx->devices_param[device_id];
+  hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
 
   if (device_param->skipped == true) return 0;
 
@@ -2036,11 +2036,11 @@ u64 status_get_progress_dev (const hashcat_ctx_t *hashcat_ctx, const int device_
   return device_param->outerloop_left;
 }
 
-double status_get_runtime_msec_dev (const hashcat_ctx_t *hashcat_ctx, const int device_id)
+double status_get_runtime_msec_dev (const hashcat_ctx_t *hashcat_ctx, const int backend_devices_idx)
 {
-  const opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
+  const backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
 
-  hc_device_param_t *device_param = &opencl_ctx->devices_param[device_id];
+  hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
 
   if (device_param->skipped == true) return 0;
 
@@ -2049,11 +2049,11 @@ double status_get_runtime_msec_dev (const hashcat_ctx_t *hashcat_ctx, const int
   return device_param->outerloop_msec;
 }
 
-int status_get_kernel_accel_dev (const hashcat_ctx_t *hashcat_ctx, const int device_id)
+int status_get_kernel_accel_dev (const hashcat_ctx_t *hashcat_ctx, const int backend_devices_idx)
 {
-  const opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
+  const backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
 
-  hc_device_param_t *device_param = &opencl_ctx->devices_param[device_id];
+  hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
 
   if (device_param->skipped == true) return 0;
 
@@ -2064,11 +2064,11 @@ int status_get_kernel_accel_dev (const hashcat_ctx_t *hashcat_ctx, const int dev
   return device_param->kernel_accel;
 }
 
-int status_get_kernel_loops_dev (const hashcat_ctx_t *hashcat_ctx, const int device_id)
+int status_get_kernel_loops_dev (const hashcat_ctx_t *hashcat_ctx, const int backend_devices_idx)
 {
-  const opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
+  const backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
 
-  hc_device_param_t *device_param = &opencl_ctx->devices_param[device_id];
+  hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
 
   if (device_param->skipped == true) return 0;
 
@@ -2079,11 +2079,11 @@ int status_get_kernel_loops_dev (const hashcat_ctx_t *hashcat_ctx, const int dev
   return device_param->kernel_loops;
 }
 
-int status_get_kernel_threads_dev (const hashcat_ctx_t *hashcat_ctx, const int device_id)
+int status_get_kernel_threads_dev (const hashcat_ctx_t *hashcat_ctx, const int backend_devices_idx)
 {
-  const opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
+  const backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
 
-  hc_device_param_t *device_param = &opencl_ctx->devices_param[device_id];
+  hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
 
   if (device_param->skipped == true) return 0;
 
@@ -2092,11 +2092,11 @@ int status_get_kernel_threads_dev (const hashcat_ctx_t *hashcat_ctx, const int d
   return device_param->kernel_threads;
 }
 
-int status_get_vector_width_dev (const hashcat_ctx_t *hashcat_ctx, const int device_id)
+int status_get_vector_width_dev (const hashcat_ctx_t *hashcat_ctx, const int backend_devices_idx)
 {
-  const opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
+  const backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
 
-  hc_device_param_t *device_param = &opencl_ctx->devices_param[device_id];
+  hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
 
   if (device_param->skipped == true) return 0;
 
diff --git a/src/stdout.c b/src/stdout.c
index dd31d3034..d3fbcb0c7 100644
--- a/src/stdout.c
+++ b/src/stdout.c
@@ -10,7 +10,7 @@
 #include "emu_inc_rp.h"
 #include "emu_inc_rp_optimized.h"
 #include "mpsp.h"
-#include "opencl.h"
+#include "backend.h"
 #include "shared.h"
 #include "stdout.h"
 
diff --git a/src/straight.c b/src/straight.c
index 7b45825cd..0581838ec 100644
--- a/src/straight.c
+++ b/src/straight.c
@@ -214,7 +214,7 @@ int straight_ctx_init (hashcat_ctx_t *hashcat_ctx)
 
   if (user_options->example_hashes == true) return 0;
   if (user_options->left           == true) return 0;
-  if (user_options->opencl_info    == true) return 0;
+  if (user_options->backend_info   == true) return 0;
   if (user_options->show           == true) return 0;
   if (user_options->usage          == true) return 0;
   if (user_options->version        == true) return 0;
diff --git a/src/terminal.c b/src/terminal.c
index 94b5b223a..2ffd2df55 100644
--- a/src/terminal.c
+++ b/src/terminal.c
@@ -654,91 +654,188 @@ void example_hashes (hashcat_ctx_t *hashcat_ctx)
   }
 }
 
-void opencl_info (hashcat_ctx_t *hashcat_ctx)
+void backend_info (hashcat_ctx_t *hashcat_ctx)
 {
-  const opencl_ctx_t *opencl_ctx = hashcat_ctx->opencl_ctx;
+  const backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
 
-  event_log_info (hashcat_ctx, "OpenCL Info:");
-  event_log_info (hashcat_ctx, NULL);
-
-  cl_uint         platforms_cnt         = opencl_ctx->platforms_cnt;
-  cl_platform_id *platforms             = opencl_ctx->platforms;
-  char          **platforms_vendor      = opencl_ctx->platforms_vendor;
-  char          **platforms_name        = opencl_ctx->platforms_name;
-  char          **platforms_version     = opencl_ctx->platforms_version;
-  cl_uint         devices_cnt           = opencl_ctx->devices_cnt;
-
-  for (cl_uint platforms_idx = 0; platforms_idx < platforms_cnt; platforms_idx++)
+  if (backend_ctx->cuda)
   {
-    cl_platform_id platform_id       = platforms[platforms_idx];
-    char          *platform_vendor   = platforms_vendor[platforms_idx];
-    char          *platform_name     = platforms_name[platforms_idx];
-    char          *platform_version  = platforms_version[platforms_idx];
-
-    event_log_info (hashcat_ctx, "Platform ID #%u", platforms_idx + 1);
-    event_log_info (hashcat_ctx, "  Vendor  : %s",  platform_vendor);
-    event_log_info (hashcat_ctx, "  Name    : %s",  platform_name);
-    event_log_info (hashcat_ctx, "  Version : %s",  platform_version);
+    event_log_info (hashcat_ctx, "CUDA Info:");
+    event_log_info (hashcat_ctx, "==========");
     event_log_info (hashcat_ctx, NULL);
 
-    for (cl_uint devices_idx = 0; devices_idx < devices_cnt; devices_idx++)
+    int cuda_devices_cnt    = backend_ctx->cuda_devices_cnt;
+    int cuda_driver_version = backend_ctx->cuda_driver_version;
+
+    event_log_info (hashcat_ctx, "CUDA.Version.: %d.%d", cuda_driver_version / 1000, (cuda_driver_version % 100) / 10);
+    event_log_info (hashcat_ctx, NULL);
+
+    for (int cuda_devices_idx = 0; cuda_devices_idx < cuda_devices_cnt; cuda_devices_idx++)
     {
-      const hc_device_param_t *device_param = opencl_ctx->devices_param + devices_idx;
+      const int backend_devices_idx = backend_ctx->backend_device_from_cuda[cuda_devices_idx];
 
-      if (device_param->platform != platform_id) continue;
+      const hc_device_param_t *device_param = backend_ctx->devices_param + backend_devices_idx;
 
-      cl_device_type device_type                = device_param->device_type;
-      cl_uint        device_vendor_id           = device_param->device_vendor_id;
-      char          *device_vendor              = device_param->device_vendor;
-      char          *device_name                = device_param->device_name;
-      u32            device_processors          = device_param->device_processors;
-      u32            device_maxclock_frequency  = device_param->device_maxclock_frequency;
-      u64            device_maxmem_alloc        = device_param->device_maxmem_alloc;
-      u64            device_global_mem          = device_param->device_global_mem;
-      char          *device_opencl_version      = device_param->device_opencl_version;
-      char          *device_version             = device_param->device_version;
-      char          *driver_version             = device_param->driver_version;
+      int   device_id                 = device_param->device_id;
+      char *device_name               = device_param->device_name;
+      u32   device_processors         = device_param->device_processors;
+      u32   device_maxclock_frequency = device_param->device_maxclock_frequency;
+      u64   device_global_mem         = device_param->device_global_mem;
 
-      event_log_info (hashcat_ctx, "  Device ID #%u",         devices_idx + 1);
-      event_log_info (hashcat_ctx, "    Type           : %s", ((device_type & CL_DEVICE_TYPE_CPU) ? "CPU" : ((device_type & CL_DEVICE_TYPE_GPU) ? "GPU" : "Accelerator")));
-      event_log_info (hashcat_ctx, "    Vendor ID      : %u", device_vendor_id);
-      event_log_info (hashcat_ctx, "    Vendor         : %s", device_vendor);
-      event_log_info (hashcat_ctx, "    Name           : %s", device_name);
-      event_log_info (hashcat_ctx, "    Version        : %s", device_version);
-      event_log_info (hashcat_ctx, "    Processor(s)   : %u", device_processors);
-      event_log_info (hashcat_ctx, "    Clock          : %u", device_maxclock_frequency);
-      event_log_info (hashcat_ctx, "    Memory         : %" PRIu64 "/%" PRIu64 " MB allocatable", device_maxmem_alloc / 1024 / 1024, device_global_mem / 1024 / 1024);
-      event_log_info (hashcat_ctx, "    OpenCL Version : %s", device_opencl_version);
-      event_log_info (hashcat_ctx, "    Driver Version : %s", driver_version);
+      if (device_param->device_id_alias_cnt)
+      {
+        event_log_info (hashcat_ctx, "Backend Device ID #%d (Alias: #%d)", device_id + 1, device_param->device_id_alias_buf[0] + 1);
+      }
+      else
+      {
+        event_log_info (hashcat_ctx, "Backend Device ID #%d", device_id + 1);
+      }
+
+      event_log_info (hashcat_ctx, "  Name...........: %s", device_name);
+      event_log_info (hashcat_ctx, "  Processor(s)...: %u", device_processors);
+      event_log_info (hashcat_ctx, "  Clock..........: %u", device_maxclock_frequency);
+      event_log_info (hashcat_ctx, "  Memory.........: %" PRIu64 " MB", device_global_mem / 1024 / 1024);
       event_log_info (hashcat_ctx, NULL);
     }
   }
+
+  if (backend_ctx->ocl)
+  {
+    event_log_info (hashcat_ctx, "OpenCL Info:");
+    event_log_info (hashcat_ctx, "============");
+    event_log_info (hashcat_ctx, NULL);
+
+    cl_uint   opencl_platforms_cnt         = backend_ctx->opencl_platforms_cnt;
+    cl_uint  *opencl_platforms_devices_cnt = backend_ctx->opencl_platforms_devices_cnt;
+    char    **opencl_platforms_name        = backend_ctx->opencl_platforms_name;
+    char    **opencl_platforms_vendor      = backend_ctx->opencl_platforms_vendor;
+    char    **opencl_platforms_version     = backend_ctx->opencl_platforms_version;
+
+    for (cl_uint opencl_platforms_idx = 0; opencl_platforms_idx < opencl_platforms_cnt; opencl_platforms_idx++)
+    {
+      char     *opencl_platform_vendor       = opencl_platforms_vendor[opencl_platforms_idx];
+      char     *opencl_platform_name         = opencl_platforms_name[opencl_platforms_idx];
+      char     *opencl_platform_version      = opencl_platforms_version[opencl_platforms_idx];
+      cl_uint   opencl_platform_devices_cnt  = opencl_platforms_devices_cnt[opencl_platforms_idx];
+
+      event_log_info (hashcat_ctx, "OpenCL Platform ID #%u", opencl_platforms_idx + 1);
+      event_log_info (hashcat_ctx, "  Vendor..: %s",  opencl_platform_vendor);
+      event_log_info (hashcat_ctx, "  Name....: %s",  opencl_platform_name);
+      event_log_info (hashcat_ctx, "  Version.: %s",  opencl_platform_version);
+      event_log_info (hashcat_ctx, NULL);
+
+      for (cl_uint opencl_platform_devices_idx = 0; opencl_platform_devices_idx < opencl_platform_devices_cnt; opencl_platform_devices_idx++)
+      {
+        const int backend_devices_idx = backend_ctx->backend_device_from_opencl_platform[opencl_platforms_idx][opencl_platform_devices_idx];
+
+        const hc_device_param_t *device_param = backend_ctx->devices_param + backend_devices_idx;
+
+        int            device_id                  = device_param->device_id;
+        char          *device_name                = device_param->device_name;
+        u32            device_processors          = device_param->device_processors;
+        u32            device_maxclock_frequency  = device_param->device_maxclock_frequency;
+        u64            device_maxmem_alloc        = device_param->device_maxmem_alloc;
+        u64            device_global_mem          = device_param->device_global_mem;
+        cl_device_type opencl_device_type         = device_param->opencl_device_type;
+        cl_uint        opencl_device_vendor_id    = device_param->opencl_device_vendor_id;
+        char          *opencl_device_vendor       = device_param->opencl_device_vendor;
+        char          *opencl_device_c_version    = device_param->opencl_device_c_version;
+        char          *opencl_device_version      = device_param->opencl_device_version;
+        char          *opencl_driver_version      = device_param->opencl_driver_version;
+
+        if (device_param->device_id_alias_cnt)
+        {
+          event_log_info (hashcat_ctx, "  Backend Device ID #%d (Alias: #%d)", device_id + 1, device_param->device_id_alias_buf[0] + 1);
+        }
+        else
+        {
+          event_log_info (hashcat_ctx, "  Backend Device ID #%d", device_id + 1);
+        }
+
+        event_log_info (hashcat_ctx, "    Type...........: %s", ((opencl_device_type & CL_DEVICE_TYPE_CPU) ? "CPU" : ((opencl_device_type & CL_DEVICE_TYPE_GPU) ? "GPU" : "Accelerator")));
+        event_log_info (hashcat_ctx, "    Vendor.ID......: %u", opencl_device_vendor_id);
+        event_log_info (hashcat_ctx, "    Vendor.........: %s", opencl_device_vendor);
+        event_log_info (hashcat_ctx, "    Name...........: %s", device_name);
+        event_log_info (hashcat_ctx, "    Version........: %s", opencl_device_version);
+        event_log_info (hashcat_ctx, "    Processor(s)...: %u", device_processors);
+        event_log_info (hashcat_ctx, "    Clock..........: %u", device_maxclock_frequency);
+        event_log_info (hashcat_ctx, "    Memory.........: %" PRIu64 "/%" PRIu64 " MB allocatable", device_maxmem_alloc / 1024 / 1024, device_global_mem / 1024 / 1024);
+        event_log_info (hashcat_ctx, "    OpenCL.Version.: %s", opencl_device_c_version);
+        event_log_info (hashcat_ctx, "    Driver.Version.: %s", opencl_driver_version);
+        event_log_info (hashcat_ctx, NULL);
+      }
+    }
+  }
 }
 
-void opencl_info_compact (hashcat_ctx_t *hashcat_ctx)
+void backend_info_compact (hashcat_ctx_t *hashcat_ctx)
 {
-  const opencl_ctx_t   *opencl_ctx   = hashcat_ctx->opencl_ctx;
+  const backend_ctx_t  *backend_ctx  = hashcat_ctx->backend_ctx;
   const user_options_t *user_options = hashcat_ctx->user_options;
 
   if (user_options->quiet            == true) return;
   if (user_options->machine_readable == true) return;
   if (user_options->status_json      == true) return;
 
-  cl_uint         platforms_cnt         = opencl_ctx->platforms_cnt;
-  cl_platform_id *platforms             = opencl_ctx->platforms;
-  char          **platforms_vendor      = opencl_ctx->platforms_vendor;
-  bool           *platforms_skipped     = opencl_ctx->platforms_skipped;
-  cl_uint         devices_cnt           = opencl_ctx->devices_cnt;
-
-  for (cl_uint platforms_idx = 0; platforms_idx < platforms_cnt; platforms_idx++)
+  if (backend_ctx->cuda)
   {
-    cl_platform_id platform_id       = platforms[platforms_idx];
-    char          *platform_vendor   = platforms_vendor[platforms_idx];
-    bool           platform_skipped  = platforms_skipped[platforms_idx];
+    int cuda_devices_cnt    = backend_ctx->cuda_devices_cnt;
+    int cuda_driver_version = backend_ctx->cuda_driver_version;
 
-    if (platform_skipped == false)
+    const size_t len = event_log_info (hashcat_ctx, "CUDA API (CUDA %d.%d)", cuda_driver_version / 1000, (cuda_driver_version % 100) / 10);
+
+    char line[HCBUFSIZ_TINY];
+
+    memset (line, '=', len);
+
+    line[len] = 0;
+
+    event_log_info (hashcat_ctx, "%s", line);
+
+    for (int cuda_devices_idx = 0; cuda_devices_idx < cuda_devices_cnt; cuda_devices_idx++)
     {
-      const size_t len = event_log_info (hashcat_ctx, "OpenCL Platform #%u: %s", platforms_idx + 1, platform_vendor);
+      const int backend_devices_idx = backend_ctx->backend_device_from_cuda[cuda_devices_idx];
+
+      const hc_device_param_t *device_param = backend_ctx->devices_param + backend_devices_idx;
+
+      int   device_id         = device_param->device_id;
+      char *device_name       = device_param->device_name;
+      u32   device_processors = device_param->device_processors;
+      u64   device_global_mem = device_param->device_global_mem;
+
+      if ((device_param->skipped == false) && (device_param->skipped_warning == false))
+      {
+        event_log_info (hashcat_ctx, "* Device #%u: %s, %" PRIu64 " MB, %uMCU",
+                  device_id + 1,
+                  device_name,
+                  device_global_mem   / 1024 / 1024,
+                  device_processors);
+      }
+      else
+      {
+        event_log_info (hashcat_ctx, "* Device #%u: %s, skipped",
+                  device_id + 1,
+                  device_name);
+      }
+    }
+
+    event_log_info (hashcat_ctx, NULL);
+  }
+
+  if (backend_ctx->ocl)
+  {
+    cl_uint   opencl_platforms_cnt         = backend_ctx->opencl_platforms_cnt;
+    cl_uint  *opencl_platforms_devices_cnt = backend_ctx->opencl_platforms_devices_cnt;
+    char    **opencl_platforms_vendor      = backend_ctx->opencl_platforms_vendor;
+    char    **opencl_platforms_version     = backend_ctx->opencl_platforms_version;
+
+    for (cl_uint opencl_platforms_idx = 0; opencl_platforms_idx < opencl_platforms_cnt; opencl_platforms_idx++)
+    {
+      char     *opencl_platform_vendor       = opencl_platforms_vendor[opencl_platforms_idx];
+      char     *opencl_platform_version      = opencl_platforms_version[opencl_platforms_idx];
+      cl_uint   opencl_platform_devices_cnt  = opencl_platforms_devices_cnt[opencl_platforms_idx];
+
+      const size_t len = event_log_info (hashcat_ctx, "OpenCL API (%s) - Platform #%u [%s]", opencl_platform_version, opencl_platforms_idx + 1, opencl_platform_vendor);
 
       char line[HCBUFSIZ_TINY];
 
@@ -747,41 +844,38 @@ void opencl_info_compact (hashcat_ctx_t *hashcat_ctx)
       line[len] = 0;
 
       event_log_info (hashcat_ctx, "%s", line);
-    }
-    else
-    {
-      event_log_info (hashcat_ctx, "OpenCL Platform #%u: %s, skipped or no OpenCL compatible devices found.", platforms_idx + 1, platform_vendor);
-    }
 
-    for (cl_uint devices_idx = 0; devices_idx < devices_cnt; devices_idx++)
-    {
-      const hc_device_param_t *device_param = opencl_ctx->devices_param + devices_idx;
-
-      if (device_param->platform != platform_id) continue;
-
-      char *device_name         = device_param->device_name;
-      u32   device_processors   = device_param->device_processors;
-      u64   device_maxmem_alloc = device_param->device_maxmem_alloc;
-      u64   device_global_mem   = device_param->device_global_mem;
-
-      if ((device_param->skipped == false) && (device_param->skipped_warning == false))
+      for (cl_uint opencl_platform_devices_idx = 0; opencl_platform_devices_idx < opencl_platform_devices_cnt; opencl_platform_devices_idx++)
       {
-        event_log_info (hashcat_ctx, "* Device #%u: %s, %" PRIu64 "/%" PRIu64 " MB allocatable, %uMCU",
-                  devices_idx + 1,
-                  device_name,
-                  device_maxmem_alloc / 1024 / 1024,
-                  device_global_mem   / 1024 / 1024,
-                  device_processors);
-      }
-      else
-      {
-        event_log_info (hashcat_ctx, "* Device #%u: %s, skipped.",
-                  devices_idx + 1,
-                  device_name);
-      }
-    }
+        const int backend_devices_idx = backend_ctx->backend_device_from_opencl_platform[opencl_platforms_idx][opencl_platform_devices_idx];
 
-    event_log_info (hashcat_ctx, NULL);
+        const hc_device_param_t *device_param = backend_ctx->devices_param + backend_devices_idx;
+
+        int   device_id           = device_param->device_id;
+        char *device_name         = device_param->device_name;
+        u32   device_processors   = device_param->device_processors;
+        u64   device_maxmem_alloc = device_param->device_maxmem_alloc;
+        u64   device_global_mem   = device_param->device_global_mem;
+
+        if ((device_param->skipped == false) && (device_param->skipped_warning == false))
+        {
+          event_log_info (hashcat_ctx, "* Device #%u: %s, %" PRIu64 "/%" PRIu64 " MB allocatable, %uMCU",
+                    device_id + 1,
+                    device_name,
+                    device_maxmem_alloc / 1024 / 1024,
+                    device_global_mem   / 1024 / 1024,
+                    device_processors);
+        }
+        else
+        {
+          event_log_info (hashcat_ctx, "* Device #%u: %s, skipped",
+                    device_id + 1,
+                    device_name);
+        }
+      }
+
+      event_log_info (hashcat_ctx, NULL);
+    }
   }
 }
 
@@ -851,7 +945,7 @@ void status_display_machine_readable (hashcat_ctx_t *hashcat_ctx)
 
       if (device_info->skipped_warning_dev == true) continue;
 
-      const int temp = hm_get_temperature_with_device_id (hashcat_ctx, device_id);
+      const int temp = hm_get_temperature_with_devices_idx (hashcat_ctx, device_id);
 
       printf ("%d\t", temp);
     }
@@ -871,7 +965,7 @@ void status_display_machine_readable (hashcat_ctx_t *hashcat_ctx)
 
     // ok, little cheat here again...
 
-    const int util = hm_get_utilization_with_device_id (hashcat_ctx, device_id);
+    const int util = hm_get_utilization_with_devices_idx (hashcat_ctx, device_id);
 
     printf ("%d\t", util);
   }
@@ -948,12 +1042,12 @@ void status_display_status_json (hashcat_ctx_t *hashcat_ctx)
 
     if (hwmon_ctx->enabled == true)
     {
-      const int temp = hm_get_temperature_with_device_id (hashcat_ctx, device_id);
+      const int temp = hm_get_temperature_with_devices_idx (hashcat_ctx, device_id);
 
       printf (" \"temp\": %d,", temp);
     }
 
-    const int util = hm_get_utilization_with_device_id (hashcat_ctx, device_id);
+    const int util = hm_get_utilization_with_devices_idx (hashcat_ctx, device_id);
 
     printf (" \"util\": %d }", util);
 
diff --git a/src/tuningdb.c b/src/tuningdb.c
index 91425f306..db88cf57a 100644
--- a/src/tuningdb.c
+++ b/src/tuningdb.c
@@ -57,7 +57,7 @@ int tuning_db_init (hashcat_ctx_t *hashcat_ctx)
   if (user_options->example_hashes == true) return 0;
   if (user_options->keyspace       == true) return 0;
   if (user_options->left           == true) return 0;
-  if (user_options->opencl_info    == true) return 0;
+  if (user_options->backend_info   == true) return 0;
   if (user_options->show           == true) return 0;
   if (user_options->usage          == true) return 0;
   if (user_options->version        == true) return 0;
diff --git a/src/usage.c b/src/usage.c
index 51e764819..02cf4b179 100644
--- a/src/usage.c
+++ b/src/usage.c
@@ -89,16 +89,15 @@ static const char *const USAGE_BIG_PRE_HASHMODES[] =
   "     --bitmap-max               | Num  | Sets maximum bits allowed for bitmaps to X           | --bitmap-max=24",
   "     --cpu-affinity             | Str  | Locks to CPU devices, separated with commas          | --cpu-affinity=1,2,3",
   "     --example-hashes           |      | Show an example hash for each hash-mode              |",
-  " -I, --opencl-info              |      | Show info about detected OpenCL platforms/devices    | -I",
-  "     --opencl-platforms         | Str  | OpenCL platforms to use, separated with commas       | --opencl-platforms=2",
-  " -d, --opencl-devices           | Str  | OpenCL devices to use, separated with commas         | -d 1",
+  " -I, --backend-info             |      | Show info about detected backend API devices         | -I",
+  " -d, --backend-devices          | Str  | Backend devices to use, separated with commas        | -d 1",
   " -D, --opencl-device-types      | Str  | OpenCL device-types to use, separated with commas    | -D 1",
-  "     --opencl-vector-width      | Num  | Manually override OpenCL vector-width to X           | --opencl-vector=4",
   " -O, --optimized-kernel-enable  |      | Enable optimized kernels (limits password length)    |",
   " -w, --workload-profile         | Num  | Enable a specific workload profile, see pool below   | -w 3",
   " -n, --kernel-accel             | Num  | Manual workload tuning, set outerloop step size to X | -n 64",
   " -u, --kernel-loops             | Num  | Manual workload tuning, set innerloop step size to X | -u 256",
   " -T, --kernel-threads           | Num  | Manual workload tuning, set thread count to X        | -T 64",
+  "     --backend-vector-width     | Num  | Manually override backend vector-width to X          | --backend-vector=4",
   "     --spin-damp                | Num  | Use CPU for device synchronization, in percent       | --spin-damp=50",
   "     --hwmon-disable            |      | Disable temperature and fanspeed reads and triggers  |",
   "     --hwmon-temp-abort         | Num  | Abort if temperature reaches X degrees Celsius       | --hwmon-temp-abort=100",
@@ -199,7 +198,7 @@ static const char *const USAGE_BIG_POST_HASHMODES[] =
   "  d | 0123456789",
   "  h | 0123456789abcdef",
   "  H | 0123456789ABCDEF",
-  "  s |  !\"#$%%&'()*+,-./:;<=>?@[\\]^_`{|}~",
+  "  s |  !\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~",
   "  a | ?l?u?d?s",
   "  b | 0x00 - 0xff",
   "",
diff --git a/src/user_options.c b/src/user_options.c
index 755e636db..89a4b14b2 100644
--- a/src/user_options.c
+++ b/src/user_options.c
@@ -11,7 +11,7 @@
 #include "interface.h"
 #include "shared.h"
 #include "usage.h"
-#include "opencl.h"
+#include "backend.h"
 #include "user_options.h"
 
 #ifdef WITH_BRAIN
@@ -28,6 +28,9 @@ static const struct option long_options[] =
 {
   {"advice-disable",            no_argument,       NULL, IDX_ADVICE_DISABLE},
   {"attack-mode",               required_argument, NULL, IDX_ATTACK_MODE},
+  {"backend-devices",           required_argument, NULL, IDX_BACKEND_DEVICES},
+  {"backend-info",              no_argument,       NULL, IDX_BACKEND_INFO},
+  {"backend-vector-width",      required_argument, NULL, IDX_BACKEND_VECTOR_WIDTH},
   {"benchmark-all",             no_argument,       NULL, IDX_BENCHMARK_ALL},
   {"benchmark",                 no_argument,       NULL, IDX_BENCHMARK},
   {"bitmap-max",                required_argument, NULL, IDX_BITMAP_MAX},
@@ -75,11 +78,7 @@ static const struct option long_options[] =
   {"markov-hcstat2",            required_argument, NULL, IDX_MARKOV_HCSTAT2},
   {"markov-threshold",          required_argument, NULL, IDX_MARKOV_THRESHOLD},
   {"nonce-error-corrections",   required_argument, NULL, IDX_NONCE_ERROR_CORRECTIONS},
-  {"opencl-devices",            required_argument, NULL, IDX_OPENCL_DEVICES},
   {"opencl-device-types",       required_argument, NULL, IDX_OPENCL_DEVICE_TYPES},
-  {"opencl-info",               no_argument,       NULL, IDX_OPENCL_INFO},
-  {"opencl-platforms",          required_argument, NULL, IDX_OPENCL_PLATFORMS},
-  {"opencl-vector-width",       required_argument, NULL, IDX_OPENCL_VECTOR_WIDTH},
   {"optimized-kernel-enable",   no_argument,       NULL, IDX_OPTIMIZED_KERNEL_ENABLE},
   {"outfile-autohex-disable",   no_argument,       NULL, IDX_OUTFILE_AUTOHEX_DISABLE},
   {"outfile-check-dir",         required_argument, NULL, IDX_OUTFILE_CHECK_DIR},
@@ -152,6 +151,9 @@ int user_options_init (hashcat_ctx_t *hashcat_ctx)
 
   user_options->advice_disable            = ADVICE_DISABLE;
   user_options->attack_mode               = ATTACK_MODE;
+  user_options->backend_devices           = NULL;
+  user_options->backend_info              = BACKEND_INFO;
+  user_options->backend_vector_width      = BACKEND_VECTOR_WIDTH;
   user_options->benchmark_all             = BENCHMARK_ALL;
   user_options->benchmark                 = BENCHMARK;
   user_options->bitmap_max                = BITMAP_MAX;
@@ -203,11 +205,7 @@ int user_options_init (hashcat_ctx_t *hashcat_ctx)
   user_options->markov_hcstat2            = NULL;
   user_options->markov_threshold          = MARKOV_THRESHOLD;
   user_options->nonce_error_corrections   = NONCE_ERROR_CORRECTIONS;
-  user_options->opencl_devices            = NULL;
   user_options->opencl_device_types       = NULL;
-  user_options->opencl_info               = OPENCL_INFO;
-  user_options->opencl_platforms          = NULL;
-  user_options->opencl_vector_width       = OPENCL_VECTOR_WIDTH;
   user_options->optimized_kernel_enable   = OPTIMIZED_KERNEL_ENABLE;
   user_options->outfile_autohex           = OUTFILE_AUTOHEX;
   user_options->outfile_check_dir         = NULL;
@@ -306,7 +304,7 @@ int user_options_getopt (hashcat_ctx_t *hashcat_ctx, int argc, char **argv)
       case IDX_MARKOV_THRESHOLD:
       case IDX_OUTFILE_FORMAT:
       case IDX_OUTFILE_CHECK_TIMER:
-      case IDX_OPENCL_VECTOR_WIDTH:
+      case IDX_BACKEND_VECTOR_WIDTH:
       case IDX_WORKLOAD_PROFILE:
       case IDX_KERNEL_ACCEL:
       case IDX_KERNEL_LOOPS:
@@ -425,12 +423,11 @@ int user_options_getopt (hashcat_ctx_t *hashcat_ctx, int argc, char **argv)
       case IDX_HEX_SALT:                  user_options->hex_salt                  = true;                            break;
       case IDX_HEX_WORDLIST:              user_options->hex_wordlist              = true;                            break;
       case IDX_CPU_AFFINITY:              user_options->cpu_affinity              = optarg;                          break;
-      case IDX_OPENCL_INFO:               user_options->opencl_info               = true;                            break;
-      case IDX_OPENCL_DEVICES:            user_options->opencl_devices            = optarg;                          break;
-      case IDX_OPENCL_PLATFORMS:          user_options->opencl_platforms          = optarg;                          break;
+      case IDX_BACKEND_INFO:              user_options->backend_info              = true;                            break;
+      case IDX_BACKEND_DEVICES:           user_options->backend_devices           = optarg;                          break;
+      case IDX_BACKEND_VECTOR_WIDTH:      user_options->backend_vector_width      = hc_strtoul (optarg, NULL, 10);
+                                          user_options->backend_vector_width_chgd = true;                            break;
       case IDX_OPENCL_DEVICE_TYPES:       user_options->opencl_device_types       = optarg;                          break;
-      case IDX_OPENCL_VECTOR_WIDTH:       user_options->opencl_vector_width       = hc_strtoul (optarg, NULL, 10);
-                                          user_options->opencl_vector_width_chgd  = true;                            break;
       case IDX_OPTIMIZED_KERNEL_ENABLE:   user_options->optimized_kernel_enable   = true;                            break;
       case IDX_WORKLOAD_PROFILE:          user_options->workload_profile          = hc_strtoul (optarg, NULL, 10);
                                           user_options->workload_profile_chgd     = true;                            break;
@@ -859,11 +856,11 @@ int user_options_sanity (hashcat_ctx_t *hashcat_ctx)
     return -1;
   }
 
-  if (user_options->opencl_vector_width_chgd == true)
+  if (user_options->backend_vector_width_chgd == true)
   {
-    if (is_power_of_2 (user_options->opencl_vector_width) == false || user_options->opencl_vector_width > 16)
+    if (is_power_of_2 (user_options->backend_vector_width) == false || user_options->backend_vector_width > 16)
     {
-      event_log_error (hashcat_ctx, "opencl-vector-width %u is not allowed.", user_options->opencl_vector_width);
+      event_log_error (hashcat_ctx, "backend-vector-width %u is not allowed.", user_options->backend_vector_width);
 
       return -1;
     }
@@ -1090,21 +1087,11 @@ int user_options_sanity (hashcat_ctx_t *hashcat_ctx)
     }
   }
 
-  if (user_options->opencl_platforms != NULL)
+  if (user_options->backend_devices != NULL)
   {
-    if (strlen (user_options->opencl_platforms) == 0)
+    if (strlen (user_options->backend_devices) == 0)
     {
-      event_log_error (hashcat_ctx, "Invalid --opencl-platforms value - must not be empty.");
-
-      return -1;
-    }
-  }
-
-  if (user_options->opencl_devices != NULL)
-  {
-    if (strlen (user_options->opencl_devices) == 0)
-    {
-      event_log_error (hashcat_ctx, "Invalid --opencl-devices value - must not be empty.");
+      event_log_error (hashcat_ctx, "Invalid --backend-devices value - must not be empty.");
 
       return -1;
     }
@@ -1233,7 +1220,7 @@ int user_options_sanity (hashcat_ctx_t *hashcat_ctx)
       show_error = false;
     }
   }
-  else if (user_options->opencl_info == true)
+  else if (user_options->backend_info == true)
   {
     if (user_options->hc_argc == 0)
     {
@@ -1433,9 +1420,9 @@ void user_options_session_auto (hashcat_ctx_t *hashcat_ctx)
       user_options->session = "stdout";
     }
 
-    if (user_options->opencl_info == true)
+    if (user_options->backend_info == true)
     {
-      user_options->session = "opencl_info";
+      user_options->session = "backend_info";
     }
 
     if (user_options->show == true)
@@ -1482,7 +1469,7 @@ void user_options_preprocess (hashcat_ctx_t *hashcat_ctx)
   }
 
   if (user_options->example_hashes  == true
-   || user_options->opencl_info     == true
+   || user_options->backend_info    == true
    || user_options->keyspace        == true
    || user_options->speed_only      == true
    || user_options->progress_only   == true
@@ -1559,17 +1546,17 @@ void user_options_preprocess (hashcat_ctx_t *hashcat_ctx)
 
   if (user_options->slow_candidates == true)
   {
-    user_options->opencl_vector_width = 1;
+    user_options->backend_vector_width = 1;
   }
 
   if (user_options->stdout_flag == true)
   {
-    user_options->force               = true;
-    user_options->hash_mode           = 2000;
-    user_options->kernel_accel        = 1024;
-    user_options->opencl_vector_width = 1;
-    user_options->outfile_format      = OUTFILE_FMT_PLAIN;
-    user_options->quiet               = true;
+    user_options->force                 = true;
+    user_options->hash_mode             = 2000;
+    user_options->kernel_accel          = 1024;
+    user_options->backend_vector_width  = 1;
+    user_options->outfile_format        = OUTFILE_FMT_PLAIN;
+    user_options->quiet                 = true;
 
     if (user_options->attack_mode == ATTACK_MODE_STRAIGHT)
     {
@@ -1593,11 +1580,10 @@ void user_options_preprocess (hashcat_ctx_t *hashcat_ctx)
     }
   }
 
-  if (user_options->opencl_info == true)
+  if (user_options->backend_info == true)
   {
-    user_options->opencl_devices      = NULL;
+    user_options->backend_devices     = NULL;
     user_options->opencl_device_types = hcstrdup ("1,2,3");
-    user_options->opencl_platforms    = NULL;
     user_options->quiet               = true;
   }
 
@@ -1652,7 +1638,7 @@ void user_options_preprocess (hashcat_ctx_t *hashcat_ctx)
     {
 
     }
-    else if (user_options->opencl_info == true)
+    else if (user_options->backend_info == true)
     {
 
     }
@@ -1732,9 +1718,9 @@ void user_options_info (hashcat_ctx_t *hashcat_ctx)
       event_log_info (hashcat_ctx, "* --force");
     }
 
-    if (user_options->opencl_devices)
+    if (user_options->backend_devices)
     {
-      event_log_info (hashcat_ctx, "* --opencl-devices=%s", user_options->opencl_devices);
+      event_log_info (hashcat_ctx, "* --backend-devices=%s", user_options->backend_devices);
     }
 
     if (user_options->opencl_device_types)
@@ -1742,19 +1728,14 @@ void user_options_info (hashcat_ctx_t *hashcat_ctx)
       event_log_info (hashcat_ctx, "* --opencl-device-types=%s", user_options->opencl_device_types);
     }
 
-    if (user_options->opencl_platforms)
-    {
-      event_log_info (hashcat_ctx, "* --opencl-platforms=%s", user_options->opencl_platforms);
-    }
-
     if (user_options->optimized_kernel_enable == true)
     {
       event_log_info (hashcat_ctx, "* --optimized-kernel-enable");
     }
 
-    if (user_options->opencl_vector_width_chgd == true)
+    if (user_options->backend_vector_width_chgd == true)
     {
-      event_log_info (hashcat_ctx, "* --opencl-vector-width=%u", user_options->opencl_vector_width);
+      event_log_info (hashcat_ctx, "* --backend-vector-width=%u", user_options->backend_vector_width);
     }
 
     if (user_options->kernel_accel_chgd == true)
@@ -1791,9 +1772,9 @@ void user_options_info (hashcat_ctx_t *hashcat_ctx)
       event_log_info (hashcat_ctx, "# option: --force");
     }
 
-    if (user_options->opencl_devices)
+    if (user_options->backend_devices)
     {
-      event_log_info (hashcat_ctx, "# option: --opencl-devices=%s", user_options->opencl_devices);
+      event_log_info (hashcat_ctx, "# option: --backend-devices=%s", user_options->backend_devices);
     }
 
     if (user_options->opencl_device_types)
@@ -1801,19 +1782,14 @@ void user_options_info (hashcat_ctx_t *hashcat_ctx)
       event_log_info (hashcat_ctx, "# option: --opencl-device-types=%s", user_options->opencl_device_types);
     }
 
-    if (user_options->opencl_platforms)
-    {
-      event_log_info (hashcat_ctx, "* option: --opencl-platforms=%s", user_options->opencl_platforms);
-    }
-
     if (user_options->optimized_kernel_enable == true)
     {
       event_log_info (hashcat_ctx, "# option: --optimized-kernel-enable");
     }
 
-    if (user_options->opencl_vector_width_chgd == true)
+    if (user_options->backend_vector_width_chgd == true)
     {
-      event_log_info (hashcat_ctx, "# option: --opencl-vector-width=%u", user_options->opencl_vector_width);
+      event_log_info (hashcat_ctx, "# option: --backend-vector-width=%u", user_options->backend_vector_width);
     }
 
     if (user_options->kernel_accel_chgd == true)
@@ -1875,7 +1851,7 @@ void user_options_extra_init (hashcat_ctx_t *hashcat_ctx)
   {
 
   }
-  else if (user_options->opencl_info == true)
+  else if (user_options->backend_info == true)
   {
 
   }
@@ -2587,7 +2563,7 @@ int user_options_check_files (hashcat_ctx_t *hashcat_ctx)
 
   hcfree (modulefile);
 
-  // same check but for an OpenCL kernel
+  // same check but for an backend kernel
 
   char *kernelfile = (char *) hcmalloc (HCBUFSIZ_TINY);
 
@@ -2718,9 +2694,8 @@ void user_options_logger (hashcat_ctx_t *hashcat_ctx)
   logfile_top_string (user_options->induction_dir);
   logfile_top_string (user_options->keyboard_layout_mapping);
   logfile_top_string (user_options->markov_hcstat2);
-  logfile_top_string (user_options->opencl_devices);
+  logfile_top_string (user_options->backend_devices);
   logfile_top_string (user_options->opencl_device_types);
-  logfile_top_string (user_options->opencl_platforms);
   logfile_top_string (user_options->outfile);
   logfile_top_string (user_options->outfile_check_dir);
   logfile_top_string (user_options->potfile_path);
@@ -2765,8 +2740,8 @@ void user_options_logger (hashcat_ctx_t *hashcat_ctx)
   logfile_top_uint   (user_options->markov_classic);
   logfile_top_uint   (user_options->markov_disable);
   logfile_top_uint   (user_options->markov_threshold);
-  logfile_top_uint   (user_options->opencl_info);
-  logfile_top_uint   (user_options->opencl_vector_width);
+  logfile_top_uint   (user_options->backend_info);
+  logfile_top_uint   (user_options->backend_vector_width);
   logfile_top_uint   (user_options->optimized_kernel_enable);
   logfile_top_uint   (user_options->outfile_autohex);
   logfile_top_uint   (user_options->outfile_check_timer);
diff --git a/src/wordlist.c b/src/wordlist.c
index bafc9a2dd..34b595729 100644
--- a/src/wordlist.c
+++ b/src/wordlist.c
@@ -560,7 +560,7 @@ int wl_data_init (hashcat_ctx_t *hashcat_ctx)
   if (user_options->benchmark      == true) return 0;
   if (user_options->example_hashes == true) return 0;
   if (user_options->left           == true) return 0;
-  if (user_options->opencl_info    == true) return 0;
+  if (user_options->backend_info   == true) return 0;
   if (user_options->usage          == true) return 0;
   if (user_options->version        == true) return 0;
 
diff --git a/tools/test.sh b/tools/test.sh
index 6f8b28d5b..d0546a219 100755
--- a/tools/test.sh
+++ b/tools/test.sh
@@ -2477,7 +2477,7 @@ cat << EOF
 
 OPTIONS:
 
-  -V    OpenCL vector-width (either 1, 2, 4 or 8), overrides value from device query :
+  -V    Backend vector-width (either 1, 2, 4 or 8), overrides value from device query :
         '1'         => vector-width 1
         '2'         => vector-width 2 (default)
         '4'         => vector-width 4
@@ -2507,7 +2507,7 @@ OPTIONS:
         'linux'     => Linux operating system (use .bin file extension)
         'macos'     => macOS operating system (use .app file extension)
 
-  -d    Select the OpenCL device :
+  -d    Select the Backend device :
         (int)[,int] => comma separated list of devices (default : 1)
 
   -D    Select the OpenCL device types :
@@ -2866,7 +2866,7 @@ if [ "${PACKAGE}" -eq 0 -o -z "${PACKAGE_FOLDER}" ]; then
           fi
 
           VECTOR=${CUR_WIDTH}
-          OPTS="${OPTS_OLD} --opencl-vector-width ${VECTOR}"
+          OPTS="${OPTS_OLD} --backend-vector-width ${VECTOR}"
 
           if [[ ${IS_SLOW} -eq 1 ]]; then