From ea7b74389f6c3afab941fd4e593ec3c111758921 Mon Sep 17 00:00:00 2001 From: reger-men Date: Fri, 9 Jul 2021 03:50:40 +0000 Subject: [PATCH] First draft HIP Version --- OpenCL/inc_common.cl | 20 +- OpenCL/inc_common.h | 4 +- OpenCL/inc_platform.cl | 6 +- OpenCL/inc_platform.h | 6 +- OpenCL/inc_types.h | 7 +- OpenCL/inc_vendor.h | 16 +- OpenCL/m01700_a0-optimized.cl | 2 +- OpenCL/m01700_a1-optimized.cl | 2 +- OpenCL/m01700_a3-optimized.cl | 2 +- OpenCL/m01710_a0-optimized.cl | 2 +- OpenCL/m01710_a1-optimized.cl | 2 +- OpenCL/m01710_a3-optimized.cl | 2 +- OpenCL/m01720_a0-optimized.cl | 2 +- OpenCL/m01720_a1-optimized.cl | 2 +- OpenCL/m01720_a3-optimized.cl | 2 +- OpenCL/m01730_a0-optimized.cl | 2 +- OpenCL/m01730_a1-optimized.cl | 2 +- OpenCL/m01730_a3-optimized.cl | 2 +- OpenCL/m01740_a0-optimized.cl | 2 +- OpenCL/m01740_a1-optimized.cl | 2 +- OpenCL/m01740_a3-optimized.cl | 2 +- OpenCL/m02500-pure.cl | 2 +- OpenCL/m08000_a0-optimized.cl | 4 +- OpenCL/m08000_a1-optimized.cl | 4 +- OpenCL/m08000_a3-optimized.cl | 4 +- OpenCL/m08900-pure.cl | 10 +- OpenCL/m10800_a0-optimized.cl | 2 +- OpenCL/m10800_a1-optimized.cl | 2 +- OpenCL/m10800_a3-optimized.cl | 2 +- OpenCL/m15700-pure.cl | 10 +- OpenCL/m21000_a0-optimized.cl | 2 +- OpenCL/m21000_a1-optimized.cl | 4 +- OpenCL/m21000_a3-optimized.cl | 4 +- OpenCL/m22000-pure.cl | 2 +- OpenCL/m22001-pure.cl | 2 +- OpenCL/m22200_a0-optimized.cl | 2 +- OpenCL/m22200_a1-optimized.cl | 2 +- OpenCL/m22200_a3-optimized.cl | 2 +- OpenCL/m22700-pure.cl | 10 +- include/backend.h | 60 + include/ext_hip.h | 1131 +++ include/ext_hiprtc.h | 87 + include/types.h | 96 + src/Makefile | 4 +- src/backend.c | 13605 ++++++++++++++++++++------------ src/ext_hip.c | 8 + src/ext_hiprtc.c | 27 + src/selftest.c | 5 +- src/terminal.c | 53 + src/user_options.c | 3 + 50 files changed, 10199 insertions(+), 5039 deletions(-) create mode 100644 include/ext_hip.h create mode 100644 include/ext_hiprtc.h create mode 100644 src/ext_hip.c create mode 100644 src/ext_hiprtc.c diff --git a/OpenCL/inc_common.cl b/OpenCL/inc_common.cl index ee008e083..51b83dd54 100644 --- a/OpenCL/inc_common.cl +++ b/OpenCL/inc_common.cl @@ -3,6 +3,10 @@ * License.....: MIT */ +#ifdef IS_HIP +#include <hip/hip_runtime.h> +#endif + #include "inc_vendor.h" #include "inc_types.h" #include "inc_platform.h" @@ -879,7 +883,7 @@ DECLSPEC u32x hc_rotl32 (const u32x a, const int n) { #if defined _CPU_OPENCL_EMU_H return rotl32 (a, n); - #elif defined IS_CUDA + #elif defined IS_CUDA || defined IS_HIP return rotl32 (a, n); #else #ifdef USE_ROTATE @@ -894,7 +898,7 @@ DECLSPEC u32x hc_rotr32 (const u32x a, const int n) { #if defined _CPU_OPENCL_EMU_H return rotr32 (a, n); - #elif defined IS_CUDA + #elif defined IS_CUDA || defined IS_HIP return rotr32 (a, n); #else #ifdef USE_ROTATE @@ -909,7 +913,7 @@ DECLSPEC u32 hc_rotl32_S (const u32 a, const int n) { #if defined _CPU_OPENCL_EMU_H return rotl32 (a, n); - #elif defined IS_CUDA + #elif defined IS_CUDA || defined IS_HIP return rotl32_S (a, n); #else #ifdef USE_ROTATE @@ -924,7 +928,7 @@ DECLSPEC u32 hc_rotr32_S (const u32 a, const int n) { #if defined _CPU_OPENCL_EMU_H return rotr32 (a, n); - #elif defined IS_CUDA + #elif defined IS_CUDA || defined IS_HIP return rotr32_S (a, n); #else #ifdef USE_ROTATE @@ -939,7 +943,7 @@ DECLSPEC u64x hc_rotl64 (const u64x a, const int n) { #if defined _CPU_OPENCL_EMU_H return rotl64 (a, n); - #elif defined IS_CUDA + #elif defined IS_CUDA || defined IS_HIP return rotl64 (a, n); #elif defined IS_AMD return rotl64 (a, n); @@ -956,7 +960,7 @@ DECLSPEC u64x
hc_rotr64 (const u64x a, const int n) { #if defined _CPU_OPENCL_EMU_H return rotr64 (a, n); - #elif defined IS_CUDA + #elif defined IS_CUDA || defined IS_HIP return rotr64 (a, n); #elif defined IS_AMD return rotr64 (a, n); @@ -973,7 +977,7 @@ DECLSPEC u64 hc_rotl64_S (const u64 a, const int n) { #if defined _CPU_OPENCL_EMU_H return rotl64 (a, n); - #elif defined IS_CUDA + #elif defined IS_CUDA || defined IS_HIP return rotl64_S (a, n); #elif defined IS_AMD return rotl64_S (a, n); @@ -990,7 +994,7 @@ DECLSPEC u64 hc_rotr64_S (const u64 a, const int n) { #if defined _CPU_OPENCL_EMU_H return rotr64 (a, n); - #elif defined IS_CUDA + #elif defined IS_CUDA || defined IS_HIP return rotr64_S (a, n); #elif defined IS_AMD return rotr64_S (a, n); diff --git a/OpenCL/inc_common.h b/OpenCL/inc_common.h index 6e39b2ca3..fb65e2095 100644 --- a/OpenCL/inc_common.h +++ b/OpenCL/inc_common.h @@ -26,7 +26,7 @@ * - P19: Type of the esalt_bufs structure with additional data, or void. */ -#ifdef IS_CUDA +#if defined IS_CUDA || defined IS_HIP #define KERN_ATTR(p2,p4,p5,p6,p19) \ MAYBE_UNUSED GLOBAL_AS pw_t *pws, \ MAYBE_UNUSED p2 const kernel_rule_t *g_rules_buf, \ @@ -109,7 +109,7 @@ * do not use rules or tmps, etc. */ -#ifdef IS_CUDA +#if defined IS_CUDA || defined IS_HIP #define KERN_ATTR_BASIC() KERN_ATTR (GLOBAL_AS, GLOBAL_AS const bf_t *g_bfs_buf, void, void, void) #define KERN_ATTR_BITSLICE() KERN_ATTR (GLOBAL_AS, GLOBAL_AS const bs_word_t *g_words_buf_s, void, void, void) #define KERN_ATTR_ESALT(e) KERN_ATTR (GLOBAL_AS, GLOBAL_AS const bf_t *g_bfs_buf, void, void, e) diff --git a/OpenCL/inc_platform.cl b/OpenCL/inc_platform.cl index 768de504a..9265143c6 100644 --- a/OpenCL/inc_platform.cl +++ b/OpenCL/inc_platform.cl @@ -2,6 +2,9 @@ * Author......: See docs/credits.txt * License.....: MIT */ +#ifdef IS_HIP +#include <hip/hip_runtime.h> +#endif #include "inc_vendor.h" #include "inc_types.h" @@ -60,7 +63,7 @@ DECLSPEC u64 rotr64_S (const u64 a, const int n) #endif -#ifdef IS_CUDA +#if defined IS_CUDA || defined IS_HIP #if ATTACK_EXEC == 11 @@ -85,6 +88,7 @@ CONSTANT_VK u32 generic_constant[8192]; // 32k #endif + DECLSPEC u32 atomic_dec (u32 *p) { return atomicSub (p, 1); diff --git a/OpenCL/inc_platform.h b/OpenCL/inc_platform.h index fdcf50fc1..422b29f4f 100644 --- a/OpenCL/inc_platform.h +++ b/OpenCL/inc_platform.h @@ -13,7 +13,7 @@ DECLSPEC u64 rotl64_S (const u64 a, const int n); DECLSPEC u64 rotr64_S (const u64 a, const int n); #endif -#ifdef IS_CUDA +#if defined IS_CUDA || defined IS_HIP DECLSPEC u32 atomic_dec (u32 *p); DECLSPEC u32 atomic_inc (u32 *p); DECLSPEC u32 atomic_or (u32 *p, u32 val); @@ -30,7 +30,9 @@ DECLSPEC u64x rotr64 (const u64x a, const int n); DECLSPEC u64 rotl64_S (const u64 a, const int n); DECLSPEC u64 rotr64_S (const u64 a, const int n); -//#define rotate(a,n) (((a) << (n)) | ((a) >> (32 - (n)))) +#ifdef IS_HIP +#define rotate(a,n) (((a) << (n)) | ((a) >> (32 - (n)))) +#endif #define bitselect(a,b,c) ((a) ^ ((c) & ((b) ^ (a)))) #endif diff --git a/OpenCL/inc_types.h b/OpenCL/inc_types.h index 9f0664263..8b3d1e05c 100644 --- a/OpenCL/inc_types.h +++ b/OpenCL/inc_types.h @@ -6,14 +6,15 @@ #ifndef _INC_TYPES_H #define _INC_TYPES_H -#ifdef IS_CUDA +#if defined IS_CUDA || defined IS_HIP //https://docs.nvidia.com/cuda/nvrtc/index.html#integer-size typedef unsigned char uchar; typedef unsigned short ushort; typedef unsigned int uint; -typedef unsigned long long ulong; +typedef unsigned long long xulong; #endif + #ifdef KERNEL_STATIC typedef uchar u8; typedef ushort u16; typedef uint u32; typedef ulong u64; @@ -58,7 +59,7 @@ typedef u64 u64x;
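
    For reference on the inc_platform.h change above: HIP has no OpenCL rotate() or bitselect()
    built-ins, so the patch maps them to plain shift/or and xor/and expressions. A minimal
    host-side sketch of the rotate() semantics follows (hypothetical test harness, not part of
    the patch; the macro is only well-defined for shift counts 1..31, since a count of 0 would
    produce an undefined 32-bit shift in C):

    #include <assert.h>
    #include <stdint.h>

    /* same expansion as the IS_HIP fallback in inc_platform.h */
    #define rotate(a,n) (((a) << (n)) | ((a) >> (32 - (n))))

    int main (void)
    {
      const uint32_t a = 0x80000001u;

      assert (rotate (a, 1) == 0x00000003u); // high bit wraps around into bit 0

      return 0;
    }
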
#define make_u64x (u64) #else -#ifdef IS_CUDA +#if defined IS_CUDA || defined IS_HIP #if VECT_SIZE == 2 diff --git a/OpenCL/inc_vendor.h b/OpenCL/inc_vendor.h index 6ca2c5707..de2d23866 100644 --- a/OpenCL/inc_vendor.h +++ b/OpenCL/inc_vendor.h @@ -10,6 +10,8 @@ #define IS_NATIVE #elif defined __CUDACC__ #define IS_CUDA +#elif defined __HIPCC__ +#define IS_HIP #else #define IS_OPENCL #endif @@ -21,7 +23,7 @@ #define LOCAL_VK #define LOCAL_AS #define KERNEL_FQ -#elif defined IS_CUDA +#elif (defined IS_CUDA) || (defined IS_HIP) #define CONSTANT_VK __constant__ #define CONSTANT_AS #define GLOBAL_AS @@ -80,7 +82,9 @@ #define IS_MESA #define IS_GENERIC #elif VENDOR_ID == (1 << 5) -#define IS_NV +//#define IS_NV //TODO: FIX ME HIP +#define IS_POCL +#define IS_GENERIC #elif VENDOR_ID == (1 << 6) #define IS_POCL #define IS_GENERIC @@ -116,10 +120,14 @@ */ #if defined IS_AMD && defined IS_GPU -#define DECLSPEC inline static +#define DECLSPEC inline static __device__ +#else +#ifdef IS_HIP +#define DECLSPEC __device__ #else #define DECLSPEC #endif +#endif /** * AMD specific @@ -137,7 +145,7 @@ // Whitelist some OpenCL specific functions // This could create more stable kernels on systems with bad OpenCL drivers -#ifdef IS_CUDA +#if defined IS_CUDA || defined IS_HIP #define USE_BITSELECT #define USE_ROTATE #endif diff --git a/OpenCL/m01700_a0-optimized.cl b/OpenCL/m01700_a0-optimized.cl index 0d6ddb337..fc46cc9a9 100644 --- a/OpenCL/m01700_a0-optimized.cl +++ b/OpenCL/m01700_a0-optimized.cl @@ -86,7 +86,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 ROUND_STEP (0); - #ifdef IS_CUDA +#if defined IS_CUDA || defined IS_HIP ROUND_EXPAND (); ROUND_STEP (16); ROUND_EXPAND (); ROUND_STEP (32); ROUND_EXPAND (); ROUND_STEP (48); diff --git a/OpenCL/m01700_a1-optimized.cl b/OpenCL/m01700_a1-optimized.cl index abee4dfcb..2a0b4f6e8 100644 --- a/OpenCL/m01700_a1-optimized.cl +++ b/OpenCL/m01700_a1-optimized.cl @@ -84,7 +84,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 ROUND_STEP (0); - #ifdef IS_CUDA + #if defined IS_CUDA || defined IS_HIP ROUND_EXPAND (); ROUND_STEP (16); ROUND_EXPAND (); ROUND_STEP (32); ROUND_EXPAND (); ROUND_STEP (48); diff --git a/OpenCL/m01700_a3-optimized.cl b/OpenCL/m01700_a3-optimized.cl index c4d8ee016..fc5ec06f9 100644 --- a/OpenCL/m01700_a3-optimized.cl +++ b/OpenCL/m01700_a3-optimized.cl @@ -84,7 +84,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 ROUND_STEP (0); - #ifdef IS_CUDA + #if defined IS_CUDA || defined IS_HIP ROUND_EXPAND (); ROUND_STEP (16); ROUND_EXPAND (); ROUND_STEP (32); ROUND_EXPAND (); ROUND_STEP (48); diff --git a/OpenCL/m01710_a0-optimized.cl b/OpenCL/m01710_a0-optimized.cl index a5a53e831..8a14e3104 100644 --- a/OpenCL/m01710_a0-optimized.cl +++ b/OpenCL/m01710_a0-optimized.cl @@ -86,7 +86,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 ROUND_STEP (0); - #ifdef IS_CUDA + #if defined IS_CUDA || defined IS_HIP ROUND_EXPAND (); ROUND_STEP (16); ROUND_EXPAND (); ROUND_STEP (32); ROUND_EXPAND (); ROUND_STEP (48); diff --git a/OpenCL/m01710_a1-optimized.cl b/OpenCL/m01710_a1-optimized.cl index 931142cae..ac19e3fde 100644 --- a/OpenCL/m01710_a1-optimized.cl +++ b/OpenCL/m01710_a1-optimized.cl @@ -84,7 +84,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 ROUND_STEP (0); - #ifdef IS_CUDA + #if defined IS_CUDA || defined IS_HIP ROUND_EXPAND (); ROUND_STEP (16); ROUND_EXPAND (); 
ROUND_STEP (32); ROUND_EXPAND (); ROUND_STEP (48); diff --git a/OpenCL/m01710_a3-optimized.cl b/OpenCL/m01710_a3-optimized.cl index a82f949ac..83d4afc87 100644 --- a/OpenCL/m01710_a3-optimized.cl +++ b/OpenCL/m01710_a3-optimized.cl @@ -84,7 +84,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 ROUND_STEP (0); - #ifdef IS_CUDA + #if defined IS_CUDA || defined IS_HIP ROUND_EXPAND (); ROUND_STEP (16); ROUND_EXPAND (); ROUND_STEP (32); ROUND_EXPAND (); ROUND_STEP (48); diff --git a/OpenCL/m01720_a0-optimized.cl b/OpenCL/m01720_a0-optimized.cl index c331365f1..d40e66975 100644 --- a/OpenCL/m01720_a0-optimized.cl +++ b/OpenCL/m01720_a0-optimized.cl @@ -86,7 +86,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 ROUND_STEP (0); - #ifdef IS_CUDA + #if defined IS_CUDA || defined IS_HIP ROUND_EXPAND (); ROUND_STEP (16); ROUND_EXPAND (); ROUND_STEP (32); ROUND_EXPAND (); ROUND_STEP (48); diff --git a/OpenCL/m01720_a1-optimized.cl b/OpenCL/m01720_a1-optimized.cl index aa93dc2c9..9a9c319f2 100644 --- a/OpenCL/m01720_a1-optimized.cl +++ b/OpenCL/m01720_a1-optimized.cl @@ -84,7 +84,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 ROUND_STEP (0); - #ifdef IS_CUDA + #if defined IS_CUDA || defined IS_HIP ROUND_EXPAND (); ROUND_STEP (16); ROUND_EXPAND (); ROUND_STEP (32); ROUND_EXPAND (); ROUND_STEP (48); diff --git a/OpenCL/m01720_a3-optimized.cl b/OpenCL/m01720_a3-optimized.cl index 891634dd4..a4cbfb4eb 100644 --- a/OpenCL/m01720_a3-optimized.cl +++ b/OpenCL/m01720_a3-optimized.cl @@ -84,7 +84,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 ROUND_STEP (0); - #ifdef IS_CUDA + #if defined IS_CUDA || defined IS_HIP ROUND_EXPAND (); ROUND_STEP (16); ROUND_EXPAND (); ROUND_STEP (32); ROUND_EXPAND (); ROUND_STEP (48); diff --git a/OpenCL/m01730_a0-optimized.cl b/OpenCL/m01730_a0-optimized.cl index f5da15e7f..eef27b4f2 100644 --- a/OpenCL/m01730_a0-optimized.cl +++ b/OpenCL/m01730_a0-optimized.cl @@ -86,7 +86,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 ROUND_STEP (0); - #ifdef IS_CUDA + #if defined IS_CUDA || defined IS_HIP ROUND_EXPAND (); ROUND_STEP (16); ROUND_EXPAND (); ROUND_STEP (32); ROUND_EXPAND (); ROUND_STEP (48); diff --git a/OpenCL/m01730_a1-optimized.cl b/OpenCL/m01730_a1-optimized.cl index f3cd8d89a..e86df4229 100644 --- a/OpenCL/m01730_a1-optimized.cl +++ b/OpenCL/m01730_a1-optimized.cl @@ -84,7 +84,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 ROUND_STEP (0); - #ifdef IS_CUDA + #if defined IS_CUDA || defined IS_HIP ROUND_EXPAND (); ROUND_STEP (16); ROUND_EXPAND (); ROUND_STEP (32); ROUND_EXPAND (); ROUND_STEP (48); diff --git a/OpenCL/m01730_a3-optimized.cl b/OpenCL/m01730_a3-optimized.cl index e00e5f4ae..c83e76a64 100644 --- a/OpenCL/m01730_a3-optimized.cl +++ b/OpenCL/m01730_a3-optimized.cl @@ -84,7 +84,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 ROUND_STEP (0); - #ifdef IS_CUDA + #if defined IS_CUDA || defined IS_HIP ROUND_EXPAND (); ROUND_STEP (16); ROUND_EXPAND (); ROUND_STEP (32); ROUND_EXPAND (); ROUND_STEP (48); diff --git a/OpenCL/m01740_a0-optimized.cl b/OpenCL/m01740_a0-optimized.cl index ee38662e8..f877c2075 100644 --- a/OpenCL/m01740_a0-optimized.cl +++ b/OpenCL/m01740_a0-optimized.cl @@ -86,7 +86,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 ROUND_STEP (0); - #ifdef IS_CUDA + #if 
defined IS_CUDA || defined IS_HIP ROUND_EXPAND (); ROUND_STEP (16); ROUND_EXPAND (); ROUND_STEP (32); ROUND_EXPAND (); ROUND_STEP (48); diff --git a/OpenCL/m01740_a1-optimized.cl b/OpenCL/m01740_a1-optimized.cl index 0ae6984e0..3d400425b 100644 --- a/OpenCL/m01740_a1-optimized.cl +++ b/OpenCL/m01740_a1-optimized.cl @@ -84,7 +84,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 ROUND_STEP (0); - #ifdef IS_CUDA + #if defined IS_CUDA || defined IS_HIP ROUND_EXPAND (); ROUND_STEP (16); ROUND_EXPAND (); ROUND_STEP (32); ROUND_EXPAND (); ROUND_STEP (48); diff --git a/OpenCL/m01740_a3-optimized.cl b/OpenCL/m01740_a3-optimized.cl index 4b7b1d3df..ac56cb697 100644 --- a/OpenCL/m01740_a3-optimized.cl +++ b/OpenCL/m01740_a3-optimized.cl @@ -84,7 +84,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 ROUND_STEP (0); - #ifdef IS_CUDA + #if defined IS_CUDA || defined IS_HIP ROUND_EXPAND (); ROUND_STEP (16); ROUND_EXPAND (); ROUND_STEP (32); ROUND_EXPAND (); ROUND_STEP (48); diff --git a/OpenCL/m02500-pure.cl b/OpenCL/m02500-pure.cl index 95f97fb81..4b2459f78 100644 --- a/OpenCL/m02500-pure.cl +++ b/OpenCL/m02500-pure.cl @@ -681,7 +681,7 @@ KERNEL_FQ void m02500_aux3 (KERN_ATTR_TMPS_ESALT (wpa_pbkdf2_tmp_t, wpa_eapol_t) s_te4[i] = te4[i]; } - #ifdef IS_CUDA + #if defined IS_CUDA || defined IS_HIP __syncthreads(); #else SYNC_THREADS (); diff --git a/OpenCL/m08000_a0-optimized.cl b/OpenCL/m08000_a0-optimized.cl index 310bebbeb..dabd57d3d 100644 --- a/OpenCL/m08000_a0-optimized.cl +++ b/OpenCL/m08000_a0-optimized.cl @@ -86,7 +86,7 @@ DECLSPEC void sha256_transform_m (u32x *digest, const u32x *w) ROUND_STEP (0); - #ifdef IS_CUDA + #if defined IS_CUDA || defined IS_HIP ROUND_EXPAND (); ROUND_STEP (16); ROUND_EXPAND (); ROUND_STEP (32); ROUND_EXPAND (); ROUND_STEP (48); @@ -143,7 +143,7 @@ DECLSPEC void sha256_transform_z (u32x *digest) ROUND_STEP_Z (0); - #ifdef IS_CUDA + #if defined IS_CUDA || defined IS_HIP ROUND_STEP_Z (16); ROUND_STEP_Z (32); ROUND_STEP_Z (48); diff --git a/OpenCL/m08000_a1-optimized.cl b/OpenCL/m08000_a1-optimized.cl index 89ea42a57..b7a42e88e 100644 --- a/OpenCL/m08000_a1-optimized.cl +++ b/OpenCL/m08000_a1-optimized.cl @@ -84,7 +84,7 @@ DECLSPEC void sha256_transform_m (u32x *digest, const u32x *w) ROUND_STEP (0); - #ifdef IS_CUDA + #if defined IS_CUDA || defined IS_HIP ROUND_EXPAND (); ROUND_STEP (16); ROUND_EXPAND (); ROUND_STEP (32); ROUND_EXPAND (); ROUND_STEP (48); @@ -141,7 +141,7 @@ DECLSPEC void sha256_transform_z (u32x *digest) ROUND_STEP_Z (0); - #ifdef IS_CUDA + #if defined IS_CUDA || defined IS_HIP ROUND_STEP_Z (16); ROUND_STEP_Z (32); ROUND_STEP_Z (48); diff --git a/OpenCL/m08000_a3-optimized.cl b/OpenCL/m08000_a3-optimized.cl index fa76a3b72..77bb3225d 100644 --- a/OpenCL/m08000_a3-optimized.cl +++ b/OpenCL/m08000_a3-optimized.cl @@ -84,7 +84,7 @@ DECLSPEC void sha256_transform_m (u32x *digest, const u32x *w) ROUND_STEP (0); - #ifdef IS_CUDA + #if defined IS_CUDA || defined IS_HIP ROUND_EXPAND (); ROUND_STEP (16); ROUND_EXPAND (); ROUND_STEP (32); ROUND_EXPAND (); ROUND_STEP (48); @@ -141,7 +141,7 @@ DECLSPEC void sha256_transform_z (u32x *digest) ROUND_STEP_Z (0); - #ifdef IS_CUDA + #if defined IS_CUDA || defined IS_HIP ROUND_STEP_Z (16); ROUND_STEP_Z (32); ROUND_STEP_Z (48); diff --git a/OpenCL/m08900-pure.cl b/OpenCL/m08900-pure.cl index 706f7f2e4..f5e607534 100644 --- a/OpenCL/m08900-pure.cl +++ b/OpenCL/m08900-pure.cl @@ -24,7 +24,7 @@ typedef struct } scrypt_tmp_t; -#ifdef IS_CUDA +#if defined 
IS_CUDA || defined IS_HIP inline __device__ uint4 operator & (const uint4 a, const u32 b) { return make_uint4 ((a.x & b ), (a.y & b ), (a.z & b ), (a.w & b )); } inline __device__ uint4 operator << (const uint4 a, const u32 b) { return make_uint4 ((a.x << b ), (a.y << b ), (a.z << b ), (a.w << b )); } @@ -57,7 +57,7 @@ DECLSPEC uint4 hc_swap32_4 (uint4 v) #define ADD_ROTATE_XOR(r,i1,i2,s) (r) ^= rotate ((i1) + (i2), (s)); -#ifdef IS_CUDA +#if defined IS_CUDA || defined IS_HIP #define SALSA20_2R() \ { \ @@ -205,7 +205,7 @@ DECLSPEC void scrypt_smix (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS ui #endif for (u32 i = 0; i < STATE_CNT4; i += 4) { - #ifdef IS_CUDA + #if defined IS_CUDA || defined IS_HIP T[0] = make_uint4 (X[i + 0].x, X[i + 1].y, X[i + 2].z, X[i + 3].w); T[1] = make_uint4 (X[i + 1].x, X[i + 2].y, X[i + 3].z, X[i + 0].w); T[2] = make_uint4 (X[i + 2].x, X[i + 3].y, X[i + 0].z, X[i + 1].w); @@ -252,7 +252,7 @@ DECLSPEC void scrypt_smix (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS ui #endif for (u32 i = 0; i < STATE_CNT4; i += 4) { - #ifdef IS_CUDA + #if defined IS_CUDA || defined IS_HIP T[0] = make_uint4 (X[i + 0].x, X[i + 3].y, X[i + 2].z, X[i + 1].w); T[1] = make_uint4 (X[i + 1].x, X[i + 0].y, X[i + 3].z, X[i + 2].w); T[2] = make_uint4 (X[i + 2].x, X[i + 1].y, X[i + 0].z, X[i + 3].w); @@ -328,7 +328,7 @@ KERNEL_FQ void m08900_init (KERN_ATTR_TMPS (scrypt_tmp_t)) digest[6] = sha256_hmac_ctx2.opad.h[6]; digest[7] = sha256_hmac_ctx2.opad.h[7]; - #ifdef IS_CUDA + #if defined IS_CUDA || defined IS_HIP const uint4 tmp0 = make_uint4 (digest[0], digest[1], digest[2], digest[3]); const uint4 tmp1 = make_uint4 (digest[4], digest[5], digest[6], digest[7]); #else diff --git a/OpenCL/m10800_a0-optimized.cl b/OpenCL/m10800_a0-optimized.cl index 396b389a6..4f350a2c7 100644 --- a/OpenCL/m10800_a0-optimized.cl +++ b/OpenCL/m10800_a0-optimized.cl @@ -86,7 +86,7 @@ DECLSPEC void sha384_transform_intern (const u32x *w0, const u32x *w1, const u32 ROUND_STEP (0); - #ifdef IS_CUDA + #if defined IS_CUDA || defined IS_HIP ROUND_EXPAND (); ROUND_STEP (16); ROUND_EXPAND (); ROUND_STEP (32); ROUND_EXPAND (); ROUND_STEP (48); diff --git a/OpenCL/m10800_a1-optimized.cl b/OpenCL/m10800_a1-optimized.cl index 11aa95dbd..8e985263a 100644 --- a/OpenCL/m10800_a1-optimized.cl +++ b/OpenCL/m10800_a1-optimized.cl @@ -84,7 +84,7 @@ DECLSPEC void sha384_transform_intern (const u32x *w0, const u32x *w1, const u32 ROUND_STEP (0); - #ifdef IS_CUDA + #if defined IS_CUDA || defined IS_HIP ROUND_EXPAND (); ROUND_STEP (16); ROUND_EXPAND (); ROUND_STEP (32); ROUND_EXPAND (); ROUND_STEP (48); diff --git a/OpenCL/m10800_a3-optimized.cl b/OpenCL/m10800_a3-optimized.cl index cef22d51f..a548aad58 100644 --- a/OpenCL/m10800_a3-optimized.cl +++ b/OpenCL/m10800_a3-optimized.cl @@ -84,7 +84,7 @@ DECLSPEC void sha384_transform_intern (const u32x *w0, const u32x *w1, const u32 ROUND_STEP (0); - #ifdef IS_CUDA + #if defined IS_CUDA || defined IS_HIP ROUND_EXPAND (); ROUND_STEP (16); ROUND_EXPAND (); ROUND_STEP (32); ROUND_EXPAND (); ROUND_STEP (48); diff --git a/OpenCL/m15700-pure.cl b/OpenCL/m15700-pure.cl index 3c54850a4..13c8724c7 100644 --- a/OpenCL/m15700-pure.cl +++ b/OpenCL/m15700-pure.cl @@ -24,7 +24,7 @@ typedef struct } scrypt_tmp_t; -#ifdef IS_CUDA +#if defined IS_CUDA || defined IS_HIP inline __device__ uint4 operator & (const uint4 a, const u32 b) { return make_uint4 ((a.x & b ), (a.y & b ), (a.z & b ), (a.w & b )); } inline __device__ uint4 operator << (const uint4 a, const u32 b) { return make_uint4 ((a.x << b 
), (a.y << b ), (a.z << b ), (a.w << b )); } @@ -64,7 +64,7 @@ DECLSPEC uint4 hc_swap32_4 (uint4 v) #define ADD_ROTATE_XOR(r,i1,i2,s) (r) ^= rotate ((i1) + (i2), (s)); -#ifdef IS_CUDA +#if defined IS_CUDA || defined IS_HIP #define SALSA20_2R() \ { \ @@ -212,7 +212,7 @@ DECLSPEC void scrypt_smix (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS ui #endif for (u32 i = 0; i < STATE_CNT4; i += 4) { - #ifdef IS_CUDA + #if defined IS_CUDA || defined IS_HIP T[0] = make_uint4 (X[i + 0].x, X[i + 1].y, X[i + 2].z, X[i + 3].w); T[1] = make_uint4 (X[i + 1].x, X[i + 2].y, X[i + 3].z, X[i + 0].w); T[2] = make_uint4 (X[i + 2].x, X[i + 3].y, X[i + 0].z, X[i + 1].w); @@ -259,7 +259,7 @@ DECLSPEC void scrypt_smix (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS ui #endif for (u32 i = 0; i < STATE_CNT4; i += 4) { - #ifdef IS_CUDA + #if defined IS_CUDA || defined IS_HIP T[0] = make_uint4 (X[i + 0].x, X[i + 3].y, X[i + 2].z, X[i + 1].w); T[1] = make_uint4 (X[i + 1].x, X[i + 0].y, X[i + 3].z, X[i + 2].w); T[2] = make_uint4 (X[i + 2].x, X[i + 1].y, X[i + 0].z, X[i + 3].w); @@ -464,7 +464,7 @@ KERNEL_FQ void m15700_init (KERN_ATTR_TMPS_ESALT (scrypt_tmp_t, ethereum_scrypt_ digest[6] = sha256_hmac_ctx2.opad.h[6]; digest[7] = sha256_hmac_ctx2.opad.h[7]; - #ifdef IS_CUDA + #if defined IS_CUDA || defined IS_HIP const uint4 tmp0 = make_uint4 (digest[0], digest[1], digest[2], digest[3]); const uint4 tmp1 = make_uint4 (digest[4], digest[5], digest[6], digest[7]); #else diff --git a/OpenCL/m21000_a0-optimized.cl b/OpenCL/m21000_a0-optimized.cl index c7cfa5b8d..36ad9972f 100644 --- a/OpenCL/m21000_a0-optimized.cl +++ b/OpenCL/m21000_a0-optimized.cl @@ -89,7 +89,7 @@ DECLSPEC void sha512_transform_opt (const u32x *w0, const u32x *w1, const u32x * ROUND_STEP (0); - #ifdef IS_CUDA + #if defined IS_CUDA || defined IS_HIP ROUND_EXPAND (); ROUND_STEP (16); ROUND_EXPAND (); ROUND_STEP (32); ROUND_EXPAND (); ROUND_STEP (48); diff --git a/OpenCL/m21000_a1-optimized.cl b/OpenCL/m21000_a1-optimized.cl index 7ff4577f7..f2beb1629 100644 --- a/OpenCL/m21000_a1-optimized.cl +++ b/OpenCL/m21000_a1-optimized.cl @@ -84,7 +84,7 @@ DECLSPEC void sha512_transform_full (const u32x *w0, const u32x *w1, const u32x ROUND_STEP (0); - #ifdef IS_CUDA + #if defined IS_CUDA || defined IS_HIP ROUND_EXPAND (); ROUND_STEP (16); ROUND_EXPAND (); ROUND_STEP (32); ROUND_EXPAND (); ROUND_STEP (48); @@ -182,7 +182,7 @@ DECLSPEC void sha512_transform_opt (const u32x *w0, const u32x *w1, const u32x * ROUND_STEP (0); - #ifdef IS_CUDA + #if defined IS_CUDA || defined IS_HIP ROUND_EXPAND (); ROUND_STEP (16); ROUND_EXPAND (); ROUND_STEP (32); ROUND_EXPAND (); ROUND_STEP (48); diff --git a/OpenCL/m21000_a3-optimized.cl b/OpenCL/m21000_a3-optimized.cl index 768ba9e02..757a87c8a 100644 --- a/OpenCL/m21000_a3-optimized.cl +++ b/OpenCL/m21000_a3-optimized.cl @@ -84,7 +84,7 @@ DECLSPEC void sha512_transform_full (const u32x *w0, const u32x *w1, const u32x ROUND_STEP (0); - #ifdef IS_CUDA + #if defined IS_CUDA || defined IS_HIP ROUND_EXPAND (); ROUND_STEP (16); ROUND_EXPAND (); ROUND_STEP (32); ROUND_EXPAND (); ROUND_STEP (48); @@ -182,7 +182,7 @@ DECLSPEC void sha512_transform_opt (const u32x *w0, const u32x *w1, const u32x * ROUND_STEP (0); - #ifdef IS_CUDA + #if defined IS_CUDA || defined IS_HIP ROUND_EXPAND (); ROUND_STEP (16); ROUND_EXPAND (); ROUND_STEP (32); ROUND_EXPAND (); ROUND_STEP (48); diff --git a/OpenCL/m22000-pure.cl b/OpenCL/m22000-pure.cl index 954f62ce3..816a52458 100644 --- a/OpenCL/m22000-pure.cl +++ b/OpenCL/m22000-pure.cl @@ -703,7 +703,7 @@ 
KERNEL_FQ void m22000_aux3 (KERN_ATTR_TMPS_ESALT (wpa_pbkdf2_tmp_t, wpa_t)) s_te4[i] = te4[i]; } - #ifdef IS_CUDA + #if defined IS_CUDA || defined IS_HIP __syncthreads(); #else SYNC_THREADS (); diff --git a/OpenCL/m22001-pure.cl b/OpenCL/m22001-pure.cl index e3a9d23f9..20c962313 100644 --- a/OpenCL/m22001-pure.cl +++ b/OpenCL/m22001-pure.cl @@ -610,7 +610,7 @@ KERNEL_FQ void m22001_aux3 (KERN_ATTR_TMPS_ESALT (wpa_pmk_tmp_t, wpa_t)) s_te4[i] = te4[i]; } - #ifdef IS_CUDA + #if defined IS_CUDA || defined IS_HIP __syncthreads(); #else SYNC_THREADS (); diff --git a/OpenCL/m22200_a0-optimized.cl b/OpenCL/m22200_a0-optimized.cl index 8c0e51b03..528222fe1 100644 --- a/OpenCL/m22200_a0-optimized.cl +++ b/OpenCL/m22200_a0-optimized.cl @@ -86,7 +86,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 ROUND_STEP (0); - #ifdef IS_CUDA + #if defined IS_CUDA || defined IS_HIP ROUND_EXPAND (); ROUND_STEP (16); ROUND_EXPAND (); ROUND_STEP (32); ROUND_EXPAND (); ROUND_STEP (48); diff --git a/OpenCL/m22200_a1-optimized.cl b/OpenCL/m22200_a1-optimized.cl index 39ca46c20..3fa91b5a8 100644 --- a/OpenCL/m22200_a1-optimized.cl +++ b/OpenCL/m22200_a1-optimized.cl @@ -84,7 +84,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 ROUND_STEP (0); - #ifdef IS_CUDA + #if defined IS_CUDA || defined IS_HIP ROUND_EXPAND (); ROUND_STEP (16); ROUND_EXPAND (); ROUND_STEP (32); ROUND_EXPAND (); ROUND_STEP (48); diff --git a/OpenCL/m22200_a3-optimized.cl b/OpenCL/m22200_a3-optimized.cl index c04f8c8c4..f620cca46 100644 --- a/OpenCL/m22200_a3-optimized.cl +++ b/OpenCL/m22200_a3-optimized.cl @@ -84,7 +84,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 ROUND_STEP (0); - #ifdef IS_CUDA + #if defined IS_CUDA || defined IS_HIP ROUND_EXPAND (); ROUND_STEP (16); ROUND_EXPAND (); ROUND_STEP (32); ROUND_EXPAND (); ROUND_STEP (48); diff --git a/OpenCL/m22700-pure.cl b/OpenCL/m22700-pure.cl index f5ec90de5..0f5b84a4c 100644 --- a/OpenCL/m22700-pure.cl +++ b/OpenCL/m22700-pure.cl @@ -72,7 +72,7 @@ DECLSPEC int is_valid_bitcoinj (const u32 *w) return 1; } -#ifdef IS_CUDA +#if defined IS_CUDA || defined IS_HIP inline __device__ uint4 operator & (const uint4 a, const u32 b) { return make_uint4 ((a.x & b ), (a.y & b ), (a.z & b ), (a.w & b )); } inline __device__ uint4 operator << (const uint4 a, const u32 b) { return make_uint4 ((a.x << b ), (a.y << b ), (a.z << b ), (a.w << b )); } @@ -105,7 +105,7 @@ DECLSPEC uint4 hc_swap32_4 (uint4 v) #define ADD_ROTATE_XOR(r,i1,i2,s) (r) ^= rotate ((i1) + (i2), (s)); -#ifdef IS_CUDA +#if defined IS_CUDA || defined IS_HIP #define SALSA20_2R() \ { \ @@ -253,7 +253,7 @@ DECLSPEC void scrypt_smix (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS ui #endif for (u32 i = 0; i < STATE_CNT4; i += 4) { - #ifdef IS_CUDA + #if defined IS_CUDA || defined IS_HIP T[0] = make_uint4 (X[i + 0].x, X[i + 1].y, X[i + 2].z, X[i + 3].w); T[1] = make_uint4 (X[i + 1].x, X[i + 2].y, X[i + 3].z, X[i + 0].w); T[2] = make_uint4 (X[i + 2].x, X[i + 3].y, X[i + 0].z, X[i + 1].w); @@ -300,7 +300,7 @@ DECLSPEC void scrypt_smix (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS ui #endif for (u32 i = 0; i < STATE_CNT4; i += 4) { - #ifdef IS_CUDA + #if defined IS_CUDA || defined IS_HIP T[0] = make_uint4 (X[i + 0].x, X[i + 3].y, X[i + 2].z, X[i + 1].w); T[1] = make_uint4 (X[i + 1].x, X[i + 0].y, X[i + 3].z, X[i + 2].w); T[2] = make_uint4 (X[i + 2].x, X[i + 1].y, X[i + 0].z, X[i + 3].w); @@ -416,7 +416,7 @@ KERNEL_FQ void m22700_init 
(KERN_ATTR_TMPS (scrypt_tmp_t)) digest[6] = sha256_hmac_ctx2.opad.h[6]; digest[7] = sha256_hmac_ctx2.opad.h[7]; - #ifdef IS_CUDA + #if defined IS_CUDA || defined IS_HIP const uint4 tmp0 = make_uint4 (digest[0], digest[1], digest[2], digest[3]); const uint4 tmp1 = make_uint4 (digest[4], digest[5], digest[6], digest[7]); #else diff --git a/include/backend.h b/include/backend.h index 920f015cf..5a3aa990c 100644 --- a/include/backend.h +++ b/include/backend.h @@ -28,6 +28,12 @@ void cuda_close (hashcat_ctx_t *hashcat_ctx); int nvrtc_init (hashcat_ctx_t *hashcat_ctx); void nvrtc_close (hashcat_ctx_t *hashcat_ctx); +int hip_init (hashcat_ctx_t *hashcat_ctx); +void hip_close (hashcat_ctx_t *hashcat_ctx); + +int hiprtc_init (hashcat_ctx_t *hashcat_ctx); +void hiprtc_close (hashcat_ctx_t *hashcat_ctx); + int ocl_init (hashcat_ctx_t *hashcat_ctx); void ocl_close (hashcat_ctx_t *hashcat_ctx); @@ -79,6 +85,56 @@ int hc_cuLinkAddData (hashcat_ctx_t *hashcat_ctx, CUlinkState state, int hc_cuLinkDestroy (hashcat_ctx_t *hashcat_ctx, CUlinkState state); int hc_cuLinkComplete (hashcat_ctx_t *hashcat_ctx, CUlinkState state, void **cubinOut, size_t *sizeOut); + +int hc_hiprtcCreateProgram (hashcat_ctx_t *hashcat_ctx, hiprtcProgram *prog, const char *src, const char *name, int numHeaders, const char * const *headers, const char * const *includeNames); +int hc_hiprtcDestroyProgram (hashcat_ctx_t *hashcat_ctx, hiprtcProgram *prog); +int hc_hiprtcCompileProgram (hashcat_ctx_t *hashcat_ctx, hiprtcProgram prog, int numOptions, const char * const *options); +int hc_hiprtcGetProgramLogSize (hashcat_ctx_t *hashcat_ctx, hiprtcProgram prog, size_t *logSizeRet); +int hc_hiprtcGetProgramLog (hashcat_ctx_t *hashcat_ctx, hiprtcProgram prog, char *log); +int hc_hiprtcGetCodeSize (hashcat_ctx_t *hashcat_ctx, hiprtcProgram prog, size_t *ptxSizeRet); +int hc_hiprtcGetCode (hashcat_ctx_t *hashcat_ctx, hiprtcProgram prog, char *ptx); +int hc_hiprtcVersion (hashcat_ctx_t *hashcat_ctx, int *major, int *minor); + +int hc_hipCtxCreate (hashcat_ctx_t *hashcat_ctx, HIPcontext *pctx, unsigned int flags, HIPdevice dev); +int hc_hipCtxDestroy (hashcat_ctx_t *hashcat_ctx, HIPcontext ctx); +int hc_hipCtxSetCurrent (hashcat_ctx_t *hashcat_ctx, HIPcontext ctx); +int hc_hipCtxSetCacheConfig (hashcat_ctx_t *hashcat_ctx, HIPfunc_cache config); +int hc_hipCtxSynchronize (hashcat_ctx_t *hashcat_ctx); +int hc_hipDeviceGetAttribute (hashcat_ctx_t *hashcat_ctx, int *pi, HIPdevice_attribute attrib, HIPdevice dev); +int hc_hipDeviceGetCount (hashcat_ctx_t *hashcat_ctx, int *count); +int hc_hipDeviceGet (hashcat_ctx_t *hashcat_ctx, HIPdevice *device, int ordinal); +int hc_hipDeviceGetName (hashcat_ctx_t *hashcat_ctx, char *name, int len, HIPdevice dev); +int hc_hipDeviceTotalMem (hashcat_ctx_t *hashcat_ctx, size_t *bytes, HIPdevice dev); +int hc_hipDriverGetVersion (hashcat_ctx_t *hashcat_ctx, int *driverVersion); +int hc_hipEventCreate (hashcat_ctx_t *hashcat_ctx, HIPevent *phEvent, unsigned int Flags); +int hc_hipEventDestroy (hashcat_ctx_t *hashcat_ctx, HIPevent hEvent); +int hc_hipEventElapsedTime (hashcat_ctx_t *hashcat_ctx, float *pMilliseconds, HIPevent hStart, HIPevent hEnd); +int hc_hipEventQuery (hashcat_ctx_t *hashcat_ctx, HIPevent hEvent); +int hc_hipEventRecord (hashcat_ctx_t *hashcat_ctx, HIPevent hEvent, HIPstream hStream); +int hc_hipEventSynchronize (hashcat_ctx_t *hashcat_ctx, HIPevent hEvent); +int hc_hipFuncGetAttribute (hashcat_ctx_t *hashcat_ctx, int *pi, HIPfunction_attribute attrib, HIPfunction hfunc); +int hc_hipFuncSetAttribute 
(hashcat_ctx_t *hashcat_ctx, HIPfunction hfunc, HIPfunction_attribute attrib, int value); +int hc_hipInit (hashcat_ctx_t *hashcat_ctx, unsigned int Flags); +int hc_hipLaunchKernel (hashcat_ctx_t *hashcat_ctx, HIPfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, HIPstream hStream, void **kernelParams, void **extra); +int hc_hipMemAlloc (hashcat_ctx_t *hashcat_ctx, HIPdeviceptr *dptr, size_t bytesize); +int hc_hipMemcpyDtoD (hashcat_ctx_t *hashcat_ctx, HIPdeviceptr dstDevice, HIPdeviceptr srcDevice, size_t ByteCount); +int hc_hipMemcpyDtoH (hashcat_ctx_t *hashcat_ctx, void *dstHost, HIPdeviceptr srcDevice, size_t ByteCount); +int hc_hipMemcpyHtoD (hashcat_ctx_t *hashcat_ctx, HIPdeviceptr dstDevice, const void *srcHost, size_t ByteCount); +int hc_hipMemFree (hashcat_ctx_t *hashcat_ctx, HIPdeviceptr dptr); +int hc_hipModuleGetFunction (hashcat_ctx_t *hashcat_ctx, HIPfunction *hfunc, HIPmodule hmod, const char *name); +int hc_hipModuleLoadDataEx (hashcat_ctx_t *hashcat_ctx, HIPmodule *module, const void *image, unsigned int numOptions, HIPjit_option *options, void **optionValues); +int hc_hipModuleUnload (hashcat_ctx_t *hashcat_ctx, HIPmodule hmod); +int hc_hipStreamCreate (hashcat_ctx_t *hashcat_ctx, HIPstream *phStream, unsigned int Flags); +int hc_hipStreamDestroy (hashcat_ctx_t *hashcat_ctx, HIPstream hStream); +int hc_hipStreamSynchronize (hashcat_ctx_t *hashcat_ctx, HIPstream hStream); +int hc_hipCtxPushCurrent (hashcat_ctx_t *hashcat_ctx, HIPcontext ctx); +int hc_hipCtxPopCurrent (hashcat_ctx_t *hashcat_ctx, HIPcontext *pctx); +int hc_hipLinkCreate (hashcat_ctx_t *hashcat_ctx, unsigned int numOptions, HIPjit_option *options, void **optionValues, HIPlinkState *stateOut); +int hc_hipLinkAddData (hashcat_ctx_t *hashcat_ctx, HIPlinkState state, HIPjitInputType type, void *data, size_t size, const char *name, unsigned int numOptions, HIPjit_option *options, void **optionValues); +int hc_hipLinkDestroy (hashcat_ctx_t *hashcat_ctx, HIPlinkState state); +int hc_hipLinkComplete (hashcat_ctx_t *hashcat_ctx, HIPlinkState state, void **hipbinOut, size_t *sizeOut); + + int hc_clBuildProgram (hashcat_ctx_t *hashcat_ctx, cl_program program, cl_uint num_devices, const cl_device_id *device_list, const char *options, void (CL_CALLBACK *pfn_notify) (cl_program program, void *user_data), void *user_data); int hc_clCreateBuffer (hashcat_ctx_t *hashcat_ctx, cl_context context, cl_mem_flags flags, size_t size, void *host_ptr, cl_mem *mem); int hc_clCreateCommandQueue (hashcat_ctx_t *hashcat_ctx, cl_context context, cl_device_id device, cl_command_queue_properties properties, cl_command_queue *command_queue); @@ -122,6 +178,10 @@ int run_cuda_kernel_atinit (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *de int run_cuda_kernel_memset (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, CUdeviceptr buf, const u32 value, const u64 size); int run_cuda_kernel_bzero (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, CUdeviceptr buf, const u64 size); +int run_hip_kernel_atinit (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, HIPdeviceptr buf, const u64 num); +int run_hip_kernel_memset (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, HIPdeviceptr buf, const u32 value, const u64 size); +int run_hip_kernel_bzero (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, HIPdeviceptr buf, const u64 size); + int run_opencl_kernel_atinit 
(hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, cl_mem buf, const u64 num); int run_opencl_kernel_memset (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, cl_mem buf, const u32 value, const u64 size); int run_opencl_kernel_bzero (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, cl_mem buf, const u64 size); diff --git a/include/ext_hip.h b/include/ext_hip.h new file mode 100644 index 000000000..15840d671 --- /dev/null +++ b/include/ext_hip.h @@ -0,0 +1,1131 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#ifndef _EXT_HIP_H +#define _EXT_HIP_H + +/** + * TODO: FIX ME + */ + +#define __HIP_API_VERSION 4221131 + +/** + * HIP device pointer + * HIPdeviceptr is defined as an unsigned integer type whose size matches the size of a pointer on the target platform. + */ +#if __HIP_API_VERSION >= 3020 + +#if defined(_WIN64) || defined(__LP64__) +typedef unsigned long long HIPdeviceptr; +#else +typedef unsigned int HIPdeviceptr; +#endif + +#endif /* __HIP_API_VERSION >= 3020 */ + +typedef int HIPdevice; /**< HIP device */ +typedef struct HIPctx_st *HIPcontext; /**< HIP context */ +typedef struct HIPevent_st *HIPevent; /**< HIP event */ +typedef struct HIPfunc_st *HIPfunction; /**< HIP function */ +typedef struct HIPmod_st *HIPmodule; /**< HIP module */ +typedef struct HIPstream_st *HIPstream; /**< HIP stream */ +typedef struct HIPlinkState_st *HIPlinkState; + + +typedef enum hipError_enum { + /** + * The API call returned with no errors. In the case of query calls, this + * also means that the operation being queried is complete (see + * ::hipEventQuery() and ::hipStreamQuery()). + */ + HIP_SUCCESS = 0, + + /** + * This indicates that one or more of the parameters passed to the API call + * is not within an acceptable range of values. + */ + HIP_ERROR_INVALID_VALUE = 1, + + /** + * The API call failed because it was unable to allocate enough memory to + * perform the requested operation. + */ + HIP_ERROR_OUT_OF_MEMORY = 2, + + /** + * This indicates that the HIP driver has not been initialized with + * ::hipInit() or that initialization has failed. + */ + HIP_ERROR_NOT_INITIALIZED = 3, + + /** + * This indicates that the HIP driver is in the process of shutting down. + */ + HIP_ERROR_DEINITIALIZED = 4, + + /** + * This indicates profiler is not initialized for this run. This can + * happen when the application is running with external profiling tools + * like visual profiler. + */ + HIP_ERROR_PROFILER_DISABLED = 5, + + /** + * \deprecated + * This error return is deprecated as of HIP 5.0. It is no longer an error + * to attempt to enable/disable the profiling via ::hipProfilerStart or + * ::hipProfilerStop without initialization. + */ + HIP_ERROR_PROFILER_NOT_INITIALIZED = 6, + + /** + * \deprecated + * This error return is deprecated as of HIP 5.0. It is no longer an error + * to call hipProfilerStart() when profiling is already enabled. + */ + HIP_ERROR_PROFILER_ALREADY_STARTED = 7, + + /** + * \deprecated + * This error return is deprecated as of HIP 5.0. It is no longer an error + * to call hipProfilerStop() when profiling is already disabled. + */ + HIP_ERROR_PROFILER_ALREADY_STOPPED = 8, + + /** + * This indicates that no HIP-capable devices were detected by the installed + * HIP driver. + */ + HIP_ERROR_NO_DEVICE = 100, + + /** + * This indicates that the device ordinal supplied by the user does not + * correspond to a valid HIP device. 
*/ + HIP_ERROR_INVALID_DEVICE = 101, + + + /** + * This indicates that the device kernel image is invalid. This can also + * indicate an invalid HIP module. + */ + HIP_ERROR_INVALID_IMAGE = 200, + + /** + * This most frequently indicates that there is no context bound to the + * current thread. This can also be returned if the context passed to an + * API call is not a valid handle (such as a context that has had + * ::hipCtxDestroy() invoked on it). This can also be returned if a user + * mixes different API versions (i.e. 3010 context with 3020 API calls). + * See ::hipCtxGetApiVersion() for more details. + */ + HIP_ERROR_INVALID_CONTEXT = 201, + + /** + * This indicates that the context being supplied as a parameter to the + * API call was already the active context. + * \deprecated + * This error return is deprecated as of HIP 3.2. It is no longer an + * error to attempt to push the active context via ::hipCtxPushCurrent(). + */ + HIP_ERROR_CONTEXT_ALREADY_CURRENT = 202, + + /** + * This indicates that a map or register operation has failed. + */ + HIP_ERROR_MAP_FAILED = 205, + + /** + * This indicates that an unmap or unregister operation has failed. + */ + HIP_ERROR_UNMAP_FAILED = 206, + + /** + * This indicates that the specified array is currently mapped and thus + * cannot be destroyed. + */ + HIP_ERROR_ARRAY_IS_MAPPED = 207, + + /** + * This indicates that the resource is already mapped. + */ + HIP_ERROR_ALREADY_MAPPED = 208, + + /** + * This indicates that there is no kernel image available that is suitable + * for the device. This can occur when a user specifies code generation + * options for a particular HIP source file that do not include the + * corresponding device configuration. + */ + HIP_ERROR_NO_BINARY_FOR_GPU = 209, + + /** + * This indicates that a resource has already been acquired. + */ + HIP_ERROR_ALREADY_ACQUIRED = 210, + + /** + * This indicates that a resource is not mapped. + */ + HIP_ERROR_NOT_MAPPED = 211, + + /** + * This indicates that a mapped resource is not available for access as an + * array. + */ + HIP_ERROR_NOT_MAPPED_AS_ARRAY = 212, + + /** + * This indicates that a mapped resource is not available for access as a + * pointer. + */ + HIP_ERROR_NOT_MAPPED_AS_POINTER = 213, + + /** + * This indicates that an uncorrectable ECC error was detected during + * execution. + */ + HIP_ERROR_ECC_UNCORRECTABLE = 214, + + /** + * This indicates that the ::HIPlimit passed to the API call is not + * supported by the active device. + */ + HIP_ERROR_UNSUPPORTED_LIMIT = 215, + + /** + * This indicates that the ::HIPcontext passed to the API call can + * only be bound to a single CPU thread at a time but is already + * bound to a CPU thread. + */ + HIP_ERROR_CONTEXT_ALREADY_IN_USE = 216, + + /** + * This indicates that peer access is not supported across the given + * devices. + */ + HIP_ERROR_PEER_ACCESS_UNSUPPORTED = 217, + + /** + * This indicates that a PTX JIT compilation failed. + */ + HIP_ERROR_INVALID_PTX = 218, + + /** + * This indicates an error with OpenGL or DirectX context. + */ + HIP_ERROR_INVALID_GRAPHICS_CONTEXT = 219, + + /** + * This indicates that an uncorrectable NVLink error was detected during the + * execution. + */ + HIP_ERROR_NVLINK_UNCORRECTABLE = 220, + + /** + * This indicates that the PTX JIT compiler library was not found. + */ + HIP_ERROR_JIT_COMPILER_NOT_FOUND = 221, + + /** + * This indicates that the device kernel source is invalid.
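
    The hc_hip* declarations added to backend.h above follow the same convention as the
    existing hc_cu* wrappers: call the dynamically loaded driver entry point, compare the
    returned HIPresult against HIP_SUCCESS, and log any other enum value. A minimal sketch of
    that pattern (the HIP_PTR dispatch table and its field names are illustrative assumptions,
    not the patch's actual code):

    // Illustrative wrapper sketch; hip->hipInit stands in for whatever
    // function-pointer table hip_init () actually populates.
    int hc_hipInit (hashcat_ctx_t *hashcat_ctx, unsigned int Flags)
    {
      backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;

      HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip; // assumed dispatch table

      const HIPresult HIP_err = hip->hipInit (Flags);

      if (HIP_err != HIP_SUCCESS)
      {
        event_log_error (hashcat_ctx, "hipInit(): %d", HIP_err);

        return -1;
      }

      return 0;
    }
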
+ */ + HIP_ERROR_INVALID_SOURCE = 300, + + /** + * This indicates that the file specified was not found. + */ + HIP_ERROR_FILE_NOT_FOUND = 301, + + /** + * This indicates that a link to a shared object failed to resolve. + */ + HIP_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302, + + /** + * This indicates that initialization of a shared object failed. + */ + HIP_ERROR_SHARED_OBJECT_INIT_FAILED = 303, + + /** + * This indicates that an OS call failed. + */ + HIP_ERROR_OPERATING_SYSTEM = 304, + + /** + * This indicates that a resource handle passed to the API call was not + * valid. Resource handles are opaque types like ::HIPstream and ::HIPevent. + */ + HIP_ERROR_INVALID_HANDLE = 400, + + /** + * This indicates that a resource required by the API call is not in a + * valid state to perform the requested operation. + */ + HIP_ERROR_ILLEGAL_STATE = 401, + + /** + * This indicates that a named symbol was not found. Examples of symbols + * are global/constant variable names, texture names, and surface names. + */ + HIP_ERROR_NOT_FOUND = 500, + + /** + * This indicates that asynchronous operations issued previously have not + * completed yet. This result is not actually an error, but must be indicated + * differently than ::HIP_SUCCESS (which indicates completion). Calls that + * may return this value include ::hipEventQuery() and ::hipStreamQuery(). + */ + HIP_ERROR_NOT_READY = 600, + + /** + * While executing a kernel, the device encountered a + * load or store instruction on an invalid memory address. + * This leaves the process in an inconsistent state and any further HIP work + * will return the same error. To continue using HIP, the process must be terminated + * and relaunched. + */ + HIP_ERROR_ILLEGAL_ADDRESS = 700, + + /** + * This indicates that a launch did not occur because it did not have + * appropriate resources. This error usually indicates that the user has + * attempted to pass too many arguments to the device kernel, or the + * kernel launch specifies too many threads for the kernel's register + * count. Passing arguments of the wrong size (i.e. a 64-bit pointer + * when a 32-bit int is expected) is equivalent to passing too many + * arguments and can also result in this error. + */ + HIP_ERROR_LAUNCH_OUT_OF_RESOURCES = 701, + + /** + * This indicates that the device kernel took too long to execute. This can + * only occur if timeouts are enabled - see the device attribute + * ::HIP_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT for more information. + * This leaves the process in an inconsistent state and any further HIP work + * will return the same error. To continue using HIP, the process must be terminated + * and relaunched. + */ + HIP_ERROR_LAUNCH_TIMEOUT = 702, + + /** + * This error indicates a kernel launch that uses an incompatible texturing + * mode. + */ + HIP_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING = 703, + + /** + * This error indicates that a call to ::hipCtxEnablePeerAccess() is + * trying to re-enable peer access to a context which has already + * had peer access to it enabled. + */ + HIP_ERROR_PEER_ACCESS_ALREADY_ENABLED = 704, + + /** + * This error indicates that ::hipCtxDisablePeerAccess() is + * trying to disable peer access which has not been enabled yet + * via ::hipCtxEnablePeerAccess(). + */ + HIP_ERROR_PEER_ACCESS_NOT_ENABLED = 705, + + /** + * This error indicates that the primary context for the specified device + * has already been initialized. 
*/ + HIP_ERROR_PRIMARY_CONTEXT_ACTIVE = 708, + + /** + * This error indicates that the context current to the calling thread + * has been destroyed using ::hipCtxDestroy, or is a primary context which + * has not yet been initialized. + */ + HIP_ERROR_CONTEXT_IS_DESTROYED = 709, + + /** + * A device-side assert triggered during kernel execution. The context + * cannot be used anymore, and must be destroyed. All existing device + * memory allocations from this context are invalid and must be + * reconstructed if the program is to continue using HIP. + */ + HIP_ERROR_ASSERT = 710, + + /** + * This error indicates that the hardware resources required to enable + * peer access have been exhausted for one or more of the devices + * passed to ::hipCtxEnablePeerAccess(). + */ + HIP_ERROR_TOO_MANY_PEERS = 711, + + /** + * This error indicates that the memory range passed to ::hipMemHostRegister() + * has already been registered. + */ + HIP_ERROR_HOST_MEMORY_ALREADY_REGISTERED = 712, + + /** + * This error indicates that the pointer passed to ::hipMemHostUnregister() + * does not correspond to any currently registered memory region. + */ + HIP_ERROR_HOST_MEMORY_NOT_REGISTERED = 713, + + /** + * While executing a kernel, the device encountered a stack error. + * This can be due to stack corruption or exceeding the stack size limit. + * This leaves the process in an inconsistent state and any further HIP work + * will return the same error. To continue using HIP, the process must be terminated + * and relaunched. + */ + HIP_ERROR_HARDWARE_STACK_ERROR = 714, + + /** + * While executing a kernel, the device encountered an illegal instruction. + * This leaves the process in an inconsistent state and any further HIP work + * will return the same error. To continue using HIP, the process must be terminated + * and relaunched. + */ + HIP_ERROR_ILLEGAL_INSTRUCTION = 715, + + /** + * While executing a kernel, the device encountered a load or store instruction + * on a memory address which is not aligned. + * This leaves the process in an inconsistent state and any further HIP work + * will return the same error. To continue using HIP, the process must be terminated + * and relaunched. + */ + HIP_ERROR_MISALIGNED_ADDRESS = 716, + + /** + * While executing a kernel, the device encountered an instruction + * which can only operate on memory locations in certain address spaces + * (global, shared, or local), but was supplied a memory address not + * belonging to an allowed address space. + * This leaves the process in an inconsistent state and any further HIP work + * will return the same error. To continue using HIP, the process must be terminated + * and relaunched. + */ + HIP_ERROR_INVALID_ADDRESS_SPACE = 717, + + /** + * While executing a kernel, the device program counter wrapped its address space. + * This leaves the process in an inconsistent state and any further HIP work + * will return the same error. To continue using HIP, the process must be terminated + * and relaunched. + */ + HIP_ERROR_INVALID_PC = 718, + + /** + * An exception occurred on the device while executing a kernel. Common + * causes include dereferencing an invalid device pointer and accessing + * out of bounds shared memory. Less common cases can be system specific - more + * information about these cases can be found in the system specific user guide. + * This leaves the process in an inconsistent state and any further HIP work + * will return the same error. To continue using HIP, the process must be terminated + * and relaunched.
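
    A short sketch of how the event wrappers declared in backend.h combine to time a kernel
    launch, mirroring hashcat's CUDA path (device_param->hip_stream, hip_function, hip_args,
    num_blocks, and kernel_threads are assumptions for illustration):

    // Sketch only: error checking omitted for brevity.
    HIPevent hip_event1;
    HIPevent hip_event2;

    hc_hipEventCreate (hashcat_ctx, &hip_event1, 0);
    hc_hipEventCreate (hashcat_ctx, &hip_event2, 0);

    hc_hipEventRecord (hashcat_ctx, hip_event1, device_param->hip_stream);

    hc_hipLaunchKernel (hashcat_ctx, hip_function, num_blocks, 1, 1, kernel_threads, 1, 1, 0, device_param->hip_stream, hip_args, NULL);

    hc_hipEventRecord (hashcat_ctx, hip_event2, device_param->hip_stream);

    hc_hipEventSynchronize (hashcat_ctx, hip_event2);

    float exec_ms = 0;

    hc_hipEventElapsedTime (hashcat_ctx, &exec_ms, hip_event1, hip_event2);
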
+ */ + HIP_ERROR_LAUNCH_FAILED = 719, + + /** + * This error indicates that the number of blocks launched per grid for a kernel that was + * launched via either ::hipLaunchCooperativeKernel or ::hipLaunchCooperativeKernelMultiDevice + * exceeds the maximum number of blocks as allowed by ::hipOccupancyMaxActiveBlocksPerMultiprocessor + * or ::hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags times the number of multiprocessors + * as specified by the device attribute ::HIP_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT. + */ + HIP_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE = 720, + + /** + * This error indicates that the attempted operation is not permitted. + */ + HIP_ERROR_NOT_PERMITTED = 800, + + /** + * This error indicates that the attempted operation is not supported + * on the current system or device. + */ + HIP_ERROR_NOT_SUPPORTED = 801, + + /** + * This error indicates that the system is not yet ready to start any HIP + * work. To continue using HIP, verify the system configuration is in a + * valid state and all required driver daemons are actively running. + * More information about this error can be found in the system specific + * user guide. + */ + HIP_ERROR_SYSTEM_NOT_READY = 802, + + /** + * This error indicates that there is a mismatch between the versions of + * the display driver and the HIP driver. Refer to the compatibility documentation + * for supported versions. + */ + HIP_ERROR_SYSTEM_DRIVER_MISMATCH = 803, + + /** + * This error indicates that the system was upgraded to run with forward compatibility + * but the visible hardware detected by HIP does not support this configuration. + * Refer to the compatibility documentation for the supported hardware matrix or ensure + * that only supported hardware is visible during initialization via the HIP_VISIBLE_DEVICES + * environment variable. + */ + HIP_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE = 804, + + /** + * This error indicates that the operation is not permitted when + * the stream is capturing. + */ + HIP_ERROR_STREAM_CAPTURE_UNSUPPORTED = 900, + + /** + * This error indicates that the current capture sequence on the stream + * has been invalidated due to a previous error. + */ + HIP_ERROR_STREAM_CAPTURE_INVALIDATED = 901, + + /** + * This error indicates that the operation would have resulted in a merge + * of two independent capture sequences. + */ + HIP_ERROR_STREAM_CAPTURE_MERGE = 902, + + /** + * This error indicates that the capture was not initiated in this stream. + */ + HIP_ERROR_STREAM_CAPTURE_UNMATCHED = 903, + + /** + * This error indicates that the capture sequence contains a fork that was + * not joined to the primary stream. + */ + HIP_ERROR_STREAM_CAPTURE_UNJOINED = 904, + + /** + * This error indicates that a dependency would have been created which + * crosses the capture sequence boundary. Only implicit in-stream ordering + * dependencies are allowed to cross the boundary. + */ + HIP_ERROR_STREAM_CAPTURE_ISOLATION = 905, + + /** + * This error indicates a disallowed implicit dependency on a current capture + * sequence from HIPStreamLegacy. + */ + HIP_ERROR_STREAM_CAPTURE_IMPLICIT = 906, + + /** + * This error indicates that the operation is not permitted on an event which + * was last recorded in a capturing stream. + */ + HIP_ERROR_CAPTURED_EVENT = 907, + + /** + * A stream capture sequence not initiated with the ::HIP_STREAM_CAPTURE_MODE_RELAXED + * argument to ::HIPStreamBeginCapture was passed to ::hipStreamEndCapture in a + * different thread. 
*/ + HIP_ERROR_STREAM_CAPTURE_WRONG_THREAD = 908, + + /** + * This indicates that an unknown internal error has occurred. + */ + HIP_ERROR_UNKNOWN = 999 +} HIPresult; + +/** + * Online compiler and linker options + */ +typedef enum HIPjit_option_enum +{ + /** + * Max number of registers that a thread may use.\n + * Option type: unsigned int\n + * Applies to: compiler only + */ + HIP_JIT_MAX_REGISTERS = 0, + + /** + * IN: Specifies minimum number of threads per block to target compilation + * for\n + * OUT: Returns the number of threads the compiler actually targeted. + * This restricts the resource utilization of the compiler (e.g. max + * registers) such that a block with the given number of threads should be + * able to launch based on register limitations. Note, this option does not + * currently take into account any other resource limitations, such as + * shared memory utilization.\n + * Cannot be combined with ::HIP_JIT_TARGET.\n + * Option type: unsigned int\n + * Applies to: compiler only + */ + HIP_JIT_THREADS_PER_BLOCK, + + /** + * Overwrites the option value with the total wall clock time, in + * milliseconds, spent in the compiler and linker\n + * Option type: float\n + * Applies to: compiler and linker + */ + HIP_JIT_WALL_TIME, + + /** + * Pointer to a buffer in which to print any log messages + * that are informational in nature (the buffer size is specified via + * option ::HIP_JIT_INFO_LOG_BUFFER_SIZE_BYTES)\n + * Option type: char *\n + * Applies to: compiler and linker + */ + HIP_JIT_INFO_LOG_BUFFER, + + /** + * IN: Log buffer size in bytes. Log messages will be capped at this size + * (including null terminator)\n + * OUT: Amount of log buffer filled with messages\n + * Option type: unsigned int\n + * Applies to: compiler and linker + */ + HIP_JIT_INFO_LOG_BUFFER_SIZE_BYTES, + + /** + * Pointer to a buffer in which to print any log messages that + * reflect errors (the buffer size is specified via option + * ::HIP_JIT_ERROR_LOG_BUFFER_SIZE_BYTES)\n + * Option type: char *\n + * Applies to: compiler and linker + */ + HIP_JIT_ERROR_LOG_BUFFER, + + /** + * IN: Log buffer size in bytes. Log messages will be capped at this size + * (including null terminator)\n + * OUT: Amount of log buffer filled with messages\n + * Option type: unsigned int\n + * Applies to: compiler and linker + */ + HIP_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, + + /** + * Level of optimizations to apply to generated code (0 - 4), with 4 + * being the default and highest level of optimizations.\n + * Option type: unsigned int\n + * Applies to: compiler only + */ + HIP_JIT_OPTIMIZATION_LEVEL, + + /** + * No option value required. Determines the target based on the current + * attached context (default)\n + * Option type: No option value needed\n + * Applies to: compiler and linker + */ + HIP_JIT_TARGET_FROM_HIPCONTEXT, + + /** + * Target is chosen based on supplied ::HIPjit_target. Cannot be + * combined with ::HIP_JIT_THREADS_PER_BLOCK.\n + * Option type: unsigned int for enumerated type ::HIPjit_target\n + * Applies to: compiler and linker + */ + HIP_JIT_TARGET, + + /** + * Specifies choice of fallback strategy if matching HIPbin is not found. + * Choice is based on supplied ::HIPjit_fallback.
This option cannot be + * used with HIPLink* APIs as the linker requires exact matches.\n + * Option type: unsigned int for enumerated type ::HIPjit_fallback\n + * Applies to: compiler only + */ + HIP_JIT_FALLBACK_STRATEGY, + + /** + * Specifies whether to create debug information in output (-g) + * (0: false, default)\n + * Option type: int\n + * Applies to: compiler and linker + */ + HIP_JIT_GENERATE_DEBUG_INFO, + + /** + * Generate verbose log messages (0: false, default)\n + * Option type: int\n + * Applies to: compiler and linker + */ + HIP_JIT_LOG_VERBOSE, + + /** + * Generate line number information (-lineinfo) (0: false, default)\n + * Option type: int\n + * Applies to: compiler only + */ + HIP_JIT_GENERATE_LINE_INFO, + + /** + * Specifies whether to enable caching explicitly (-dlcm) \n + * Choice is based on supplied ::HIPjit_cacheMode_enum.\n + * Option type: unsigned int for enumerated type ::HIPjit_cacheMode_enum\n + * Applies to: compiler only + */ + HIP_JIT_CACHE_MODE, + + /** + * The below jit options are used for internal purposes only, in this version of HIP + */ + HIP_JIT_NEW_SM3X_OPT, + HIP_JIT_FAST_COMPILE, + + /** + * Array of device symbol names that will be relocated to the corresponding + * host addresses stored in ::HIP_JIT_GLOBAL_SYMBOL_ADDRESSES.\n + * Must contain ::HIP_JIT_GLOBAL_SYMBOL_COUNT entries.\n + * When loading a device module, driver will relocate all encountered + * unresolved symbols to the host addresses.\n + * It is only allowed to register symbols that correspond to unresolved + * global variables.\n + * It is illegal to register the same device symbol at multiple addresses.\n + * Option type: const char **\n + * Applies to: dynamic linker only + */ + HIP_JIT_GLOBAL_SYMBOL_NAMES, + + /** + * Array of host addresses that will be used to relocate corresponding + * device symbols stored in ::HIP_JIT_GLOBAL_SYMBOL_NAMES.\n + * Must contain ::HIP_JIT_GLOBAL_SYMBOL_COUNT entries.\n + * Option type: void **\n + * Applies to: dynamic linker only + */ + HIP_JIT_GLOBAL_SYMBOL_ADDRESSES, + + /** + * Number of entries in ::HIP_JIT_GLOBAL_SYMBOL_NAMES and + * ::HIP_JIT_GLOBAL_SYMBOL_ADDRESSES arrays.\n + * Option type: unsigned int\n + * Applies to: dynamic linker only + */ + HIP_JIT_GLOBAL_SYMBOL_COUNT, + + HIP_JIT_NUM_OPTIONS + +} HIPjit_option; + + +/** + * Device properties + */ +typedef enum HIPdevice_attribute_enum { + + HIP_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0, /**< Maximum number of threads per block */ + HIP_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 1, /**< Maximum block dimension X */ + HIP_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 2, /**< Maximum block dimension Y */ + HIP_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 3, /**< Maximum block dimension Z */ + HIP_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 4, /**< Maximum grid dimension X */ + HIP_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 5, /**< Maximum grid dimension Y */ + HIP_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 6, /**< Maximum grid dimension Z */ + HIP_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 7, /**< Maximum shared memory available per block in bytes */ + HIP_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = 7, /**< Deprecated, use HIP_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK */ + HIP_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN = 7, /**< Maximum optin shared memory per block */ + HIP_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 8, /**< Memory available on device for __constant__ variables in a HIP C kernel in bytes */ + HIP_DEVICE_ATTRIBUTE_WARP_SIZE = 9, /**< Warp size in threads */ + HIP_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK =
10, /**< Maximum number of 32-bit registers available per block */ + HIP_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 10, /**< Deprecated, use HIP_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK */ + HIP_DEVICE_ATTRIBUTE_CLOCK_RATE = 11, /**< Typical clock frequency in kilohertz */ + HIP_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 12, /**< Peak memory clock frequency in kilohertz */ + HIP_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH = 13, /**< Global memory bus width in bits */ + HIP_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 14, /**< Number of multiprocessors on device */ + HIP_DEVICE_ATTRIBUTE_COMPUTE_MODE = 15, /**< Compute mode (See ::HIPcomputemode for details) */ + HIP_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 16, /**< Size of L2 cache in bytes */ + HIP_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 17, /**< Maximum resident threads per multiprocessor */ + HIP_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 18, /**< Major compute capability version number */ + HIP_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 19, /**< Minor compute capability version number */ + HIP_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 20, /**< Device can possibly execute multiple kernels concurrently */ + HIP_DEVICE_ATTRIBUTE_PCI_BUS_ID = 21, /**< PCI bus ID of the device */ + HIP_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 22, /**< PCI device ID of the device */ + HIP_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID = 22, /**< PCI domain ID of the device */ + HIP_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR = 23, /**< Maximum shared memory available per multiprocessor in bytes */ + HIP_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD = 24, /**< Device is on a multi-GPU board */ + HIP_DEVICE_ATTRIBUTE_INTEGRATED = 25, /**< Device is integrated with host memory */ + HIP_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH = 26, /**< Device supports launching cooperative kernels via ::hipLaunchCooperativeKernel */ + HIP_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH = 27, /**< Device can participate in cooperative kernels launched via ::hipLaunchCooperativeKernelMultiDevice */ + HIP_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH = 28, /**< Maximum 1D texture width */ + HIP_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 29, /**< Maximum 2D texture width */ + HIP_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 30, /**< Maximum 2D texture height */ + HIP_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 31, /**< Maximum 3D texture width */ + HIP_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 32, /**< Maximum 3D texture height */ + HIP_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 33, /**< Maximum 3D texture depth */ + + HIP_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 37, /**< Alignment requirement for textures */ + HIP_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT = 38, /**< Pitch alignment requirement for textures */ + HIP_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 39, /**< Specifies whether there is a run time limit on kernels */ + HIP_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 40, /**< Device can map host memory into HIP address space */ + HIP_DEVICE_ATTRIBUTE_ECC_ENABLED = 41, /**< Device has ECC support enabled */ + + HIP_DEVICE_ATTRIBUTE_MANAGED_MEMORY = 47, /**< Device can allocate managed memory on this system */ + HIP_DEVICE_ATTRIBUTE_DIRECT_MANAGED_MEM_ACCESS_FROM_HOST = 48, /**< The host can directly access managed memory on the device without migration. 
*/ + HIP_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS = 49, /**< Device can coherently access managed memory concurrently with the CPU */ + HIP_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS = 50, /**< Device supports coherently accessing pageable memory without calling HIPHostRegister on it */ + HIP_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES = 51, /**< Device accesses pageable memory via the host's page tables. */ + HIP_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR = 52, /**< ::HIP_STREAM_WAIT_VALUE_NOR is supported. */ + + + // HIP_DEVICE_ATTRIBUTE_MAX_PITCH = , /**< Maximum pitch in bytes allowed by memory copies */ + // HIP_DEVICE_ATTRIBUTE_GPU_OVERLAP = , /**< Device can possibly copy memory and execute a kernel concurrently. Deprecated. Use instead HIP_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT. */ + // + // HIP_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH = , /**< Maximum 2D layered texture width */ + // HIP_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT = , /**< Maximum 2D layered texture height */ + // HIP_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS = , /**< Maximum layers in a 2D layered texture */ + // HIP_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH = , /**< Deprecated, use HIP_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH */ + // HIP_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT = , /**< Deprecated, use HIP_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT */ + // HIP_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES = , /**< Deprecated, use HIP_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS */ + // HIP_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT =, /**< Alignment requirement for surfaces */ + // HIP_DEVICE_ATTRIBUTE_TCC_DRIVER = , /**< Device is using TCC driver model */ + // HIP_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = , /**< Number of asynchronous engines */ + // HIP_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = , /**< Device shares a unified address space with the host */ + // HIP_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH = , /**< Maximum 1D layered texture width */ + // HIP_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS = , /**< Maximum layers in a 1D layered texture */ + // HIP_DEVICE_ATTRIBUTE_CAN_TEX2D_GATHER = , /**< Deprecated, do not use. 
*/ + // HIP_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH = , /**< Maximum 2D texture width if HIP_ARRAY3D_TEXTURE_GATHER is set */ + // HIP_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT = , /**< Maximum 2D texture height if HIP_ARRAY3D_TEXTURE_GATHER is set */ + // HIP_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE = , /**< Alternate maximum 3D texture width */ + // HIP_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE = ,/**< Alternate maximum 3D texture height */ + // HIP_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE = , /**< Alternate maximum 3D texture depth */ + // + // HIP_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH = , /**< Maximum cubemap texture width/height */ + // HIP_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH = , /**< Maximum cubemap layered texture width/height */ + // HIP_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS = , /**< Maximum layers in a cubemap layered texture */ + // HIP_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH = , /**< Maximum 1D surface width */ + // HIP_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH = , /**< Maximum 2D surface width */ + // HIP_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT = , /**< Maximum 2D surface height */ + // HIP_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH = , /**< Maximum 3D surface width */ + // HIP_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT = , /**< Maximum 3D surface height */ + // HIP_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH = , /**< Maximum 3D surface depth */ + // HIP_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH = , /**< Maximum 1D layered surface width */ + // HIP_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS = , /**< Maximum layers in a 1D layered surface */ + // HIP_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH = , /**< Maximum 2D layered surface width */ + // HIP_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT = , /**< Maximum 2D layered surface height */ + // HIP_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS = , /**< Maximum layers in a 2D layered surface */ + // HIP_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH = , /**< Maximum cubemap surface width */ + // HIP_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH = , /**< Maximum cubemap layered surface width */ + // HIP_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS = , /**< Maximum layers in a cubemap layered surface */ + // HIP_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH = , /**< Maximum 1D linear texture width */ + // HIP_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH = , /**< Maximum 2D linear texture width */ + // HIP_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT = , /**< Maximum 2D linear texture height */ + // HIP_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH = , /**< Maximum 2D linear texture pitch in bytes */ + // HIP_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH = , /**< Maximum mipmapped 2D texture width */ + // HIP_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT = ,/**< Maximum mipmapped 2D texture height */ + // HIP_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH = , /**< Maximum mipmapped 1D texture width */ + // HIP_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED = , /**< Device supports stream priorities */ + // HIP_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED = , /**< Device supports caching globals in L1 */ + // HIP_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED = , /**< Device supports caching locals in L1 */ + // HIP_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = , /**< Maximum number of 32-bit registers available per multiprocessor */ + // HIP_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID = , 
/**< Unique id for a group of devices on the same multi-GPU board */ + // HIP_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED = , /**< Link between the device and the host supports native atomic operations (this is a placeholder attribute, and is not supported on any current hardware)*/ + // HIP_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO = , /**< Ratio of single precision performance (in floating-point operations per second) to double precision performance */ + // HIP_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED = , /**< Device supports compute preemption. */ + // HIP_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM = , /**< Device can access host registered memory at the same virtual address as the CPU */ + // HIP_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS = , /**< ::hipStreamBatchMemOp and related APIs are supported. */ + // HIP_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS = , /**< 64-bit operations are supported in ::hipStreamBatchMemOp and related APIs. */ + // HIP_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES = , /**< Both the ::HIP_STREAM_WAIT_VALUE_FLUSH flag and the ::HIP_STREAM_MEM_OP_FLUSH_REMOTE_WRITES MemOp are supported on the device. See \ref HIP_MEMOP for additional details. */ + // HIP_DEVICE_ATTRIBUTE_HOST_REGISTER_SUPPORTED = , /**< Device supports host memory registration via ::HIPHostRegister. */ + // HIP_DEVICE_ATTRIBUTE_MAX +} HIPdevice_attribute; + +/** + * Function cache configurations + */ +typedef enum HIPfunc_cache_enum { + HIP_FUNC_CACHE_PREFER_NONE = 0x00, /**< no preference for shared memory or L1 (default) */ + HIP_FUNC_CACHE_PREFER_SHARED = 0x01, /**< prefer larger shared memory and smaller L1 cache */ + HIP_FUNC_CACHE_PREFER_L1 = 0x02, /**< prefer larger L1 cache and smaller shared memory */ + HIP_FUNC_CACHE_PREFER_EQUAL = 0x03 /**< prefer equal sized L1 cache and shared memory */ +} HIPfunc_cache; + +/** + * Shared memory configurations + */ +typedef enum HIPsharedconfig_enum { + HIP_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE = 0x00, /**< set default shared memory bank size */ + HIP_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE = 0x01, /**< set shared memory bank width to four bytes */ + HIP_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE = 0x02 /**< set shared memory bank width to eight bytes */ +} HIPsharedconfig; + +/** + * Function properties + */ +typedef enum HIPfunction_attribute_enum { + /** + * The maximum number of threads per block, beyond which a launch of the + * function would fail. This number depends on both the function and the + * device on which the function is currently loaded. + */ + HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0, + + /** + * The size in bytes of statically-allocated shared memory required by + * this function. This does not include dynamically-allocated shared + * memory requested by the user at runtime. + */ + HIP_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 1, + + /** + * The size in bytes of user-allocated constant memory required by this + * function. + */ + HIP_FUNC_ATTRIBUTE_CONST_SIZE_BYTES = 2, + + /** + * The size in bytes of local memory used by each thread of this function. + */ + HIP_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES = 3, + + /** + * The number of registers used by each thread of this function. + */ + HIP_FUNC_ATTRIBUTE_NUM_REGS = 4, + + /** + * The PTX virtual architecture version for which the function was + * compiled. This value is the major PTX version * 10 + the minor PTX + * version, so a PTX version 1.3 function would return the value 13. 
+ * Note that this may return the undefined value of 0 for cubins + * compiled prior to HIP 3.0. + */ + HIP_FUNC_ATTRIBUTE_PTX_VERSION = 5, + + /** + * The binary architecture version for which the function was compiled. + * This value is the major binary version * 10 + the minor binary version, + * so a binary version 1.3 function would return the value 13. Note that + * this will return a value of 10 for legacy cubins that do not have a + * properly-encoded binary architecture version. + */ + HIP_FUNC_ATTRIBUTE_BINARY_VERSION = 6, + + /** + * The attribute to indicate whether the function has been compiled with + * user-specified option "-Xptxas --dlcm=ca" set. + */ + HIP_FUNC_ATTRIBUTE_CACHE_MODE_CA = 7, + + /** + * The maximum size in bytes of dynamically-allocated shared memory that can be used by + * this function. If the user-specified dynamic shared memory size is larger than this + * value, the launch will fail. + * See ::hipFuncSetAttribute + */ + HIP_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES = 8, + + /** + * On devices where the L1 cache and shared memory use the same hardware resources, + * this sets the shared memory carveout preference, in percent of the total shared memory. + * Refer to ::HIP_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR. + * This is only a hint, and the driver can choose a different ratio if required to execute the function. + * See ::hipFuncSetAttribute + */ + HIP_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT = 9, + + HIP_FUNC_ATTRIBUTE_MAX +} HIPfunction_attribute;
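A one-call sketch of how these attributes are typically consumed through the function table defined later in this header; hip and hip_function are assumed to be a populated hc_hip_lib_t pointer and a kernel handle, error handling elided:

  int max_threads_per_block = 0;

  hip->hipFuncGetAttribute (&max_threads_per_block, HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, hip_function);

+ +/** + * Context creation flags + */ +typedef enum HIPctx_flags_enum { + HIP_CTX_SCHED_AUTO = 0x00, /**< Automatic scheduling */ + HIP_CTX_SCHED_SPIN = 0x01, /**< Set spin as default scheduling */ + HIP_CTX_SCHED_YIELD = 0x02, /**< Set yield as default scheduling */ + HIP_CTX_SCHED_BLOCKING_SYNC = 0x04, /**< Set blocking synchronization as default scheduling */ + HIP_CTX_BLOCKING_SYNC = 0x04, /**< Set blocking synchronization as default scheduling + * \deprecated This flag was deprecated as of HIP 4.0 + * and was replaced with ::HIP_CTX_SCHED_BLOCKING_SYNC. */ + HIP_CTX_SCHED_MASK = 0x07, + HIP_CTX_MAP_HOST = 0x08, /**< Support mapped pinned allocations */ + HIP_CTX_LMEM_RESIZE_TO_MAX = 0x10, /**< Keep local memory allocation after launch */ + HIP_CTX_FLAGS_MASK = 0x1f +} HIPctx_flags; + +/** + * Stream creation flags + */ +typedef enum HIPstream_flags_enum { + HIP_STREAM_DEFAULT = 0x0, /**< Default stream flag */ + HIP_STREAM_NON_BLOCKING = 0x1 /**< Stream does not synchronize with stream 0 (the NULL stream) */ +} HIPstream_flags; + +/** + * Event creation flags + */ +typedef enum HIPevent_flags_enum { + HIP_EVENT_DEFAULT = 0x0, /**< Default event flag */ + HIP_EVENT_BLOCKING_SYNC = 0x1, /**< Event uses blocking synchronization */ + HIP_EVENT_DISABLE_TIMING = 0x2, /**< Event will not record timing data */ + HIP_EVENT_INTERPROCESS = 0x4 /**< Event is suitable for interprocess use. 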
HIP_EVENT_DISABLE_TIMING must be set */ +} HIPevent_flags; + +typedef enum HIPjitInputType_enum +{ + /** + * Compiled device-class-specific device code\n + * Applicable options: none + */ + HIP_JIT_INPUT_HIPBIN = 0, + + /** + * PTX source code\n + * Applicable options: PTX compiler options + */ + HIP_JIT_INPUT_PTX, + + /** + * Bundle of multiple cubins and/or PTX of some device code\n + * Applicable options: PTX compiler options, ::HIP_JIT_FALLBACK_STRATEGY + */ + HIP_JIT_INPUT_FATBINARY, + + /** + * Host object with embedded device code\n + * Applicable options: PTX compiler options, ::HIP_JIT_FALLBACK_STRATEGY + */ + HIP_JIT_INPUT_OBJECT, + + /** + * Archive of host objects with embedded device code\n + * Applicable options: PTX compiler options, ::HIP_JIT_FALLBACK_STRATEGY + */ + HIP_JIT_INPUT_LIBRARY, + + HIP_JIT_NUM_INPUT_TYPES +} HIPjitInputType; + +#ifdef _WIN32 +#define HIPAPI __stdcall +#else +#define HIPAPI +#endif + +#define HIP_API_CALL HIPAPI + +typedef HIPresult (HIP_API_CALL *HIP_HIPCTXCREATE) (HIPcontext *, unsigned int, HIPdevice); +typedef HIPresult (HIP_API_CALL *HIP_HIPCTXDESTROY) (HIPcontext); +typedef HIPresult (HIP_API_CALL *HIP_HIPCTXGETCACHECONFIG) (HIPfunc_cache *); +typedef HIPresult (HIP_API_CALL *HIP_HIPCTXGETCURRENT) (HIPcontext *); +typedef HIPresult (HIP_API_CALL *HIP_HIPCTXGETSHAREDMEMCONFIG) (HIPsharedconfig *); +typedef HIPresult (HIP_API_CALL *HIP_HIPCTXPOPCURRENT) (HIPcontext *); +typedef HIPresult (HIP_API_CALL *HIP_HIPCTXPUSHCURRENT) (HIPcontext); +typedef HIPresult (HIP_API_CALL *HIP_HIPCTXSETCACHECONFIG) (HIPfunc_cache); +typedef HIPresult (HIP_API_CALL *HIP_HIPCTXSETCURRENT) (HIPcontext); +typedef HIPresult (HIP_API_CALL *HIP_HIPCTXSETSHAREDMEMCONFIG) (HIPsharedconfig); +typedef HIPresult (HIP_API_CALL *HIP_HIPCTXSYNCHRONIZE) (); +typedef HIPresult (HIP_API_CALL *HIP_HIPDEVICEGETATTRIBUTE) (int *, HIPdevice_attribute, HIPdevice); +typedef HIPresult (HIP_API_CALL *HIP_HIPDEVICEGETCOUNT) (int *); +typedef HIPresult (HIP_API_CALL *HIP_HIPDEVICEGET) (HIPdevice *, int); +typedef HIPresult (HIP_API_CALL *HIP_HIPDEVICEGETNAME) (char *, int, HIPdevice); +typedef HIPresult (HIP_API_CALL *HIP_HIPDEVICETOTALMEM) (size_t *, HIPdevice); +typedef HIPresult (HIP_API_CALL *HIP_HIPDRIVERGETVERSION) (int *); +typedef HIPresult (HIP_API_CALL *HIP_HIPEVENTCREATE) (HIPevent *, unsigned int); +typedef HIPresult (HIP_API_CALL *HIP_HIPEVENTDESTROY) (HIPevent); +typedef HIPresult (HIP_API_CALL *HIP_HIPEVENTELAPSEDTIME) (float *, HIPevent, HIPevent); +typedef HIPresult (HIP_API_CALL *HIP_HIPEVENTQUERY) (HIPevent); +typedef HIPresult (HIP_API_CALL *HIP_HIPEVENTRECORD) (HIPevent, HIPstream); +typedef HIPresult (HIP_API_CALL *HIP_HIPEVENTSYNCHRONIZE) (HIPevent); +typedef HIPresult (HIP_API_CALL *HIP_HIPFUNCGETATTRIBUTE) (int *, HIPfunction_attribute, HIPfunction); +typedef HIPresult (HIP_API_CALL *HIP_HIPFUNCSETATTRIBUTE) (HIPfunction, HIPfunction_attribute, int); +typedef HIPresult (HIP_API_CALL *HIP_HIPFUNCSETCACHECONFIG) (HIPfunction, HIPfunc_cache); +typedef HIPresult (HIP_API_CALL *HIP_HIPFUNCSETSHAREDMEMCONFIG) (HIPfunction, HIPsharedconfig); +typedef HIPresult (HIP_API_CALL *HIP_HIPGETERRORNAME) (HIPresult, const char **); +typedef HIPresult (HIP_API_CALL *HIP_HIPGETERRORSTRING) (HIPresult, const char **); +typedef HIPresult (HIP_API_CALL *HIP_HIPINIT) (unsigned int); +typedef HIPresult (HIP_API_CALL *HIP_HIPLAUNCHKERNEL) (HIPfunction, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, HIPstream, void **, void **); +typedef 
HIPresult (HIP_API_CALL *HIP_HIPMEMALLOC) (HIPdeviceptr *, size_t); +typedef HIPresult (HIP_API_CALL *HIP_HIPMEMALLOCHOST) (void **, size_t); +typedef HIPresult (HIP_API_CALL *HIP_HIPMEMCPYDTOD) (HIPdeviceptr, HIPdeviceptr, size_t); +typedef HIPresult (HIP_API_CALL *HIP_HIPMEMCPYDTOH) (void *, HIPdeviceptr, size_t); +typedef HIPresult (HIP_API_CALL *HIP_HIPMEMCPYHTOD) (HIPdeviceptr, const void *, size_t); +typedef HIPresult (HIP_API_CALL *HIP_HIPMEMFREE) (HIPdeviceptr); +typedef HIPresult (HIP_API_CALL *HIP_HIPMEMFREEHOST) (void *); +typedef HIPresult (HIP_API_CALL *HIP_HIPMEMGETINFO) (size_t *, size_t *); +typedef HIPresult (HIP_API_CALL *HIP_HIPMEMSETD32) (HIPdeviceptr, unsigned int, size_t); +typedef HIPresult (HIP_API_CALL *HIP_HIPMEMSETD8) (HIPdeviceptr, unsigned char, size_t); +typedef HIPresult (HIP_API_CALL *HIP_HIPMODULEGETFUNCTION) (HIPfunction *, HIPmodule, const char *); +typedef HIPresult (HIP_API_CALL *HIP_HIPMODULEGETGLOBAL) (HIPdeviceptr *, size_t *, HIPmodule, const char *); +typedef HIPresult (HIP_API_CALL *HIP_HIPMODULELOAD) (HIPmodule *, const char *); +typedef HIPresult (HIP_API_CALL *HIP_HIPMODULELOADDATA) (HIPmodule *, const void *); +typedef HIPresult (HIP_API_CALL *HIP_HIPMODULELOADDATAEX) (HIPmodule *, const void *, unsigned int, HIPjit_option *, void **); +typedef HIPresult (HIP_API_CALL *HIP_HIPMODULEUNLOAD) (HIPmodule); +typedef HIPresult (HIP_API_CALL *HIP_HIPPROFILERSTART) (); +typedef HIPresult (HIP_API_CALL *HIP_HIPPROFILERSTOP) (); +typedef HIPresult (HIP_API_CALL *HIP_HIPSTREAMCREATE) (HIPstream *, unsigned int); +typedef HIPresult (HIP_API_CALL *HIP_HIPSTREAMDESTROY) (HIPstream); +typedef HIPresult (HIP_API_CALL *HIP_HIPSTREAMSYNCHRONIZE) (HIPstream); +typedef HIPresult (HIP_API_CALL *HIP_HIPSTREAMWAITEVENT) (HIPstream, HIPevent, unsigned int); +typedef HIPresult (HIP_API_CALL *HIP_HIPLINKCREATE) (unsigned int, HIPjit_option *, void **, HIPlinkState *); +typedef HIPresult (HIP_API_CALL *HIP_HIPLINKADDDATA) (HIPlinkState, HIPjitInputType, void *, size_t, const char *, unsigned int, HIPjit_option *, void **); +typedef HIPresult (HIP_API_CALL *HIP_HIPLINKDESTROY) (HIPlinkState); +typedef HIPresult (HIP_API_CALL *HIP_HIPLINKCOMPLETE) (HIPlinkState, void **, size_t *); + +typedef struct hc_hip_lib +{ + hc_dynlib_t lib; + + HIP_HIPCTXCREATE hipCtxCreate; + HIP_HIPCTXDESTROY hipCtxDestroy; + HIP_HIPCTXGETCACHECONFIG hipCtxGetCacheConfig; + HIP_HIPCTXGETCURRENT hipCtxGetCurrent; + HIP_HIPCTXGETSHAREDMEMCONFIG hipCtxGetSharedMemConfig; + HIP_HIPCTXPOPCURRENT hipCtxPopCurrent; + HIP_HIPCTXPUSHCURRENT hipCtxPushCurrent; + HIP_HIPCTXSETCACHECONFIG hipCtxSetCacheConfig; + HIP_HIPCTXSETCURRENT hipCtxSetCurrent; + HIP_HIPCTXSETSHAREDMEMCONFIG hipCtxSetSharedMemConfig; + HIP_HIPCTXSYNCHRONIZE hipCtxSynchronize; + HIP_HIPDEVICEGETATTRIBUTE hipDeviceGetAttribute; + HIP_HIPDEVICEGETCOUNT hipDeviceGetCount; + HIP_HIPDEVICEGET hipDeviceGet; + HIP_HIPDEVICEGETNAME hipDeviceGetName; + HIP_HIPDEVICETOTALMEM hipDeviceTotalMem; + HIP_HIPDRIVERGETVERSION hipDriverGetVersion; + HIP_HIPEVENTCREATE hipEventCreate; + HIP_HIPEVENTDESTROY hipEventDestroy; + HIP_HIPEVENTELAPSEDTIME hipEventElapsedTime; + HIP_HIPEVENTQUERY hipEventQuery; + HIP_HIPEVENTRECORD hipEventRecord; + HIP_HIPEVENTSYNCHRONIZE hipEventSynchronize; + HIP_HIPFUNCGETATTRIBUTE hipFuncGetAttribute; + HIP_HIPFUNCSETATTRIBUTE hipFuncSetAttribute; + HIP_HIPFUNCSETCACHECONFIG hipFuncSetCacheConfig; + HIP_HIPFUNCSETSHAREDMEMCONFIG hipFuncSetSharedMemConfig; + HIP_HIPGETERRORNAME hipGetErrorName; + HIP_HIPGETERRORSTRING 
hipGetErrorString; + HIP_HIPINIT hipInit; + HIP_HIPLAUNCHKERNEL hipLaunchKernel; + HIP_HIPMEMALLOC hipMemAlloc; + HIP_HIPMEMALLOCHOST hipMemAllocHost; + HIP_HIPMEMCPYDTOD hipMemcpyDtoD; + HIP_HIPMEMCPYDTOH hipMemcpyDtoH; + HIP_HIPMEMCPYHTOD hipMemcpyHtoD; + HIP_HIPMEMFREE hipMemFree; + HIP_HIPMEMFREEHOST hipMemFreeHost; + HIP_HIPMEMGETINFO hipMemGetInfo; + HIP_HIPMEMSETD32 hipMemsetD32; + HIP_HIPMEMSETD8 hipMemsetD8; + HIP_HIPMODULEGETFUNCTION hipModuleGetFunction; + HIP_HIPMODULEGETGLOBAL hipModuleGetGlobal; + HIP_HIPMODULELOAD hipModuleLoad; + HIP_HIPMODULELOADDATA hipModuleLoadData; + HIP_HIPMODULELOADDATAEX hipModuleLoadDataEx; + HIP_HIPMODULEUNLOAD hipModuleUnload; + HIP_HIPPROFILERSTART hipProfilerStart; + HIP_HIPPROFILERSTOP hipProfilerStop; + HIP_HIPSTREAMCREATE hipStreamCreate; + HIP_HIPSTREAMDESTROY hipStreamDestroy; + HIP_HIPSTREAMSYNCHRONIZE hipStreamSynchronize; + HIP_HIPSTREAMWAITEVENT hipStreamWaitEvent; + HIP_HIPLINKCREATE hipLinkCreate; + HIP_HIPLINKADDDATA hipLinkAddData; + HIP_HIPLINKDESTROY hipLinkDestroy; + HIP_HIPLINKCOMPLETE hipLinkComplete; + +} hc_hip_lib_t; + +typedef hc_hip_lib_t HIP_PTR; + +#endif // _EXT_HIP_H \ No newline at end of file diff --git a/include/ext_hiprtc.h b/include/ext_hiprtc.h new file mode 100644 index 000000000..cd1be6c4b --- /dev/null +++ b/include/ext_hiprtc.h @@ -0,0 +1,87 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#ifndef _EXT_HIPRTC_H +#define _EXT_HIPRTC_H + +/** + * from hiprtc.h (/opt/rocm/hip/include/hip/amd_detail/hiprtc.h) + */ + +/** + * \ingroup error + * \brief The enumerated type hiprtcResult defines API call result codes. + * HIPRTC API functions return hiprtcResult to indicate the call + * result. + */ +typedef enum { + HIPRTC_SUCCESS = 0, + HIPRTC_ERROR_OUT_OF_MEMORY = 1, + HIPRTC_ERROR_PROGRAM_CREATION_FAILURE = 2, + HIPRTC_ERROR_INVALID_INPUT = 3, + HIPRTC_ERROR_INVALID_PROGRAM = 4, + HIPRTC_ERROR_INVALID_OPTION = 5, + HIPRTC_ERROR_COMPILATION = 6, + HIPRTC_ERROR_BUILTIN_OPERATION_FAILURE = 7, + HIPRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION = 8, + HIPRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION = 9, + HIPRTC_ERROR_NAME_EXPRESSION_NOT_VALID = 10, + HIPRTC_ERROR_INTERNAL_ERROR = 11 +} hiprtcResult; + +/** + * \ingroup compilation + * \brief hiprtcProgram is the unit of compilation, and an opaque handle for + * a program. + * + * To compile a HIP program string, an instance of hiprtcProgram must be + * created first with ::hiprtcCreateProgram, then compiled with + * ::hiprtcCompileProgram. 
+ */ +typedef struct _hiprtcProgram *hiprtcProgram; + +#ifdef _WIN32 +#define HIPRTCAPI __stdcall +#else +#define HIPRTCAPI +#endif + +#define HIPRTC_API_CALL HIPRTCAPI + +typedef hiprtcResult (HIPRTC_API_CALL *HIPRTC_HIPRTCADDNAMEEXPRESSION) (hiprtcProgram, const char * const); +typedef hiprtcResult (HIPRTC_API_CALL *HIPRTC_HIPRTCCOMPILEPROGRAM) (hiprtcProgram, int, const char * const *); +typedef hiprtcResult (HIPRTC_API_CALL *HIPRTC_HIPRTCCREATEPROGRAM) (hiprtcProgram *, const char *, const char *, int, const char * const *, const char * const *); +typedef hiprtcResult (HIPRTC_API_CALL *HIPRTC_HIPRTCDESTROYPROGRAM) (hiprtcProgram *); +typedef hiprtcResult (HIPRTC_API_CALL *HIPRTC_HIPRTCGETLOWEREDNAME) (hiprtcProgram, const char * const, const char **); +typedef hiprtcResult (HIPRTC_API_CALL *HIPRTC_HIPRTCGETPTX) (hiprtcProgram, char *); +typedef hiprtcResult (HIPRTC_API_CALL *HIPRTC_HIPRTCGETPTXSIZE) (hiprtcProgram, size_t *); +typedef hiprtcResult (HIPRTC_API_CALL *HIPRTC_HIPRTCGETPROGRAMLOG) (hiprtcProgram, char *); +typedef hiprtcResult (HIPRTC_API_CALL *HIPRTC_HIPRTCGETPROGRAMLOGSIZE) (hiprtcProgram, size_t *); +typedef const char * (HIPRTC_API_CALL *HIPRTC_HIPRTCGETERRORSTRING) (hiprtcResult); +typedef hiprtcResult (HIPRTC_API_CALL *HIPRTC_HIPRTCVERSION) (int *, int *); + +typedef struct hc_hiprtc_lib +{ + hc_dynlib_t lib; + + HIPRTC_HIPRTCADDNAMEEXPRESSION hiprtcAddNameExpression; + HIPRTC_HIPRTCCOMPILEPROGRAM hiprtcCompileProgram; + HIPRTC_HIPRTCCREATEPROGRAM hiprtcCreateProgram; + HIPRTC_HIPRTCDESTROYPROGRAM hiprtcDestroyProgram; + HIPRTC_HIPRTCGETLOWEREDNAME hiprtcGetLoweredName; + HIPRTC_HIPRTCGETPTX hiprtcGetCode; + HIPRTC_HIPRTCGETPTXSIZE hiprtcGetCodeSize; + HIPRTC_HIPRTCGETPROGRAMLOG hiprtcGetProgramLog; + HIPRTC_HIPRTCGETPROGRAMLOGSIZE hiprtcGetProgramLogSize; + HIPRTC_HIPRTCGETERRORSTRING hiprtcGetErrorString; + HIPRTC_HIPRTCVERSION hiprtcVersion; + +} hc_hiprtc_lib_t; + +typedef hc_hiprtc_lib_t HIPRTC_PTR; + +int hiprtc_make_options_array_from_string (char *string, char **options); + +#endif // _EXT_HIPRTC_H
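To make the create -> compile -> fetch cycle described for hiprtcProgram above concrete, a minimal sketch against this table; hiprtc is assumed to be a populated hc_hiprtc_lib_t pointer, kernel_source a hypothetical source string, hcmalloc is hashcat's allocator, error handling elided:

  hiprtcProgram prog;

  hiprtc->hiprtcCreateProgram (&prog, kernel_source, "kernel.cl", 0, NULL, NULL);

  hiprtc->hiprtcCompileProgram (prog, 0, NULL);

  size_t code_size = 0;

  hiprtc->hiprtcGetCodeSize (prog, &code_size);

  char *code = (char *) hcmalloc (code_size);

  hiprtc->hiprtcGetCode (prog, code);

  hiprtc->hiprtcDestroyProgram (&prog);

diff --git a/include/types.h b/include/types.h index 05b427b9f..831af5a20 100644 --- a/include/types.h +++ b/include/types.h @@ -616,6 +616,7 @@ typedef enum user_options_defaults MARKOV_THRESHOLD = 0, NONCE_ERROR_CORRECTIONS = 8, BACKEND_IGNORE_CUDA = false, + BACKEND_IGNORE_HIP = false, BACKEND_IGNORE_OPENCL = false, BACKEND_INFO = false, BACKEND_VECTOR_WIDTH = 0, @@ -666,6 +667,7 @@ typedef enum user_options_map IDX_ATTACK_MODE = 'a', IDX_BACKEND_DEVICES = 'd', IDX_BACKEND_IGNORE_CUDA = 0xff01, + IDX_BACKEND_IGNORE_HIP = 0xff4d, IDX_BACKEND_IGNORE_OPENCL = 0xff02, IDX_BACKEND_INFO = 'I', IDX_BACKEND_VECTOR_WIDTH = 0xff03, @@ -1045,7 +1047,10 @@ typedef struct hc_fp } HCFILE; #include "ext_nvrtc.h" +#include "ext_hiprtc.h" + #include "ext_cuda.h" +#include "ext_hip.h" #include "ext_OpenCL.h" typedef struct hc_device_param @@ -1427,6 +1432,85 @@ typedef struct hc_device_param CUdeviceptr cuda_d_st_salts_buf; CUdeviceptr cuda_d_st_esalts_buf; + // API: hip + + bool is_hip; + + int hip_warp_size; + + HIPdevice hip_device; + HIPcontext hip_context; + HIPstream hip_stream; + + HIPevent hip_event1; + HIPevent hip_event2; + + HIPmodule hip_module; + HIPmodule hip_module_shared; + HIPmodule hip_module_mp; + HIPmodule hip_module_amp; + + HIPfunction hip_function1; + HIPfunction hip_function12; + HIPfunction hip_function2; + HIPfunction hip_function2e; + HIPfunction hip_function23; + HIPfunction hip_function3; + HIPfunction hip_function4; + HIPfunction hip_function_init2; + 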
HIPfunction hip_function_loop2; + HIPfunction hip_function_mp; + HIPfunction hip_function_mp_l; + HIPfunction hip_function_mp_r; + HIPfunction hip_function_amp; + HIPfunction hip_function_tm; + HIPfunction hip_function_memset; + HIPfunction hip_function_atinit; + HIPfunction hip_function_decompress; + HIPfunction hip_function_aux1; + HIPfunction hip_function_aux2; + HIPfunction hip_function_aux3; + HIPfunction hip_function_aux4; + + HIPdeviceptr hip_d_pws_buf; + HIPdeviceptr hip_d_pws_amp_buf; + HIPdeviceptr hip_d_pws_comp_buf; + HIPdeviceptr hip_d_pws_idx; + HIPdeviceptr hip_d_words_buf_l; + HIPdeviceptr hip_d_words_buf_r; + HIPdeviceptr hip_d_rules; + HIPdeviceptr hip_d_rules_c; + HIPdeviceptr hip_d_combs; + HIPdeviceptr hip_d_combs_c; + HIPdeviceptr hip_d_bfs; + HIPdeviceptr hip_d_bfs_c; + HIPdeviceptr hip_d_tm_c; + HIPdeviceptr hip_d_bitmap_s1_a; + HIPdeviceptr hip_d_bitmap_s1_b; + HIPdeviceptr hip_d_bitmap_s1_c; + HIPdeviceptr hip_d_bitmap_s1_d; + HIPdeviceptr hip_d_bitmap_s2_a; + HIPdeviceptr hip_d_bitmap_s2_b; + HIPdeviceptr hip_d_bitmap_s2_c; + HIPdeviceptr hip_d_bitmap_s2_d; + HIPdeviceptr hip_d_plain_bufs; + HIPdeviceptr hip_d_digests_buf; + HIPdeviceptr hip_d_digests_shown; + HIPdeviceptr hip_d_salt_bufs; + HIPdeviceptr hip_d_esalt_bufs; + HIPdeviceptr hip_d_tmps; + HIPdeviceptr hip_d_hooks; + HIPdeviceptr hip_d_result; + HIPdeviceptr hip_d_extra0_buf; + HIPdeviceptr hip_d_extra1_buf; + HIPdeviceptr hip_d_extra2_buf; + HIPdeviceptr hip_d_extra3_buf; + HIPdeviceptr hip_d_root_css_buf; + HIPdeviceptr hip_d_markov_css_buf; + HIPdeviceptr hip_d_st_digests_buf; + HIPdeviceptr hip_d_st_salts_buf; + HIPdeviceptr hip_d_st_esalts_buf; + // API: opencl bool is_opencl; @@ -1519,9 +1603,13 @@ typedef struct backend_ctx void *ocl; void *cuda; + void *hip; + void *nvrtc; + void *hiprtc; int backend_device_from_cuda[DEVICES_MAX]; // from cuda device index to backend device index + int backend_device_from_hip[DEVICES_MAX]; // from hip device index to backend device index int backend_device_from_opencl[DEVICES_MAX]; // from opencl device index to backend device index int backend_device_from_opencl_platform[CL_PLATFORMS_MAX][DEVICES_MAX]; // from opencl device index to backend device index (by platform) @@ -1529,6 +1617,8 @@ int backend_devices_active; int cuda_devices_cnt; int cuda_devices_active; + int hip_devices_cnt; + int hip_devices_active; int opencl_devices_cnt; int opencl_devices_active; @@ -1557,6 +1647,11 @@ int nvrtc_driver_version; int cuda_driver_version; + // hip + + int hiprtc_driver_version; + int hip_driver_version; + // opencl cl_platform_id *opencl_platforms; @@ -1947,6 +2042,7 @@ typedef struct user_options bool markov_classic; bool markov_disable; bool backend_ignore_cuda; + bool backend_ignore_hip; bool backend_ignore_opencl; bool backend_info; bool optimized_kernel_enable; diff --git a/src/Makefile b/src/Makefile index c24414566..6a4f0e487 100644 --- a/src/Makefile +++ b/src/Makefile @@ -4,7 +4,7 @@ ## SHARED ?= 0 -DEBUG := 0 +DEBUG := 1 PRODUCTION := 1 PRODUCTION_VERSION := v6.1.1 ENABLE_CUBIN ?= 1 @@ -309,7 +309,7 @@ EMU_OBJS_ALL += emu_inc_truecrypt_crc32 emu_inc_truecrypt_keyfile emu EMU_OBJS_ALL += emu_inc_hash_md4 emu_inc_hash_md5 emu_inc_hash_ripemd160 emu_inc_hash_sha1 emu_inc_hash_sha256 emu_inc_hash_sha384 emu_inc_hash_sha512 emu_inc_hash_streebog256 emu_inc_hash_streebog512 emu_inc_ecc_secp256k1 EMU_OBJS_ALL += emu_inc_cipher_aes emu_inc_cipher_camellia emu_inc_cipher_des emu_inc_cipher_kuznyechik
emu_inc_cipher_serpent emu_inc_cipher_twofish -OBJS_ALL := affinity autotune backend benchmark bitmap bitops combinator common convert cpt cpu_crc32 debugfile dictstat dispatch dynloader event ext_ADL ext_cuda ext_nvapi ext_nvml ext_nvrtc ext_OpenCL ext_sysfs ext_lzma filehandling folder hashcat hashes hlfmt hwmon induct interface keyboard_layout locking logfile loopback memory monitor mpsp outfile_check outfile pidfile potfile restore rp rp_cpu selftest slow_candidates shared status stdout straight terminal thread timer tuningdb usage user_options wordlist $(EMU_OBJS_ALL) +OBJS_ALL := affinity autotune backend benchmark bitmap bitops combinator common convert cpt cpu_crc32 debugfile dictstat dispatch dynloader event ext_ADL ext_cuda ext_hip ext_nvapi ext_nvml ext_nvrtc ext_hiprtc ext_OpenCL ext_sysfs ext_lzma filehandling folder hashcat hashes hlfmt hwmon induct interface keyboard_layout locking logfile loopback memory monitor mpsp outfile_check outfile pidfile potfile restore rp rp_cpu selftest slow_candidates shared status stdout straight terminal thread timer tuningdb usage user_options wordlist $(EMU_OBJS_ALL) ifeq ($(ENABLE_BRAIN),1) OBJS_ALL += brain diff --git a/src/backend.c b/src/backend.c index 58aa8094f..b51da8968 100644 --- a/src/backend.c +++ b/src/backend.c @@ -54,6 +54,8 @@ static bool is_same_device (const hc_device_param_t *src, const hc_device_param_ if ((src->is_cuda == true) && (dst->is_cuda == true)) return false; + if ((src->is_hip == true) && (dst->is_hip == true)) return false; + // But OpenCL can have aliases if ((src->is_opencl == true) && (dst->is_opencl == true)) @@ -116,7 +118,7 @@ static int backend_ctx_find_alias_devices (hashcat_ctx_t *hashcat_ctx) // this lets CUDA devices survive over OpenCL - if (alias_device->is_cuda == true) continue; + if ((alias_device->is_cuda == true) || (alias_device->is_hip == true)) continue; // this lets native OpenCL runtime survive over generic OpenCL runtime @@ -141,6 +143,7 @@ static bool is_same_device_type (const hc_device_param_t *src, const hc_device_p if (strcmp (src->device_name, dst->device_name) != 0) return false; if (src->is_cuda != dst->is_cuda) return false; + if (src->is_hip != dst->is_hip) return false; if (src->is_opencl != dst->is_opencl) return false; if (strcmp (src->device_name, dst->device_name) != 0) return false; @@ -779,6 +782,45 @@ int nvrtc_init (hashcat_ctx_t *hashcat_ctx) return 0; } +// HIPRTC + +int hiprtc_init (hashcat_ctx_t *hashcat_ctx) +{ + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; + + HIPRTC_PTR *hiprtc = (HIPRTC_PTR *) backend_ctx->hiprtc; + + memset (hiprtc, 0, sizeof (HIPRTC_PTR)); + + #if defined (_WIN) + hiprtc->lib = hc_dlopen ("fixme.dll"); + #elif defined (__APPLE__) + hiprtc->lib = hc_dlopen ("fixme.dylib"); + #elif defined (__CYGWIN__) + hiprtc->lib = hc_dlopen ("fixme.dll"); + #else + hiprtc->lib = hc_dlopen ("libamdhip64.so"); + + if (hiprtc->lib == NULL) hiprtc->lib = hc_dlopen ("libamdhip64.so.4"); + #endif + + if (hiprtc->lib == NULL) return -1; + + HC_LOAD_FUNC (hiprtc, hiprtcAddNameExpression, HIPRTC_HIPRTCADDNAMEEXPRESSION, HIPRTC, 1); + HC_LOAD_FUNC (hiprtc, hiprtcCompileProgram, HIPRTC_HIPRTCCOMPILEPROGRAM, HIPRTC, 1); + HC_LOAD_FUNC (hiprtc, hiprtcCreateProgram, HIPRTC_HIPRTCCREATEPROGRAM, HIPRTC, 1); + HC_LOAD_FUNC (hiprtc, hiprtcDestroyProgram, HIPRTC_HIPRTCDESTROYPROGRAM, HIPRTC, 1); + HC_LOAD_FUNC (hiprtc, hiprtcGetLoweredName, HIPRTC_HIPRTCGETLOWEREDNAME, HIPRTC, 1); + HC_LOAD_FUNC (hiprtc, hiprtcGetCode, HIPRTC_HIPRTCGETPTX, HIPRTC, 1); + HC_LOAD_FUNC 
(hiprtc, hiprtcGetCodeSize, HIPRTC_HIPRTCGETPTXSIZE, HIPRTC, 1); + HC_LOAD_FUNC (hiprtc, hiprtcGetProgramLog, HIPRTC_HIPRTCGETPROGRAMLOG, HIPRTC, 1); + HC_LOAD_FUNC (hiprtc, hiprtcGetProgramLogSize, HIPRTC_HIPRTCGETPROGRAMLOGSIZE, HIPRTC, 1); + HC_LOAD_FUNC (hiprtc, hiprtcGetErrorString, HIPRTC_HIPRTCGETERRORSTRING, HIPRTC, 1); + HC_LOAD_FUNC (hiprtc, hiprtcVersion, HIPRTC_HIPRTCVERSION, HIPRTC, 1); + + return 0; +} + void nvrtc_close (hashcat_ctx_t *hashcat_ctx) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; @@ -798,6 +840,25 @@ void nvrtc_close (hashcat_ctx_t *hashcat_ctx) } } +void hiprtc_close (hashcat_ctx_t *hashcat_ctx) +{ + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; + + HIPRTC_PTR *hiprtc = (HIPRTC_PTR *) backend_ctx->hiprtc; + + if (hiprtc) + { + if (hiprtc->lib) + { + hc_dlclose (hiprtc->lib); + } + + hcfree (backend_ctx->hiprtc); + + backend_ctx->hiprtc = NULL; + } +} + int hc_nvrtcCreateProgram (hashcat_ctx_t *hashcat_ctx, nvrtcProgram *prog, const char *src, const char *name, int numHeaders, const char * const *headers, const char * const *includeNames) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; @@ -816,6 +877,24 @@ int hc_nvrtcCreateProgram (hashcat_ctx_t *hashcat_ctx, nvrtcProgram *prog, const return 0; } +int hc_hiprtcCreateProgram (hashcat_ctx_t *hashcat_ctx, hiprtcProgram *prog, const char *src, const char *name, int numHeaders, const char * const *headers, const char * const *includeNames) +{ + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; + + HIPRTC_PTR *hiprtc = (HIPRTC_PTR *) backend_ctx->hiprtc; + + const hiprtcResult HIPRTC_err = hiprtc->hiprtcCreateProgram (prog, src, name, numHeaders, headers, includeNames); + + if (HIPRTC_err != HIPRTC_SUCCESS) + { + event_log_error (hashcat_ctx, "hiprtcCreateProgram(): %s", hiprtc->hiprtcGetErrorString (HIPRTC_err)); + + return -1; + } + + return 0; +} + int hc_nvrtcDestroyProgram (hashcat_ctx_t *hashcat_ctx, nvrtcProgram *prog) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; @@ -834,6 +913,24 @@ int hc_nvrtcDestroyProgram (hashcat_ctx_t *hashcat_ctx, nvrtcProgram *prog) return 0; } +int hc_hiprtcDestroyProgram (hashcat_ctx_t *hashcat_ctx, hiprtcProgram *prog) +{ + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; + + HIPRTC_PTR *hiprtc = (HIPRTC_PTR *) backend_ctx->hiprtc; + + const hiprtcResult HIPRTC_err = hiprtc->hiprtcDestroyProgram (prog); + + if (HIPRTC_err != HIPRTC_SUCCESS) + { + event_log_error (hashcat_ctx, "hiprtcDestroyProgram(): %s", hiprtc->hiprtcGetErrorString (HIPRTC_err)); + + return -1; + } + + return 0; +} + int hc_nvrtcCompileProgram (hashcat_ctx_t *hashcat_ctx, nvrtcProgram prog, int numOptions, const char * const *options) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; @@ -852,6 +949,27 @@ int hc_nvrtcCompileProgram (hashcat_ctx_t *hashcat_ctx, nvrtcProgram prog, int n return 0; } +int hc_hiprtcCompileProgram (hashcat_ctx_t *hashcat_ctx, hiprtcProgram prog, int numOptions, const char * const *options) +{ + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; + + HIPRTC_PTR *hiprtc = (HIPRTC_PTR *) backend_ctx->hiprtc; +#if 0 + for(int i =0; i< numOptions; i++) + printf("Option_%d = %s\n", i, options[i]); +#endif + const hiprtcResult HIPRTC_err = hiprtc->hiprtcCompileProgram (prog, numOptions, options); + + if (HIPRTC_err != HIPRTC_SUCCESS) + { + event_log_error (hashcat_ctx, "hiprtcCompileProgram(): %s", hiprtc->hiprtcGetErrorString (HIPRTC_err)); + + return -1; + } + + return 0; +} + int hc_nvrtcGetProgramLogSize (hashcat_ctx_t 
*hashcat_ctx, nvrtcProgram prog, size_t *logSizeRet) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; @@ -870,6 +988,24 @@ int hc_nvrtcGetProgramLogSize (hashcat_ctx_t *hashcat_ctx, nvrtcProgram prog, si return 0; } +int hc_hiprtcGetProgramLogSize (hashcat_ctx_t *hashcat_ctx, hiprtcProgram prog, size_t *logSizeRet) +{ + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; + + HIPRTC_PTR *hiprtc = (HIPRTC_PTR *) backend_ctx->hiprtc; + + const hiprtcResult HIPRTC_err = hiprtc->hiprtcGetProgramLogSize (prog, logSizeRet); + + if (HIPRTC_err != HIPRTC_SUCCESS) + { + event_log_error (hashcat_ctx, "hiprtcGetProgramLogSize(): %s", hiprtc->hiprtcGetErrorString (HIPRTC_err)); + + return -1; + } + + return 0; +} + int hc_nvrtcGetProgramLog (hashcat_ctx_t *hashcat_ctx, nvrtcProgram prog, char *log) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; @@ -888,6 +1024,24 @@ int hc_nvrtcGetProgramLog (hashcat_ctx_t *hashcat_ctx, nvrtcProgram prog, char * return 0; } +int hc_hiprtcGetProgramLog (hashcat_ctx_t *hashcat_ctx, hiprtcProgram prog, char *log) +{ + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; + + HIPRTC_PTR *hiprtc = (HIPRTC_PTR *) backend_ctx->hiprtc; + + const hiprtcResult HIPRTC_err = hiprtc->hiprtcGetProgramLog (prog, log); + + if (HIPRTC_err != HIPRTC_SUCCESS) + { + event_log_error (hashcat_ctx, "hiprtcGetProgramLog(): %s", hiprtc->hiprtcGetErrorString (HIPRTC_err)); + + return -1; + } + + return 0; +} + int hc_nvrtcGetPTXSize (hashcat_ctx_t *hashcat_ctx, nvrtcProgram prog, size_t *ptxSizeRet) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; @@ -906,6 +1060,24 @@ int hc_nvrtcGetPTXSize (hashcat_ctx_t *hashcat_ctx, nvrtcProgram prog, size_t *p return 0; } +int hc_hiprtcGetCodeSize (hashcat_ctx_t *hashcat_ctx, hiprtcProgram prog, size_t *ptxSizeRet) +{ + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; + + HIPRTC_PTR *hiprtc = (HIPRTC_PTR *) backend_ctx->hiprtc; + + const hiprtcResult HIPRTC_err = hiprtc->hiprtcGetCodeSize (prog, ptxSizeRet); + + if (HIPRTC_err != HIPRTC_SUCCESS) + { + event_log_error (hashcat_ctx, "hiprtcGetCodeSize(): %s", hiprtc->hiprtcGetErrorString (HIPRTC_err)); + + return -1; + } + + return 0; +} + int hc_nvrtcGetPTX (hashcat_ctx_t *hashcat_ctx, nvrtcProgram prog, char *ptx) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; @@ -924,6 +1096,24 @@ int hc_nvrtcGetPTX (hashcat_ctx_t *hashcat_ctx, nvrtcProgram prog, char *ptx) return 0; } +int hc_hiprtcGetCode (hashcat_ctx_t *hashcat_ctx, hiprtcProgram prog, char *ptx) +{ + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; + + HIPRTC_PTR *hiprtc = (HIPRTC_PTR *) backend_ctx->hiprtc; + + const hiprtcResult HIPRTC_err = hiprtc->hiprtcGetCode (prog, ptx); + + if (HIPRTC_err != HIPRTC_SUCCESS) + { + event_log_error (hashcat_ctx, "hiprtcGetCode(): %s", hiprtc->hiprtcGetErrorString (HIPRTC_err)); + + return -1; + } + + return 0; +} + int hc_nvrtcVersion (hashcat_ctx_t *hashcat_ctx, int *major, int *minor) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; @@ -942,6 +1132,24 @@ int hc_nvrtcVersion (hashcat_ctx_t *hashcat_ctx, int *major, int *minor) return 0; } +int hc_hiprtcVersion (hashcat_ctx_t *hashcat_ctx, int *major, int *minor) +{ + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; + + HIPRTC_PTR *hiprtc = (HIPRTC_PTR *) backend_ctx->hiprtc; + + const hiprtcResult HIPRTC_err = hiprtc->hiprtcVersion (major, minor); + + if (HIPRTC_err != HIPRTC_SUCCESS) + { + event_log_error (hashcat_ctx, "hiprtcVersion(): %s", hiprtc->hiprtcGetErrorString (HIPRTC_err)); + + return 
-1; + } + + return 0; +} + // CUDA int cuda_init (hashcat_ctx_t *hashcat_ctx) @@ -1050,6 +1258,116 @@ int cuda_init (hashcat_ctx_t *hashcat_ctx) return 0; } +// HIP + +int hip_init (hashcat_ctx_t *hashcat_ctx) +{ + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; + + HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip; + + memset (hip, 0, sizeof (HIP_PTR)); + + #if defined (_WIN) + hip->lib = hc_dlopen ("fixme.dll"); + #elif defined (__APPLE__) + hip->lib = hc_dlopen ("fixme.dylib"); + #elif defined (__CYGWIN__) + hip->lib = hc_dlopen ("fixme.dll"); + #else + hip->lib = hc_dlopen ("libamdhip64.so"); + + //TODO: grab the 4 from the major RT version + if (hip->lib == NULL) hip->lib = hc_dlopen ("libamdhip64.so.4.2.40200"); + #endif + + if (hip->lib == NULL) return -1; + + // resolve every entry point via hc_dlsym (), with per-symbol control over whether a miss is fatal + #define HC_LOAD_FUNC_HIP(ptr,name,hipname,type,libname,noerr) \ + do { \ + ptr->name = (type) hc_dlsym ((ptr)->lib, #hipname); \ + if ((noerr) != -1) { \ + if (!(ptr)->name) { \ + if ((noerr) == 1) { \ + event_log_error (hashcat_ctx, "%s is missing from %s shared library.", #name, #libname); \ + return -1; \ + } \ + if ((noerr) != 1) { \ + event_log_warning (hashcat_ctx, "%s is missing from %s shared library.", #name, #libname); \ + return 0; \ + } \ + } \ + } \ + } while (0) + + // finding the right symbol is a PITA, because several entry points carry different names in the HIP runtime + // than their CUDA driver API counterparts (see the hipGetDeviceCount, hipModuleLaunchKernel and hipMalloc mappings below) + // a good reference is hip_runtime_api.h itself + // this needs to be verified for each new HIP release + + HC_LOAD_FUNC_HIP (hip, hipCtxCreate, hipCtxCreate, HIP_HIPCTXCREATE, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipCtxDestroy, hipCtxDestroy, HIP_HIPCTXDESTROY, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipCtxGetCacheConfig, hipCtxGetCacheConfig, HIP_HIPCTXGETCACHECONFIG, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipCtxGetCurrent, hipCtxGetCurrent, HIP_HIPCTXGETCURRENT, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipCtxGetSharedMemConfig, hipCtxGetSharedMemConfig, HIP_HIPCTXGETSHAREDMEMCONFIG, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipCtxPopCurrent, hipCtxPopCurrent, HIP_HIPCTXPOPCURRENT, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipCtxPushCurrent, hipCtxPushCurrent, HIP_HIPCTXPUSHCURRENT, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipCtxSetCacheConfig, hipCtxSetCacheConfig, HIP_HIPCTXSETCACHECONFIG, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipCtxSetCurrent, hipCtxSetCurrent, HIP_HIPCTXSETCURRENT, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipCtxSetSharedMemConfig, hipCtxSetSharedMemConfig, HIP_HIPCTXSETSHAREDMEMCONFIG, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipCtxSynchronize, hipCtxSynchronize, HIP_HIPCTXSYNCHRONIZE, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipDeviceGetAttribute, hipDeviceGetAttribute, HIP_HIPDEVICEGETATTRIBUTE, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipDeviceGetCount, hipGetDeviceCount, HIP_HIPDEVICEGETCOUNT, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipDeviceGet, hipDeviceGet, HIP_HIPDEVICEGET, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipDeviceGetName, hipDeviceGetName, HIP_HIPDEVICEGETNAME, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipDeviceTotalMem, hipDeviceTotalMem, HIP_HIPDEVICETOTALMEM, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipDriverGetVersion, hipDriverGetVersion, HIP_HIPDRIVERGETVERSION, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipEventCreate, hipEventCreateWithFlags, HIP_HIPEVENTCREATE, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipEventDestroy, hipEventDestroy, HIP_HIPEVENTDESTROY, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipEventElapsedTime, hipEventElapsedTime, HIP_HIPEVENTELAPSEDTIME, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipEventQuery, hipEventQuery, HIP_HIPEVENTQUERY, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipEventRecord, hipEventRecord, HIP_HIPEVENTRECORD, 
HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipEventSynchronize, hipEventSynchronize, HIP_HIPEVENTSYNCHRONIZE, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipFuncGetAttribute, hipFuncGetAttribute, HIP_HIPFUNCGETATTRIBUTE, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipFuncSetAttribute, hipFuncSetAttribute, HIP_HIPFUNCSETATTRIBUTE, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipFuncSetCacheConfig, hipFuncSetCacheConfig, HIP_HIPFUNCSETCACHECONFIG, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipFuncSetSharedMemConfig, hipFuncSetSharedMemConfig, HIP_HIPFUNCSETSHAREDMEMCONFIG, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipGetErrorName, hipGetErrorName, HIP_HIPGETERRORNAME, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipGetErrorString, hipGetErrorString, HIP_HIPGETERRORSTRING, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipInit, hipInit, HIP_HIPINIT, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipLaunchKernel, hipModuleLaunchKernel, HIP_HIPLAUNCHKERNEL, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipMemAlloc, hipMalloc, HIP_HIPMEMALLOC, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipMemAllocHost, hipMemAllocHost, HIP_HIPMEMALLOCHOST, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipMemcpyDtoD, hipMemcpyDtoD, HIP_HIPMEMCPYDTOD, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipMemcpyDtoH, hipMemcpyDtoH, HIP_HIPMEMCPYDTOH, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipMemcpyHtoD, hipMemcpyHtoD, HIP_HIPMEMCPYHTOD, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipMemFree, hipFree, HIP_HIPMEMFREE, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipMemFreeHost, hipFreeHost, HIP_HIPMEMFREEHOST, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipMemGetInfo, hipMemGetInfo, HIP_HIPMEMGETINFO, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipMemsetD32, hipMemsetD32, HIP_HIPMEMSETD32, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipMemsetD8, hipMemsetD8, HIP_HIPMEMSETD8, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipModuleGetFunction, hipModuleGetFunction, HIP_HIPMODULEGETFUNCTION, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipModuleGetGlobal, hipModuleGetGlobal, HIP_HIPMODULEGETGLOBAL, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipModuleLoad, hipModuleLoad, HIP_HIPMODULELOAD, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipModuleLoadData, hipModuleLoadData, HIP_HIPMODULELOADDATA, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipModuleLoadDataEx, hipModuleLoadDataEx, HIP_HIPMODULELOADDATAEX, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipModuleUnload, hipModuleUnload, HIP_HIPMODULEUNLOAD, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipProfilerStart, hipProfilerStart, HIP_HIPPROFILERSTART, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipProfilerStop, hipProfilerStop, HIP_HIPPROFILERSTOP, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipStreamCreate, hipStreamCreate, HIP_HIPSTREAMCREATE, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipStreamDestroy, hipStreamDestroy, HIP_HIPSTREAMDESTROY, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipStreamSynchronize, hipStreamSynchronize, HIP_HIPSTREAMSYNCHRONIZE, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipStreamWaitEvent, hipStreamWaitEvent, HIP_HIPSTREAMWAITEVENT, HIP, 1); + #if defined (WITH_CUBINX) + HC_LOAD_FUNC_HIP (hip, hipLinkCreate, hipLinkCreate, HIP_HIPLINKCREATE, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipLinkAddData, hipLinkAddData, HIP_HIPLINKADDDATA, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipLinkDestroy, hipLinkDestroy, HIP_HIPLINKDESTROY, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipLinkComplete, hipLinkComplete, HIP_HIPLINKCOMPLETE, HIP, 1); + #endif + + return 0; +} +
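A minimal sketch, not part of the patch, of the first calls that exercise the freshly loaded table once hip_init() has returned 0; backend_ctx is assumed to be in scope:

  HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip;

  if (hip->hipInit (0) == HIP_SUCCESS) // dispatches to hipInit ()
  {
    int hip_devices_cnt = 0;

    hip->hipDeviceGetCount (&hip_devices_cnt); // dispatches to hipGetDeviceCount ()
  }

void cuda_close (hashcat_ctx_t *hashcat_ctx) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; @@ -1069,6 +1387,25 @@ void cuda_close (hashcat_ctx_t *hashcat_ctx) } } +void hip_close (hashcat_ctx_t *hashcat_ctx) +{ + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; + + HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip; + + 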
if (hip) + { + if (hip->lib) + { + hc_dlclose (hip->lib); + } + + hcfree (backend_ctx->hip); + + backend_ctx->hip = NULL; + } +} + int hc_cuInit (hashcat_ctx_t *hashcat_ctx, unsigned int Flags) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; @@ -1096,25 +1433,25 @@ int hc_cuInit (hashcat_ctx_t *hashcat_ctx, unsigned int Flags) return 0; } -int hc_cuDeviceGetAttribute (hashcat_ctx_t *hashcat_ctx, int *pi, CUdevice_attribute attrib, CUdevice dev) +int hc_hipInit (hashcat_ctx_t *hashcat_ctx, unsigned int Flags) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; + HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip; - const CUresult CU_err = cuda->cuDeviceGetAttribute (pi, attrib, dev); + const HIPresult HIP_err = hip->hipInit (Flags); - if (CU_err != CUDA_SUCCESS) + if (HIP_err != HIP_SUCCESS) { const char *pStr = NULL; - if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + if (hip->hipGetErrorString (HIP_err, &pStr) == HIP_SUCCESS) { - event_log_error (hashcat_ctx, "cuDeviceGetAttribute(): %s", pStr); + event_log_error (hashcat_ctx, "hipInit(): %s", pStr); } else { - event_log_error (hashcat_ctx, "cuDeviceGetAttribute(): %d", CU_err); + event_log_error (hashcat_ctx, "hipInit(): %d", HIP_err); } return -1; @@ -1123,13 +1460,13 @@ return 0; } -int hc_cuDeviceGetCount (hashcat_ctx_t *hashcat_ctx, int *count) +int hc_cuDeviceGetAttribute (hashcat_ctx_t *hashcat_ctx, int *pi, CUdevice_attribute attrib, CUdevice dev) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - const CUresult CU_err = cuda->cuDeviceGetCount (count); + const CUresult CU_err = cuda->cuDeviceGetAttribute (pi, attrib, dev); if (CU_err != CUDA_SUCCESS) { @@ -1137,11 +1474,11 @@ if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) { - event_log_error (hashcat_ctx, "cuDeviceGetCount(): %s", pStr); + event_log_error (hashcat_ctx, "cuDeviceGetAttribute(): %s", pStr); } else { - event_log_error (hashcat_ctx, "cuDeviceGetCount(): %d", CU_err); + event_log_error (hashcat_ctx, "cuDeviceGetAttribute(): %d", CU_err); } return -1; @@ -1150,25 +1487,26 @@ return 0; } -int hc_cuDeviceGet (hashcat_ctx_t *hashcat_ctx, CUdevice* device, int ordinal) +int hc_hipDeviceGetAttribute (hashcat_ctx_t *hashcat_ctx, int *pi, HIPdevice_attribute attrib, HIPdevice dev) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; + HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip; - const CUresult CU_err = cuda->cuDeviceGet (device, ordinal); + // some CUDA attributes have no HIP equivalent: callers pass -1 to skip the query, leaving *pi untouched + if (attrib == -1) return 0; + const HIPresult HIP_err = hip->hipDeviceGetAttribute (pi, attrib, dev); - if (CU_err != CUDA_SUCCESS) + if (HIP_err != HIP_SUCCESS) { const char *pStr = NULL; - if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + if (hip->hipGetErrorString (HIP_err, &pStr) == HIP_SUCCESS) { - event_log_error (hashcat_ctx, "cuDeviceGet(): %s", pStr); + event_log_error (hashcat_ctx, "hipDeviceGetAttribute(): %s", pStr); } else { - event_log_error (hashcat_ctx, "cuDeviceGet(): %d", CU_err); + event_log_error (hashcat_ctx, "hipDeviceGetAttribute(): %d", HIP_err); } return -1;
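Caller side, that sentinel keeps CUDA-only attribute queries harmless; a sketch, assuming a device_param in scope:

  int value = 0; // stays 0 when the attribute has no HIP mapping

  hc_hipDeviceGetAttribute (hashcat_ctx, &value, (HIPdevice_attribute) -1, device_param->hip_device);

@@ -1177,13 +1515,13 @@ int hc_cuDeviceGet (hashcat_ctx_t *hashcat_ctx, CUdevice* device, int ordinal) return 0; } -int hc_cuDeviceGetName 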
(hashcat_ctx_t *hashcat_ctx, char *name, int len, CUdevice dev) +int hc_cuDeviceGetCount (hashcat_ctx_t *hashcat_ctx, int *count) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - const CUresult CU_err = cuda->cuDeviceGetName (name, len, dev); + const CUresult CU_err = cuda->cuDeviceGetCount (count); if (CU_err != CUDA_SUCCESS) { @@ -1191,11 +1529,11 @@ int hc_cuDeviceGetName (hashcat_ctx_t *hashcat_ctx, char *name, int len, CUdevic if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) { - event_log_error (hashcat_ctx, "cuDeviceGetName(): %s", pStr); + event_log_error (hashcat_ctx, "cuDeviceGetCount(): %s", pStr); } else { - event_log_error (hashcat_ctx, "cuDeviceGetName(): %d", CU_err); + event_log_error (hashcat_ctx, "cuDeviceGetCount(): %d", CU_err); } return -1; @@ -1204,25 +1542,25 @@ int hc_cuDeviceGetName (hashcat_ctx_t *hashcat_ctx, char *name, int len, CUdevic return 0; } -int hc_cuDeviceTotalMem (hashcat_ctx_t *hashcat_ctx, size_t *bytes, CUdevice dev) +int hc_hipDeviceGetCount (hashcat_ctx_t *hashcat_ctx, int *count) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; + HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip; - const CUresult CU_err = cuda->cuDeviceTotalMem (bytes, dev); + const HIPresult HIP_err = hip->hipDeviceGetCount (count); - if (CU_err != CUDA_SUCCESS) + if (HIP_err != HIP_SUCCESS) { const char *pStr = NULL; - if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + if (hip->hipGetErrorString (HIP_err, &pStr) == HIP_SUCCESS) { - event_log_error (hashcat_ctx, "cuDeviceTotalMem(): %s", pStr); + event_log_error (hashcat_ctx, "hipDeviceGetCount(): %s", pStr); } else { - event_log_error (hashcat_ctx, "cuDeviceTotalMem(): %d", CU_err); + event_log_error (hashcat_ctx, "hipDeviceGetCount(): %d", HIP_err); } return -1; @@ -1231,13 +1569,13 @@ int hc_cuDeviceTotalMem (hashcat_ctx_t *hashcat_ctx, size_t *bytes, CUdevice dev return 0; } -int hc_cuDriverGetVersion (hashcat_ctx_t *hashcat_ctx, int *driverVersion) +int hc_cuDeviceGet (hashcat_ctx_t *hashcat_ctx, CUdevice* device, int ordinal) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - const CUresult CU_err = cuda->cuDriverGetVersion (driverVersion); + const CUresult CU_err = cuda->cuDeviceGet (device, ordinal); if (CU_err != CUDA_SUCCESS) { @@ -1245,11 +1583,11 @@ int hc_cuDriverGetVersion (hashcat_ctx_t *hashcat_ctx, int *driverVersion) if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) { - event_log_error (hashcat_ctx, "cuDriverGetVersion(): %s", pStr); + event_log_error (hashcat_ctx, "cuDeviceGet(): %s", pStr); } else { - event_log_error (hashcat_ctx, "cuDriverGetVersion(): %d", CU_err); + event_log_error (hashcat_ctx, "cuDeviceGet(): %d", CU_err); } return -1; @@ -1258,25 +1596,25 @@ int hc_cuDriverGetVersion (hashcat_ctx_t *hashcat_ctx, int *driverVersion) return 0; } -int hc_cuCtxCreate (hashcat_ctx_t *hashcat_ctx, CUcontext *pctx, unsigned int flags, CUdevice dev) +int hc_hipDeviceGet (hashcat_ctx_t *hashcat_ctx, HIPdevice* device, int ordinal) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; + HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip; - const CUresult CU_err = cuda->cuCtxCreate (pctx, flags, dev); + const HIPresult HIP_err = hip->hipDeviceGet (device, ordinal); - if (CU_err != CUDA_SUCCESS) + if (HIP_err != HIP_SUCCESS) { const char *pStr = NULL; - if 
(cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + if (hip->hipGetErrorString (HIP_err, &pStr) == HIP_SUCCESS) { - event_log_error (hashcat_ctx, "cuCtxCreate(): %s", pStr); + event_log_error (hashcat_ctx, "hipDeviceGet(): %s", pStr); } else { - event_log_error (hashcat_ctx, "cuCtxCreate(): %d", CU_err); + event_log_error (hashcat_ctx, "hipDeviceGet(): %d", HIP_err); } return -1; @@ -1285,13 +1623,13 @@ int hc_cuCtxCreate (hashcat_ctx_t *hashcat_ctx, CUcontext *pctx, unsigned int fl return 0; } -int hc_cuCtxDestroy (hashcat_ctx_t *hashcat_ctx, CUcontext ctx) +int hc_cuDeviceGetName (hashcat_ctx_t *hashcat_ctx, char *name, int len, CUdevice dev) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - const CUresult CU_err = cuda->cuCtxDestroy (ctx); + const CUresult CU_err = cuda->cuDeviceGetName (name, len, dev); if (CU_err != CUDA_SUCCESS) { @@ -1299,11 +1637,11 @@ int hc_cuCtxDestroy (hashcat_ctx_t *hashcat_ctx, CUcontext ctx) if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) { - event_log_error (hashcat_ctx, "cuCtxDestroy(): %s", pStr); + event_log_error (hashcat_ctx, "cuDeviceGetName(): %s", pStr); } else { - event_log_error (hashcat_ctx, "cuCtxDestroy(): %d", CU_err); + event_log_error (hashcat_ctx, "cuDeviceGetName(): %d", CU_err); } return -1; @@ -1312,25 +1650,25 @@ int hc_cuCtxDestroy (hashcat_ctx_t *hashcat_ctx, CUcontext ctx) return 0; } -int hc_cuModuleLoadDataEx (hashcat_ctx_t *hashcat_ctx, CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues) +int hc_hipDeviceGetName (hashcat_ctx_t *hashcat_ctx, char *name, int len, HIPdevice dev) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; + HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip; - const CUresult CU_err = cuda->cuModuleLoadDataEx (module, image, numOptions, options, optionValues); + const HIPresult HIP_err = hip->hipDeviceGetName (name, len, dev); - if (CU_err != CUDA_SUCCESS) + if (HIP_err != HIP_SUCCESS) { const char *pStr = NULL; - if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + if (hip->hipGetErrorString (HIP_err, &pStr) == HIP_SUCCESS) { - event_log_error (hashcat_ctx, "cuModuleLoadDataEx(): %s", pStr); + event_log_error (hashcat_ctx, "hipDeviceGetName(): %s", pStr); } else { - event_log_error (hashcat_ctx, "cuModuleLoadDataEx(): %d", CU_err); + event_log_error (hashcat_ctx, "hipDeviceGetName(): %d", HIP_err); } return -1; @@ -1339,13 +1677,13 @@ int hc_cuModuleLoadDataEx (hashcat_ctx_t *hashcat_ctx, CUmodule *module, const v return 0; } -int hc_cuModuleUnload (hashcat_ctx_t *hashcat_ctx, CUmodule hmod) +int hc_cuDeviceTotalMem (hashcat_ctx_t *hashcat_ctx, size_t *bytes, CUdevice dev) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - const CUresult CU_err = cuda->cuModuleUnload (hmod); + const CUresult CU_err = cuda->cuDeviceTotalMem (bytes, dev); if (CU_err != CUDA_SUCCESS) { @@ -1353,11 +1691,11 @@ int hc_cuModuleUnload (hashcat_ctx_t *hashcat_ctx, CUmodule hmod) if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) { - event_log_error (hashcat_ctx, "cuModuleUnload(): %s", pStr); + event_log_error (hashcat_ctx, "cuDeviceTotalMem(): %s", pStr); } else { - event_log_error (hashcat_ctx, "cuModuleUnload(): %d", CU_err); + event_log_error (hashcat_ctx, "cuDeviceTotalMem(): %d", CU_err); } return -1; @@ -1366,25 +1704,25 @@ int hc_cuModuleUnload (hashcat_ctx_t *hashcat_ctx, 
CUmodule hmod) return 0; } -int hc_cuCtxSetCurrent (hashcat_ctx_t *hashcat_ctx, CUcontext ctx) +int hc_hipDeviceTotalMem (hashcat_ctx_t *hashcat_ctx, size_t *bytes, HIPdevice dev) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; + HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip; - const CUresult CU_err = cuda->cuCtxSetCurrent (ctx); + const HIPresult HIP_err = hip->hipDeviceTotalMem (bytes, dev); - if (CU_err != CUDA_SUCCESS) + if (HIP_err != HIP_SUCCESS) { const char *pStr = NULL; - if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + if (hip->hipGetErrorString (HIP_err, &pStr) == HIP_SUCCESS) { - event_log_error (hashcat_ctx, "cuCtxSetCurrent(): %s", pStr); + event_log_error (hashcat_ctx, "hipDeviceTotalMem(): %s", pStr); } else { - event_log_error (hashcat_ctx, "cuCtxSetCurrent(): %d", CU_err); + event_log_error (hashcat_ctx, "hipDeviceTotalMem(): %d", HIP_err); } return -1; @@ -1393,13 +1731,13 @@ int hc_cuCtxSetCurrent (hashcat_ctx_t *hashcat_ctx, CUcontext ctx) return 0; } -int hc_cuMemAlloc (hashcat_ctx_t *hashcat_ctx, CUdeviceptr *dptr, size_t bytesize) +int hc_cuDriverGetVersion (hashcat_ctx_t *hashcat_ctx, int *driverVersion) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - const CUresult CU_err = cuda->cuMemAlloc (dptr, bytesize); + const CUresult CU_err = cuda->cuDriverGetVersion (driverVersion); if (CU_err != CUDA_SUCCESS) { @@ -1407,11 +1745,11 @@ int hc_cuMemAlloc (hashcat_ctx_t *hashcat_ctx, CUdeviceptr *dptr, size_t bytesiz if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) { - event_log_error (hashcat_ctx, "cuMemAlloc(): %s", pStr); + event_log_error (hashcat_ctx, "cuDriverGetVersion(): %s", pStr); } else { - event_log_error (hashcat_ctx, "cuMemAlloc(): %d", CU_err); + event_log_error (hashcat_ctx, "cuDriverGetVersion(): %d", CU_err); } return -1; @@ -1420,25 +1758,25 @@ int hc_cuMemAlloc (hashcat_ctx_t *hashcat_ctx, CUdeviceptr *dptr, size_t bytesiz return 0; } -int hc_cuMemFree (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dptr) +int hc_hipDriverGetVersion (hashcat_ctx_t *hashcat_ctx, int *driverVersion) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; + HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip; - const CUresult CU_err = cuda->cuMemFree (dptr); + const HIPresult HIP_err = hip->hipDriverGetVersion (driverVersion); - if (CU_err != CUDA_SUCCESS) + if (HIP_err != HIP_SUCCESS) { const char *pStr = NULL; - if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + if (hip->hipGetErrorString (HIP_err, &pStr) == HIP_SUCCESS) { - event_log_error (hashcat_ctx, "cuMemFree(): %s", pStr); + event_log_error (hashcat_ctx, "hipDriverGetVersion(): %s", pStr); } else { - event_log_error (hashcat_ctx, "cuMemFree(): %d", CU_err); + event_log_error (hashcat_ctx, "hipDriverGetVersion(): %d", HIP_err); } return -1; @@ -1447,13 +1785,13 @@ int hc_cuMemFree (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dptr) return 0; } -int hc_cuMemcpyDtoH (hashcat_ctx_t *hashcat_ctx, void *dstHost, CUdeviceptr srcDevice, size_t ByteCount) +int hc_cuCtxCreate (hashcat_ctx_t *hashcat_ctx, CUcontext *pctx, unsigned int flags, CUdevice dev) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - const CUresult CU_err = cuda->cuMemcpyDtoH (dstHost, srcDevice, ByteCount); + const CUresult CU_err = cuda->cuCtxCreate (pctx, flags, dev); if (CU_err != CUDA_SUCCESS) { @@ -1461,11 
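/*
 * A minimal sketch (not part of the patch itself) of how the hc_hip* device
 * wrappers introduced above compose during backend device discovery. It
 * assumes the prototypes this patch adds to include/backend.h and the HIP
 * types from include/ext_hip.h; the helper name is illustrative. Each wrapper
 * already logs the failing call through event_log_error () and returns -1, so
 * callers only need to check the return code.
 */

static int hip_enum_devices_sketch (hashcat_ctx_t *hashcat_ctx)
{
  if (hc_hipInit (hashcat_ctx, 0) == -1) return -1;

  int device_count = 0;

  if (hc_hipDeviceGetCount (hashcat_ctx, &device_count) == -1) return -1;

  for (int ordinal = 0; ordinal < device_count; ordinal++)
  {
    HIPdevice device;

    if (hc_hipDeviceGet (hashcat_ctx, &device, ordinal) == -1) return -1;

    char name[256] = { 0 };

    if (hc_hipDeviceGetName (hashcat_ctx, name, sizeof (name), device) == -1) return -1;

    size_t total_mem = 0;

    if (hc_hipDeviceTotalMem (hashcat_ctx, &total_mem, device) == -1) return -1;

    // name and total_mem would feed the usual device_param_t setup
  }

  return 0;
}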
+1799,11 @@ int hc_cuMemcpyDtoH (hashcat_ctx_t *hashcat_ctx, void *dstHost, CUdeviceptr srcD if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) { - event_log_error (hashcat_ctx, "cuMemcpyDtoH(): %s", pStr); + event_log_error (hashcat_ctx, "cuCtxCreate(): %s", pStr); } else { - event_log_error (hashcat_ctx, "cuMemcpyDtoH(): %d", CU_err); + event_log_error (hashcat_ctx, "cuCtxCreate(): %d", CU_err); } return -1; @@ -1474,25 +1812,25 @@ int hc_cuMemcpyDtoH (hashcat_ctx_t *hashcat_ctx, void *dstHost, CUdeviceptr srcD return 0; } -int hc_cuMemcpyDtoD (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount) +int hc_hipCtxCreate (hashcat_ctx_t *hashcat_ctx, HIPcontext *pctx, unsigned int flags, HIPdevice dev) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; + HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip; - const CUresult CU_err = cuda->cuMemcpyDtoD (dstDevice, srcDevice, ByteCount); + const HIPresult HIP_err = hip->hipCtxCreate (pctx, flags, dev); - if (CU_err != CUDA_SUCCESS) + if (HIP_err != HIP_SUCCESS) { const char *pStr = NULL; - if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + if (hip->hipGetErrorString (HIP_err, &pStr) == HIP_SUCCESS) { - event_log_error (hashcat_ctx, "cuMemcpyDtoD(): %s", pStr); + event_log_error (hashcat_ctx, "hipCtxCreate(): %s", pStr); } else { - event_log_error (hashcat_ctx, "cuMemcpyDtoD(): %d", CU_err); + event_log_error (hashcat_ctx, "hipCtxCreate(): %d", HIP_err); } return -1; @@ -1501,13 +1839,13 @@ int hc_cuMemcpyDtoD (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dstDevice, CUdevice return 0; } -int hc_cuMemcpyHtoD (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount) +int hc_cuCtxDestroy (hashcat_ctx_t *hashcat_ctx, CUcontext ctx) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - const CUresult CU_err = cuda->cuMemcpyHtoD (dstDevice, srcHost, ByteCount); + const CUresult CU_err = cuda->cuCtxDestroy (ctx); if (CU_err != CUDA_SUCCESS) { @@ -1515,11 +1853,11 @@ int hc_cuMemcpyHtoD (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dstDevice, const vo if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) { - event_log_error (hashcat_ctx, "cuMemcpyHtoD(): %s", pStr); + event_log_error (hashcat_ctx, "cuCtxDestroy(): %s", pStr); } else { - event_log_error (hashcat_ctx, "cuMemcpyHtoD(): %d", CU_err); + event_log_error (hashcat_ctx, "cuCtxDestroy(): %d", CU_err); } return -1; @@ -1528,25 +1866,25 @@ int hc_cuMemcpyHtoD (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dstDevice, const vo return 0; } -int hc_cuModuleGetFunction (hashcat_ctx_t *hashcat_ctx, CUfunction *hfunc, CUmodule hmod, const char *name) +int hc_hipCtxDestroy (hashcat_ctx_t *hashcat_ctx, HIPcontext ctx) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; + HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip; - const CUresult CU_err = cuda->cuModuleGetFunction (hfunc, hmod, name); + const HIPresult HIP_err = hip->hipCtxDestroy (ctx); - if (CU_err != CUDA_SUCCESS) + if (HIP_err != HIP_SUCCESS) { const char *pStr = NULL; - if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + if (hip->hipGetErrorString (HIP_err, &pStr) == HIP_SUCCESS) { - event_log_error (hashcat_ctx, "cuModuleGetFunction(): %s", pStr); + event_log_error (hashcat_ctx, "hipCtxDestroy(): %s", pStr); } else { - event_log_error (hashcat_ctx, "cuModuleGetFunction(): %d", CU_err); + event_log_error 
(hashcat_ctx, "hipCtxDestroy(): %d", HIP_err); } return -1; @@ -1555,13 +1893,13 @@ int hc_cuModuleGetFunction (hashcat_ctx_t *hashcat_ctx, CUfunction *hfunc, CUmod return 0; } -int hc_cuModuleGetGlobal (hashcat_ctx_t *hashcat_ctx, CUdeviceptr *dptr, size_t *bytes, CUmodule hmod, const char *name) +int hc_cuModuleLoadDataEx (hashcat_ctx_t *hashcat_ctx, CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - const CUresult CU_err = cuda->cuModuleGetGlobal (dptr, bytes, hmod, name); + const CUresult CU_err = cuda->cuModuleLoadDataEx (module, image, numOptions, options, optionValues); if (CU_err != CUDA_SUCCESS) { @@ -1569,11 +1907,11 @@ int hc_cuModuleGetGlobal (hashcat_ctx_t *hashcat_ctx, CUdeviceptr *dptr, size_t if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) { - event_log_error (hashcat_ctx, "cuModuleGetGlobal(): %s", pStr); + event_log_error (hashcat_ctx, "cuModuleLoadDataEx(): %s", pStr); } else { - event_log_error (hashcat_ctx, "cuModuleGetGlobal(): %d", CU_err); + event_log_error (hashcat_ctx, "cuModuleLoadDataEx(): %d", CU_err); } return -1; @@ -1582,25 +1920,25 @@ int hc_cuModuleGetGlobal (hashcat_ctx_t *hashcat_ctx, CUdeviceptr *dptr, size_t return 0; } -int hc_cuMemGetInfo (hashcat_ctx_t *hashcat_ctx, size_t *free, size_t *total) +int hc_hipModuleLoadDataEx (hashcat_ctx_t *hashcat_ctx, HIPmodule *module, const void *image, unsigned int numOptions, HIPjit_option *options, void **optionValues) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; + HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip; - const CUresult CU_err = cuda->cuMemGetInfo (free, total); + const HIPresult HIP_err = hip->hipModuleLoadDataEx (module, image, numOptions, options, optionValues); - if (CU_err != CUDA_SUCCESS) + if (HIP_err != HIP_SUCCESS) { const char *pStr = NULL; - if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + if (hip->hipGetErrorString (HIP_err, &pStr) == HIP_SUCCESS) { - event_log_error (hashcat_ctx, "cuMemGetInfo(): %s", pStr); + event_log_error (hashcat_ctx, "hipModuleLoadDataEx(): %s", pStr); } else { - event_log_error (hashcat_ctx, "cuMemGetInfo(): %d", CU_err); + event_log_error (hashcat_ctx, "hipModuleLoadDataEx(): %d", HIP_err); } return -1; @@ -1609,13 +1947,13 @@ int hc_cuMemGetInfo (hashcat_ctx_t *hashcat_ctx, size_t *free, size_t *total) return 0; } -int hc_cuFuncGetAttribute (hashcat_ctx_t *hashcat_ctx, int *pi, CUfunction_attribute attrib, CUfunction hfunc) +int hc_cuModuleUnload (hashcat_ctx_t *hashcat_ctx, CUmodule hmod) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - const CUresult CU_err = cuda->cuFuncGetAttribute (pi, attrib, hfunc); + const CUresult CU_err = cuda->cuModuleUnload (hmod); if (CU_err != CUDA_SUCCESS) { @@ -1623,11 +1961,11 @@ int hc_cuFuncGetAttribute (hashcat_ctx_t *hashcat_ctx, int *pi, CUfunction_attri if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) { - event_log_error (hashcat_ctx, "cuFuncGetAttribute(): %s", pStr); + event_log_error (hashcat_ctx, "cuModuleUnload(): %s", pStr); } else { - event_log_error (hashcat_ctx, "cuFuncGetAttribute(): %d", CU_err); + event_log_error (hashcat_ctx, "cuModuleUnload(): %d", CU_err); } return -1; @@ -1636,25 +1974,25 @@ int hc_cuFuncGetAttribute (hashcat_ctx_t *hashcat_ctx, int *pi, CUfunction_attri return 0; } -int 
hc_cuFuncSetAttribute (hashcat_ctx_t *hashcat_ctx, CUfunction hfunc, CUfunction_attribute attrib, int value) +int hc_hipModuleUnload (hashcat_ctx_t *hashcat_ctx, HIPmodule hmod) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; + HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip; - const CUresult CU_err = cuda->cuFuncSetAttribute (hfunc, attrib, value); + const HIPresult HIP_err = hip->hipModuleUnload (hmod); - if (CU_err != CUDA_SUCCESS) + if (HIP_err != HIP_SUCCESS) { const char *pStr = NULL; - if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + if (hip->hipGetErrorString (HIP_err, &pStr) == HIP_SUCCESS) { - event_log_error (hashcat_ctx, "cuFuncSetAttribute(): %s", pStr); + event_log_error (hashcat_ctx, "hipModuleUnload(): %s", pStr); } else { - event_log_error (hashcat_ctx, "cuFuncSetAttribute(): %d", CU_err); + event_log_error (hashcat_ctx, "hipModuleUnload(): %d", HIP_err); } return -1; @@ -1663,13 +2001,13 @@ int hc_cuFuncSetAttribute (hashcat_ctx_t *hashcat_ctx, CUfunction hfunc, CUfunct return 0; } -int hc_cuStreamCreate (hashcat_ctx_t *hashcat_ctx, CUstream *phStream, unsigned int Flags) +int hc_cuCtxSetCurrent (hashcat_ctx_t *hashcat_ctx, CUcontext ctx) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - const CUresult CU_err = cuda->cuStreamCreate (phStream, Flags); + const CUresult CU_err = cuda->cuCtxSetCurrent (ctx); if (CU_err != CUDA_SUCCESS) { @@ -1677,11 +2015,11 @@ int hc_cuStreamCreate (hashcat_ctx_t *hashcat_ctx, CUstream *phStream, unsigned if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) { - event_log_error (hashcat_ctx, "cuStreamCreate(): %s", pStr); + event_log_error (hashcat_ctx, "cuCtxSetCurrent(): %s", pStr); } else { - event_log_error (hashcat_ctx, "cuStreamCreate(): %d", CU_err); + event_log_error (hashcat_ctx, "cuCtxSetCurrent(): %d", CU_err); } return -1; @@ -1690,25 +2028,25 @@ int hc_cuStreamCreate (hashcat_ctx_t *hashcat_ctx, CUstream *phStream, unsigned return 0; } -int hc_cuStreamDestroy (hashcat_ctx_t *hashcat_ctx, CUstream hStream) +int hc_hipCtxSetCurrent (hashcat_ctx_t *hashcat_ctx, HIPcontext ctx) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; + HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip; - const CUresult CU_err = cuda->cuStreamDestroy (hStream); + const HIPresult HIP_err = hip->hipCtxSetCurrent (ctx); - if (CU_err != CUDA_SUCCESS) + if (HIP_err != HIP_SUCCESS) { const char *pStr = NULL; - if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + if (hip->hipGetErrorString (HIP_err, &pStr) == HIP_SUCCESS) { - event_log_error (hashcat_ctx, "cuStreamDestroy(): %s", pStr); + event_log_error (hashcat_ctx, "hipCtxSetCurrent(): %s", pStr); } else { - event_log_error (hashcat_ctx, "cuStreamDestroy(): %d", CU_err); + event_log_error (hashcat_ctx, "hipCtxSetCurrent(): %d", HIP_err); } return -1; @@ -1717,13 +2055,13 @@ int hc_cuStreamDestroy (hashcat_ctx_t *hashcat_ctx, CUstream hStream) return 0; } -int hc_cuStreamSynchronize (hashcat_ctx_t *hashcat_ctx, CUstream hStream) +int hc_cuMemAlloc (hashcat_ctx_t *hashcat_ctx, CUdeviceptr *dptr, size_t bytesize) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - const CUresult CU_err = cuda->cuStreamSynchronize (hStream); + const CUresult CU_err = cuda->cuMemAlloc (dptr, bytesize); if (CU_err != CUDA_SUCCESS) { @@ -1731,11 +2069,11 @@ int hc_cuStreamSynchronize 
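/*
 * A sketch, under the same assumptions as above, of the context/module
 * lifecycle these wrappers enable, mirroring the existing CUDA path
 * one-for-one: create and bind a context, load a code object, use it, then
 * tear both down. `binary` stands in for an already-built code object (e.g.
 * produced via hiprtc); the helper name is illustrative only.
 */

static int hip_module_lifecycle_sketch (hashcat_ctx_t *hashcat_ctx, HIPdevice device, const void *binary)
{
  HIPcontext ctx;

  if (hc_hipCtxCreate (hashcat_ctx, &ctx, 0, device) == -1) return -1;

  if (hc_hipCtxSetCurrent (hashcat_ctx, ctx) == -1) return -1;

  HIPmodule module;

  // no JIT options passed in this sketch; the real code path may set some
  if (hc_hipModuleLoadDataEx (hashcat_ctx, &module, binary, 0, NULL, NULL) == -1) return -1;

  // ... resolve kernels and launch them here ...

  if (hc_hipModuleUnload (hashcat_ctx, module) == -1) return -1;

  return hc_hipCtxDestroy (hashcat_ctx, ctx);
}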
(hashcat_ctx_t *hashcat_ctx, CUstream hStream) if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) { - event_log_error (hashcat_ctx, "cuStreamSynchronize(): %s", pStr); + event_log_error (hashcat_ctx, "cuMemAlloc(): %s", pStr); } else { - event_log_error (hashcat_ctx, "cuStreamSynchronize(): %d", CU_err); + event_log_error (hashcat_ctx, "cuMemAlloc(): %d", CU_err); } return -1; @@ -1744,25 +2082,25 @@ int hc_cuStreamSynchronize (hashcat_ctx_t *hashcat_ctx, CUstream hStream) return 0; } -int hc_cuLaunchKernel (hashcat_ctx_t *hashcat_ctx, CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra) +int hc_hipMemAlloc (hashcat_ctx_t *hashcat_ctx, HIPdeviceptr *dptr, size_t bytesize) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; + HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip; - const CUresult CU_err = cuda->cuLaunchKernel (f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams, extra); + const HIPresult HIP_err = hip->hipMemAlloc (dptr, bytesize); - if (CU_err != CUDA_SUCCESS) + if (HIP_err != HIP_SUCCESS) { const char *pStr = NULL; - if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + if (hip->hipGetErrorString (HIP_err, &pStr) == HIP_SUCCESS) { - event_log_error (hashcat_ctx, "cuLaunchKernel(): %s", pStr); + event_log_error (hashcat_ctx, "hipMemAlloc(): %s", pStr); } else { - event_log_error (hashcat_ctx, "cuLaunchKernel(): %d", CU_err); + event_log_error (hashcat_ctx, "hipMemAlloc(): %d", HIP_err); } return -1; @@ -1771,13 +2109,13 @@ int hc_cuLaunchKernel (hashcat_ctx_t *hashcat_ctx, CUfunction f, unsigned int gr return 0; } -int hc_cuCtxSynchronize (hashcat_ctx_t *hashcat_ctx) +int hc_cuMemFree (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dptr) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - const CUresult CU_err = cuda->cuCtxSynchronize (); + const CUresult CU_err = cuda->cuMemFree (dptr); if (CU_err != CUDA_SUCCESS) { @@ -1785,11 +2123,11 @@ int hc_cuCtxSynchronize (hashcat_ctx_t *hashcat_ctx) if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) { - event_log_error (hashcat_ctx, "cuCtxSynchronize(): %s", pStr); + event_log_error (hashcat_ctx, "cuMemFree(): %s", pStr); } else { - event_log_error (hashcat_ctx, "cuCtxSynchronize(): %d", CU_err); + event_log_error (hashcat_ctx, "cuMemFree(): %d", CU_err); } return -1; @@ -1798,25 +2136,25 @@ int hc_cuCtxSynchronize (hashcat_ctx_t *hashcat_ctx) return 0; } -int hc_cuEventCreate (hashcat_ctx_t *hashcat_ctx, CUevent *phEvent, unsigned int Flags) +int hc_hipMemFree (hashcat_ctx_t *hashcat_ctx, HIPdeviceptr dptr) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; + HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip; - const CUresult CU_err = cuda->cuEventCreate (phEvent, Flags); + const HIPresult HIP_err = hip->hipMemFree (dptr); - if (CU_err != CUDA_SUCCESS) + if (HIP_err != HIP_SUCCESS) { const char *pStr = NULL; - if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + if (hip->hipGetErrorString (HIP_err, &pStr) == HIP_SUCCESS) { - event_log_error (hashcat_ctx, "cuEventCreate(): %s", pStr); + event_log_error (hashcat_ctx, "hipMemFree(): %s", pStr); } else { - event_log_error (hashcat_ctx, "cuEventCreate(): %d", CU_err); + 
event_log_error (hashcat_ctx, "hipMemFree(): %d", HIP_err); } return -1; @@ -1825,13 +2163,13 @@ int hc_cuEventCreate (hashcat_ctx_t *hashcat_ctx, CUevent *phEvent, unsigned int return 0; } -int hc_cuEventDestroy (hashcat_ctx_t *hashcat_ctx, CUevent hEvent) +int hc_cuMemcpyDtoH (hashcat_ctx_t *hashcat_ctx, void *dstHost, CUdeviceptr srcDevice, size_t ByteCount) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - const CUresult CU_err = cuda->cuEventDestroy (hEvent); + const CUresult CU_err = cuda->cuMemcpyDtoH (dstHost, srcDevice, ByteCount); if (CU_err != CUDA_SUCCESS) { @@ -1839,11 +2177,11 @@ int hc_cuEventDestroy (hashcat_ctx_t *hashcat_ctx, CUevent hEvent) if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) { - event_log_error (hashcat_ctx, "cuEventDestroy(): %s", pStr); + event_log_error (hashcat_ctx, "cuMemcpyDtoH(): %s", pStr); } else { - event_log_error (hashcat_ctx, "cuEventDestroy(): %d", CU_err); + event_log_error (hashcat_ctx, "cuMemcpyDtoH(): %d", CU_err); } return -1; @@ -1852,25 +2190,25 @@ int hc_cuEventDestroy (hashcat_ctx_t *hashcat_ctx, CUevent hEvent) return 0; } -int hc_cuEventElapsedTime (hashcat_ctx_t *hashcat_ctx, float *pMilliseconds, CUevent hStart, CUevent hEnd) +int hc_hipMemcpyDtoH (hashcat_ctx_t *hashcat_ctx, void *dstHost, HIPdeviceptr srcDevice, size_t ByteCount) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; + HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip; - const CUresult CU_err = cuda->cuEventElapsedTime (pMilliseconds, hStart, hEnd); + const HIPresult HIP_err = hip->hipMemcpyDtoH (dstHost, srcDevice, ByteCount); - if (CU_err != CUDA_SUCCESS) + if (HIP_err != HIP_SUCCESS) { const char *pStr = NULL; - if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + if (hip->hipGetErrorString (HIP_err, &pStr) == HIP_SUCCESS) { - event_log_error (hashcat_ctx, "cuEventElapsedTime(): %s", pStr); + event_log_error (hashcat_ctx, "hipMemcpyDtoH(): %s", pStr); } else { - event_log_error (hashcat_ctx, "cuEventElapsedTime(): %d", CU_err); + event_log_error (hashcat_ctx, "hipMemcpyDtoH(): %d", HIP_err); } return -1; @@ -1879,13 +2217,13 @@ int hc_cuEventElapsedTime (hashcat_ctx_t *hashcat_ctx, float *pMilliseconds, CUe return 0; } -int hc_cuEventQuery (hashcat_ctx_t *hashcat_ctx, CUevent hEvent) +int hc_cuMemcpyDtoD (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - const CUresult CU_err = cuda->cuEventQuery (hEvent); + const CUresult CU_err = cuda->cuMemcpyDtoD (dstDevice, srcDevice, ByteCount); if (CU_err != CUDA_SUCCESS) { @@ -1893,11 +2231,11 @@ int hc_cuEventQuery (hashcat_ctx_t *hashcat_ctx, CUevent hEvent) if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) { - event_log_error (hashcat_ctx, "cuEventQuery(): %s", pStr); + event_log_error (hashcat_ctx, "cuMemcpyDtoD(): %s", pStr); } else { - event_log_error (hashcat_ctx, "cuEventQuery(): %d", CU_err); + event_log_error (hashcat_ctx, "cuMemcpyDtoD(): %d", CU_err); } return -1; @@ -1906,25 +2244,25 @@ int hc_cuEventQuery (hashcat_ctx_t *hashcat_ctx, CUevent hEvent) return 0; } -int hc_cuEventRecord (hashcat_ctx_t *hashcat_ctx, CUevent hEvent, CUstream hStream) +int hc_hipMemcpyDtoD (hashcat_ctx_t *hashcat_ctx, HIPdeviceptr dstDevice, HIPdeviceptr srcDevice, size_t ByteCount) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - 
CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; + HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip; - const CUresult CU_err = cuda->cuEventRecord (hEvent, hStream); + const HIPresult HIP_err = hip->hipMemcpyDtoD (dstDevice, srcDevice, ByteCount); - if (CU_err != CUDA_SUCCESS) + if (HIP_err != HIP_SUCCESS) { const char *pStr = NULL; - if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + if (hip->hipGetErrorString (HIP_err, &pStr) == HIP_SUCCESS) { - event_log_error (hashcat_ctx, "cuEventRecord(): %s", pStr); + event_log_error (hashcat_ctx, "hipMemcpyDtoD(): %s", pStr); } else { - event_log_error (hashcat_ctx, "cuEventRecord(): %d", CU_err); + event_log_error (hashcat_ctx, "hipMemcpyDtoD(): %d", HIP_err); } return -1; @@ -1933,13 +2271,13 @@ int hc_cuEventRecord (hashcat_ctx_t *hashcat_ctx, CUevent hEvent, CUstream hStre return 0; } -int hc_cuEventSynchronize (hashcat_ctx_t *hashcat_ctx, CUevent hEvent) +int hc_cuMemcpyHtoD (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - const CUresult CU_err = cuda->cuEventSynchronize (hEvent); + const CUresult CU_err = cuda->cuMemcpyHtoD (dstDevice, srcHost, ByteCount); if (CU_err != CUDA_SUCCESS) { @@ -1947,11 +2285,11 @@ int hc_cuEventSynchronize (hashcat_ctx_t *hashcat_ctx, CUevent hEvent) if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) { - event_log_error (hashcat_ctx, "cuEventSynchronize(): %s", pStr); + event_log_error (hashcat_ctx, "cuMemcpyHtoD(): %s", pStr); } else { - event_log_error (hashcat_ctx, "cuEventSynchronize(): %d", CU_err); + event_log_error (hashcat_ctx, "cuMemcpyHtoD(): %d", CU_err); } return -1; @@ -1960,25 +2298,25 @@ int hc_cuEventSynchronize (hashcat_ctx_t *hashcat_ctx, CUevent hEvent) return 0; } -int hc_cuCtxSetCacheConfig (hashcat_ctx_t *hashcat_ctx, CUfunc_cache config) +int hc_hipMemcpyHtoD (hashcat_ctx_t *hashcat_ctx, HIPdeviceptr dstDevice, const void *srcHost, size_t ByteCount) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; + HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip; - const CUresult CU_err = cuda->cuCtxSetCacheConfig (config); + const HIPresult HIP_err = hip->hipMemcpyHtoD (dstDevice, srcHost, ByteCount); - if (CU_err != CUDA_SUCCESS) + if (HIP_err != HIP_SUCCESS) { const char *pStr = NULL; - if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + if (hip->hipGetErrorString (HIP_err, &pStr) == HIP_SUCCESS) { - event_log_error (hashcat_ctx, "cuCtxSetCacheConfig(): %s", pStr); + event_log_error (hashcat_ctx, "hipMemcpyHtoD(): %s", pStr); } else { - event_log_error (hashcat_ctx, "cuCtxSetCacheConfig(): %d", CU_err); + event_log_error (hashcat_ctx, "hipMemcpyHtoD(): %d", HIP_err); } return -1; @@ -1987,13 +2325,13 @@ int hc_cuCtxSetCacheConfig (hashcat_ctx_t *hashcat_ctx, CUfunc_cache config) return 0; } -int hc_cuCtxPushCurrent (hashcat_ctx_t *hashcat_ctx, CUcontext ctx) +int hc_cuModuleGetFunction (hashcat_ctx_t *hashcat_ctx, CUfunction *hfunc, CUmodule hmod, const char *name) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - const CUresult CU_err = cuda->cuCtxPushCurrent (ctx); + const CUresult CU_err = cuda->cuModuleGetFunction (hfunc, hmod, name); if (CU_err != CUDA_SUCCESS) { @@ -2001,11 +2339,11 @@ int hc_cuCtxPushCurrent (hashcat_ctx_t *hashcat_ctx, CUcontext ctx) if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) { - 
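/*
 * A sketch of a host -> device -> host round trip built from the memory
 * wrappers above (alloc, HtoD copy, DtoH copy, free). Helper name and
 * parameters are illustrative; HIPdeviceptr plays the role CUdeviceptr plays
 * on the CUDA side.
 */

static int hip_buffer_roundtrip_sketch (hashcat_ctx_t *hashcat_ctx, const void *h_src, void *h_dst, const size_t size)
{
  HIPdeviceptr d_buf;

  if (hc_hipMemAlloc (hashcat_ctx, &d_buf, size) == -1) return -1;

  if (hc_hipMemcpyHtoD (hashcat_ctx, d_buf, h_src, size) == -1) return -1;

  // ... a kernel would consume/produce d_buf here ...

  if (hc_hipMemcpyDtoH (hashcat_ctx, h_dst, d_buf, size) == -1) return -1;

  return hc_hipMemFree (hashcat_ctx, d_buf);
}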
event_log_error (hashcat_ctx, "cuCtxPushCurrent(): %s", pStr); + event_log_error (hashcat_ctx, "cuModuleGetFunction(): %s", pStr); } else { - event_log_error (hashcat_ctx, "cuCtxPushCurrent(): %d", CU_err); + event_log_error (hashcat_ctx, "cuModuleGetFunction(): %d", CU_err); } return -1; @@ -2014,25 +2352,25 @@ int hc_cuCtxPushCurrent (hashcat_ctx_t *hashcat_ctx, CUcontext ctx) return 0; } -int hc_cuCtxPopCurrent (hashcat_ctx_t *hashcat_ctx, CUcontext *pctx) +int hc_hipModuleGetFunction (hashcat_ctx_t *hashcat_ctx, HIPfunction *hfunc, HIPmodule hmod, const char *name) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; + HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip; - const CUresult CU_err = cuda->cuCtxPopCurrent (pctx); + const HIPresult HIP_err = hip->hipModuleGetFunction (hfunc, hmod, name); - if (CU_err != CUDA_SUCCESS) + if (HIP_err != HIP_SUCCESS) { const char *pStr = NULL; - if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + if (hip->hipGetErrorString (HIP_err, &pStr) == HIP_SUCCESS) { - event_log_error (hashcat_ctx, "cuCtxPopCurrent(): %s", pStr); + event_log_error (hashcat_ctx, "hipModuleGetFunction(): %s", pStr); } else { - event_log_error (hashcat_ctx, "cuCtxPopCurrent(): %d", CU_err); + event_log_error (hashcat_ctx, "hipModuleGetFunction(): %d", HIP_err); } return -1; @@ -2041,13 +2379,13 @@ int hc_cuCtxPopCurrent (hashcat_ctx_t *hashcat_ctx, CUcontext *pctx) return 0; } -int hc_cuLinkCreate (hashcat_ctx_t *hashcat_ctx, unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut) +int hc_cuModuleGetGlobal (hashcat_ctx_t *hashcat_ctx, CUdeviceptr *dptr, size_t *bytes, CUmodule hmod, const char *name) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - const CUresult CU_err = cuda->cuLinkCreate (numOptions, options, optionValues, stateOut); + const CUresult CU_err = cuda->cuModuleGetGlobal (dptr, bytes, hmod, name); if (CU_err != CUDA_SUCCESS) { @@ -2055,11 +2393,11 @@ int hc_cuLinkCreate (hashcat_ctx_t *hashcat_ctx, unsigned int numOptions, CUjit_ if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) { - event_log_error (hashcat_ctx, "cuLinkCreate(): %s", pStr); + event_log_error (hashcat_ctx, "cuModuleGetGlobal(): %s", pStr); } else { - event_log_error (hashcat_ctx, "cuLinkCreate(): %d", CU_err); + event_log_error (hashcat_ctx, "cuModuleGetGlobal(): %d", CU_err); } return -1; @@ -2068,25 +2406,25 @@ int hc_cuLinkCreate (hashcat_ctx_t *hashcat_ctx, unsigned int numOptions, CUjit_ return 0; } -int hc_cuLinkAddData (hashcat_ctx_t *hashcat_ctx, CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name, unsigned int numOptions, CUjit_option *options, void **optionValues) +int hc_hipModuleGetGlobal (hashcat_ctx_t *hashcat_ctx, HIPdeviceptr *dptr, size_t *bytes, HIPmodule hmod, const char *name) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; + HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip; - const CUresult CU_err = cuda->cuLinkAddData (state, type, data, size, name, numOptions, options, optionValues); + const HIPresult HIP_err = hip->hipModuleGetGlobal (dptr, bytes, hmod, name); - if (CU_err != CUDA_SUCCESS) + if (HIP_err != HIP_SUCCESS) { const char *pStr = NULL; - if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + if (hip->hipGetErrorString (HIP_err, &pStr) == HIP_SUCCESS) { - event_log_error (hashcat_ctx, "cuLinkAddData(): %s", 
pStr); + event_log_error (hashcat_ctx, "hipModuleGetGlobal(): %s", pStr); } else { - event_log_error (hashcat_ctx, "cuLinkAddData(): %d", CU_err); + event_log_error (hashcat_ctx, "hipModuleGetGlobal(): %d", HIP_err); } return -1; @@ -2095,13 +2433,13 @@ int hc_cuLinkAddData (hashcat_ctx_t *hashcat_ctx, CUlinkState state, CUjitInputT return 0; } -int hc_cuLinkDestroy (hashcat_ctx_t *hashcat_ctx, CUlinkState state) +int hc_cuMemGetInfo (hashcat_ctx_t *hashcat_ctx, size_t *free, size_t *total) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - const CUresult CU_err = cuda->cuLinkDestroy (state); + const CUresult CU_err = cuda->cuMemGetInfo (free, total); if (CU_err != CUDA_SUCCESS) { @@ -2109,11 +2447,11 @@ int hc_cuLinkDestroy (hashcat_ctx_t *hashcat_ctx, CUlinkState state) if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) { - event_log_error (hashcat_ctx, "cuLinkDestroy(): %s", pStr); + event_log_error (hashcat_ctx, "cuMemGetInfo(): %s", pStr); } else { - event_log_error (hashcat_ctx, "cuLinkDestroy(): %d", CU_err); + event_log_error (hashcat_ctx, "cuMemGetInfo(): %d", CU_err); } return -1; @@ -2122,25 +2460,25 @@ int hc_cuLinkDestroy (hashcat_ctx_t *hashcat_ctx, CUlinkState state) return 0; } -int hc_cuLinkComplete (hashcat_ctx_t *hashcat_ctx, CUlinkState state, void **cubinOut, size_t *sizeOut) +int hc_hipMemGetInfo (hashcat_ctx_t *hashcat_ctx, size_t *free, size_t *total) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; + HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip; - const CUresult CU_err = cuda->cuLinkComplete (state, cubinOut, sizeOut); + const HIPresult HIP_err = hip->hipMemGetInfo (free, total); - if (CU_err != CUDA_SUCCESS) + if (HIP_err != HIP_SUCCESS) { const char *pStr = NULL; - if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + if (hip->hipGetErrorString (HIP_err, &pStr) == HIP_SUCCESS) { - event_log_error (hashcat_ctx, "cuLinkComplete(): %s", pStr); + event_log_error (hashcat_ctx, "hipMemGetInfo(): %s", pStr); } else { - event_log_error (hashcat_ctx, "cuLinkComplete(): %d", CU_err); + event_log_error (hashcat_ctx, "hipMemGetInfo(): %d", HIP_err); } return -1; @@ -2149,98 +2487,53 @@ int hc_cuLinkComplete (hashcat_ctx_t *hashcat_ctx, CUlinkState state, void **cub return 0; } -// OpenCL - -int ocl_init (hashcat_ctx_t *hashcat_ctx) +int hc_cuFuncGetAttribute (hashcat_ctx_t *hashcat_ctx, int *pi, CUfunction_attribute attrib, CUfunction hfunc) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl; + CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - memset (ocl, 0, sizeof (OCL_PTR)); + const CUresult CU_err = cuda->cuFuncGetAttribute (pi, attrib, hfunc); - #if defined (_WIN) - ocl->lib = hc_dlopen ("OpenCL"); - #elif defined (__APPLE__) - ocl->lib = hc_dlopen ("/System/Library/Frameworks/OpenCL.framework/OpenCL"); - #elif defined (__CYGWIN__) - ocl->lib = hc_dlopen ("opencl.dll"); + if (CU_err != CUDA_SUCCESS) + { + const char *pStr = NULL; - if (ocl->lib == NULL) ocl->lib = hc_dlopen ("cygOpenCL-1.dll"); - #else - ocl->lib = hc_dlopen ("libOpenCL.so"); + if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + { + event_log_error (hashcat_ctx, "cuFuncGetAttribute(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "cuFuncGetAttribute(): %d", CU_err); + } - if (ocl->lib == NULL) ocl->lib = hc_dlopen ("libOpenCL.so.1"); - #endif + return -1; + } - if (ocl->lib == NULL) return -1; + 
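/*
 * A sketch combining hc_hipModuleGetGlobal () and hc_hipMemGetInfo (): look up
 * a module-scope symbol, bounds-check it, and copy host data into it. The
 * symbol name "some_constant_buf" and the helper name are hypothetical.
 */

static int hip_fill_symbol_sketch (hashcat_ctx_t *hashcat_ctx, HIPmodule module, const void *src, const size_t src_size)
{
  HIPdeviceptr dptr;

  size_t symbol_size = 0;

  if (hc_hipModuleGetGlobal (hashcat_ctx, &dptr, &symbol_size, module, "some_constant_buf") == -1) return -1;

  if (src_size > symbol_size) return -1; // would overflow the device symbol

  size_t mem_free  = 0;
  size_t mem_total = 0;

  // free/total VRAM would normally gate further buffer allocations
  if (hc_hipMemGetInfo (hashcat_ctx, &mem_free, &mem_total) == -1) return -1;

  return hc_hipMemcpyHtoD (hashcat_ctx, dptr, src, src_size);
}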
return 0; +} - HC_LOAD_FUNC (ocl, clBuildProgram, OCL_CLBUILDPROGRAM, OpenCL, 1); - HC_LOAD_FUNC (ocl, clCreateBuffer, OCL_CLCREATEBUFFER, OpenCL, 1); - HC_LOAD_FUNC (ocl, clCreateCommandQueue, OCL_CLCREATECOMMANDQUEUE, OpenCL, 1); - HC_LOAD_FUNC (ocl, clCreateContext, OCL_CLCREATECONTEXT, OpenCL, 1); - HC_LOAD_FUNC (ocl, clCreateKernel, OCL_CLCREATEKERNEL, OpenCL, 1); - HC_LOAD_FUNC (ocl, clCreateProgramWithBinary, OCL_CLCREATEPROGRAMWITHBINARY, OpenCL, 1); - HC_LOAD_FUNC (ocl, clCreateProgramWithSource, OCL_CLCREATEPROGRAMWITHSOURCE, OpenCL, 1); - HC_LOAD_FUNC (ocl, clEnqueueCopyBuffer, OCL_CLENQUEUECOPYBUFFER, OpenCL, 1); - HC_LOAD_FUNC (ocl, clEnqueueMapBuffer, OCL_CLENQUEUEMAPBUFFER, OpenCL, 1); - HC_LOAD_FUNC (ocl, clEnqueueNDRangeKernel, OCL_CLENQUEUENDRANGEKERNEL, OpenCL, 1); - HC_LOAD_FUNC (ocl, clEnqueueReadBuffer, OCL_CLENQUEUEREADBUFFER, OpenCL, 1); - HC_LOAD_FUNC (ocl, clEnqueueUnmapMemObject, OCL_CLENQUEUEUNMAPMEMOBJECT, OpenCL, 1); - HC_LOAD_FUNC (ocl, clEnqueueWriteBuffer, OCL_CLENQUEUEWRITEBUFFER, OpenCL, 1); - HC_LOAD_FUNC (ocl, clFinish, OCL_CLFINISH, OpenCL, 1); - HC_LOAD_FUNC (ocl, clFlush, OCL_CLFLUSH, OpenCL, 1); - HC_LOAD_FUNC (ocl, clGetDeviceIDs, OCL_CLGETDEVICEIDS, OpenCL, 1); - HC_LOAD_FUNC (ocl, clGetDeviceInfo, OCL_CLGETDEVICEINFO, OpenCL, 1); - HC_LOAD_FUNC (ocl, clGetEventInfo, OCL_CLGETEVENTINFO, OpenCL, 1); - HC_LOAD_FUNC (ocl, clGetKernelWorkGroupInfo, OCL_CLGETKERNELWORKGROUPINFO, OpenCL, 1); - HC_LOAD_FUNC (ocl, clGetPlatformIDs, OCL_CLGETPLATFORMIDS, OpenCL, 1); - HC_LOAD_FUNC (ocl, clGetPlatformInfo, OCL_CLGETPLATFORMINFO, OpenCL, 1); - HC_LOAD_FUNC (ocl, clGetProgramBuildInfo, OCL_CLGETPROGRAMBUILDINFO, OpenCL, 1); - HC_LOAD_FUNC (ocl, clGetProgramInfo, OCL_CLGETPROGRAMINFO, OpenCL, 1); - HC_LOAD_FUNC (ocl, clReleaseCommandQueue, OCL_CLRELEASECOMMANDQUEUE, OpenCL, 1); - HC_LOAD_FUNC (ocl, clReleaseContext, OCL_CLRELEASECONTEXT, OpenCL, 1); - HC_LOAD_FUNC (ocl, clReleaseKernel, OCL_CLRELEASEKERNEL, OpenCL, 1); - HC_LOAD_FUNC (ocl, clReleaseMemObject, OCL_CLRELEASEMEMOBJECT, OpenCL, 1); - HC_LOAD_FUNC (ocl, clReleaseProgram, OCL_CLRELEASEPROGRAM, OpenCL, 1); - HC_LOAD_FUNC (ocl, clSetKernelArg, OCL_CLSETKERNELARG, OpenCL, 1); - HC_LOAD_FUNC (ocl, clWaitForEvents, OCL_CLWAITFOREVENTS, OpenCL, 1); - HC_LOAD_FUNC (ocl, clGetEventProfilingInfo, OCL_CLGETEVENTPROFILINGINFO, OpenCL, 1); - HC_LOAD_FUNC (ocl, clReleaseEvent, OCL_CLRELEASEEVENT, OpenCL, 1); - - return 0; -} - -void ocl_close (hashcat_ctx_t *hashcat_ctx) +int hc_hipFuncGetAttribute (hashcat_ctx_t *hashcat_ctx, int *pi, HIPfunction_attribute attrib, HIPfunction hfunc) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl; + HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip; - if (ocl) + const HIPresult HIP_err = hip->hipFuncGetAttribute (pi, attrib, hfunc); + + if (HIP_err != HIP_SUCCESS) { - if (ocl->lib) + const char *pStr = NULL; + + if (hip->hipGetErrorString (HIP_err, &pStr) == HIP_SUCCESS) { - hc_dlclose (ocl->lib); + event_log_error (hashcat_ctx, "hipFuncGetAttribute(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "hipFuncGetAttribute(): %d", HIP_err); } - - hcfree (backend_ctx->ocl); - - backend_ctx->ocl = NULL; - } -} - -int hc_clEnqueueNDRangeKernel (hashcat_ctx_t *hashcat_ctx, cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *global_work_offset, const size_t *global_work_size, const size_t *local_work_size, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event) -{ - 
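/*
 * A one-call sketch of querying a per-kernel limit through
 * hc_hipFuncGetAttribute (), the way the CUDA path derives kernel thread
 * counts from cuFuncGetAttribute (). The enum spelling
 * HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK is assumed to come from this
 * patch's include/ext_hip.h, mirroring the CUDA name.
 */

static int hip_query_kernel_limit_sketch (hashcat_ctx_t *hashcat_ctx, HIPfunction func, int *max_threads)
{
  return hc_hipFuncGetAttribute (hashcat_ctx, max_threads, HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func);
}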
backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - - OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl; - - const cl_int CL_err = ocl->clEnqueueNDRangeKernel (command_queue, kernel, work_dim, global_work_offset, global_work_size, local_work_size, num_events_in_wait_list, event_wait_list, event); - - if (CL_err != CL_SUCCESS) - { - event_log_error (hashcat_ctx, "clEnqueueNDRangeKernel(): %s", val2cstr_cl (CL_err)); return -1; } @@ -2248,17 +2541,26 @@ int hc_clEnqueueNDRangeKernel (hashcat_ctx_t *hashcat_ctx, cl_command_queue comm return 0; } -int hc_clGetEventInfo (hashcat_ctx_t *hashcat_ctx, cl_event event, cl_event_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret) +int hc_cuFuncSetAttribute (hashcat_ctx_t *hashcat_ctx, CUfunction hfunc, CUfunction_attribute attrib, int value) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl; + CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - const cl_int CL_err = ocl->clGetEventInfo (event, param_name, param_value_size, param_value, param_value_size_ret); + const CUresult CU_err = cuda->cuFuncSetAttribute (hfunc, attrib, value); - if (CL_err != CL_SUCCESS) + if (CU_err != CUDA_SUCCESS) { - event_log_error (hashcat_ctx, "clGetEventInfo(): %s", val2cstr_cl (CL_err)); + const char *pStr = NULL; + + if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + { + event_log_error (hashcat_ctx, "cuFuncSetAttribute(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "cuFuncSetAttribute(): %d", CU_err); + } return -1; } @@ -2266,17 +2568,26 @@ int hc_clGetEventInfo (hashcat_ctx_t *hashcat_ctx, cl_event event, cl_event_info return 0; } -int hc_clFlush (hashcat_ctx_t *hashcat_ctx, cl_command_queue command_queue) +int hc_hipFuncSetAttribute (hashcat_ctx_t *hashcat_ctx, HIPfunction hfunc, HIPfunction_attribute attrib, int value) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl; + HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip; - const cl_int CL_err = ocl->clFlush (command_queue); + const HIPresult HIP_err = hip->hipFuncSetAttribute (hfunc, attrib, value); - if (CL_err != CL_SUCCESS) + if (HIP_err != HIP_SUCCESS) { - event_log_error (hashcat_ctx, "clFlush(): %s", val2cstr_cl (CL_err)); + const char *pStr = NULL; + + if (hip->hipGetErrorString (HIP_err, &pStr) == HIP_SUCCESS) + { + event_log_error (hashcat_ctx, "hipFuncSetAttribute(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "hipFuncSetAttribute(): %d", HIP_err); + } return -1; } @@ -2284,17 +2595,26 @@ int hc_clFlush (hashcat_ctx_t *hashcat_ctx, cl_command_queue command_queue) return 0; } -int hc_clFinish (hashcat_ctx_t *hashcat_ctx, cl_command_queue command_queue) +int hc_cuStreamCreate (hashcat_ctx_t *hashcat_ctx, CUstream *phStream, unsigned int Flags) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl; + CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - const cl_int CL_err = ocl->clFinish (command_queue); + const CUresult CU_err = cuda->cuStreamCreate (phStream, Flags); - if (CL_err != CL_SUCCESS) + if (CU_err != CUDA_SUCCESS) { - event_log_error (hashcat_ctx, "clFinish(): %s", val2cstr_cl (CL_err)); + const char *pStr = NULL; + + if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + { + event_log_error (hashcat_ctx, "cuStreamCreate(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "cuStreamCreate(): %d", CU_err); + } return -1; } @@ -2302,17 +2622,26 @@ int 
hc_clFinish (hashcat_ctx_t *hashcat_ctx, cl_command_queue command_queue) return 0; } -int hc_clSetKernelArg (hashcat_ctx_t *hashcat_ctx, cl_kernel kernel, cl_uint arg_index, size_t arg_size, const void *arg_value) +int hc_hipStreamCreate (hashcat_ctx_t *hashcat_ctx, HIPstream *phStream, unsigned int Flags) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl; + HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip; - const cl_int CL_err = ocl->clSetKernelArg (kernel, arg_index, arg_size, arg_value); + const HIPresult HIP_err = hip->hipStreamCreate (phStream, Flags); - if (CL_err != CL_SUCCESS) + if (HIP_err != HIP_SUCCESS) { - event_log_error (hashcat_ctx, "clSetKernelArg(): %s", val2cstr_cl (CL_err)); + const char *pStr = NULL; + + if (hip->hipGetErrorString (HIP_err, &pStr) == HIP_SUCCESS) + { + event_log_error (hashcat_ctx, "hipStreamCreate(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "hipStreamCreate(): %d", HIP_err); + } return -1; } @@ -2320,17 +2649,26 @@ int hc_clSetKernelArg (hashcat_ctx_t *hashcat_ctx, cl_kernel kernel, cl_uint arg return 0; } -int hc_clEnqueueWriteBuffer (hashcat_ctx_t *hashcat_ctx, cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_write, size_t offset, size_t size, const void *ptr, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event) +int hc_cuStreamDestroy (hashcat_ctx_t *hashcat_ctx, CUstream hStream) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl; + CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - const cl_int CL_err = ocl->clEnqueueWriteBuffer (command_queue, buffer, blocking_write, offset, size, ptr, num_events_in_wait_list, event_wait_list, event); + const CUresult CU_err = cuda->cuStreamDestroy (hStream); - if (CL_err != CL_SUCCESS) + if (CU_err != CUDA_SUCCESS) { - event_log_error (hashcat_ctx, "clEnqueueWriteBuffer(): %s", val2cstr_cl (CL_err)); + const char *pStr = NULL; + + if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + { + event_log_error (hashcat_ctx, "cuStreamDestroy(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "cuStreamDestroy(): %d", CU_err); + } return -1; } @@ -2338,17 +2676,26 @@ int hc_clEnqueueWriteBuffer (hashcat_ctx_t *hashcat_ctx, cl_command_queue comman return 0; } -int hc_clEnqueueCopyBuffer (hashcat_ctx_t *hashcat_ctx, cl_command_queue command_queue, cl_mem src_buffer, cl_mem dst_buffer, size_t src_offset, size_t dst_offset, size_t size, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event) +int hc_hipStreamDestroy (hashcat_ctx_t *hashcat_ctx, HIPstream hStream) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl; + HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip; - const cl_int CL_err = ocl->clEnqueueCopyBuffer (command_queue, src_buffer, dst_buffer, src_offset, dst_offset, size, num_events_in_wait_list, event_wait_list, event); + const HIPresult HIP_err = hip->hipStreamDestroy (hStream); - if (CL_err != CL_SUCCESS) + if (HIP_err != HIP_SUCCESS) { - event_log_error (hashcat_ctx, "clEnqueueCopyBuffer(): %s", val2cstr_cl (CL_err)); + const char *pStr = NULL; + + if (hip->hipGetErrorString (HIP_err, &pStr) == HIP_SUCCESS) + { + event_log_error (hashcat_ctx, "hipStreamDestroy(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "hipStreamDestroy(): %d", HIP_err); + } return -1; } @@ -2356,17 +2703,26 @@ int hc_clEnqueueCopyBuffer (hashcat_ctx_t *hashcat_ctx, 
cl_command_queue command return 0; } -int hc_clEnqueueReadBuffer (hashcat_ctx_t *hashcat_ctx, cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_read, size_t offset, size_t size, void *ptr, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event) +int hc_cuStreamSynchronize (hashcat_ctx_t *hashcat_ctx, CUstream hStream) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl; + CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - const cl_int CL_err = ocl->clEnqueueReadBuffer (command_queue, buffer, blocking_read, offset, size, ptr, num_events_in_wait_list, event_wait_list, event); + const CUresult CU_err = cuda->cuStreamSynchronize (hStream); - if (CL_err != CL_SUCCESS) + if (CU_err != CUDA_SUCCESS) { - event_log_error (hashcat_ctx, "clEnqueueReadBuffer(): %s", val2cstr_cl (CL_err)); + const char *pStr = NULL; + + if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + { + event_log_error (hashcat_ctx, "cuStreamSynchronize(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "cuStreamSynchronize(): %d", CU_err); + } return -1; } @@ -2374,17 +2730,26 @@ int hc_clEnqueueReadBuffer (hashcat_ctx_t *hashcat_ctx, cl_command_queue command return 0; } -int hc_clGetPlatformIDs (hashcat_ctx_t *hashcat_ctx, cl_uint num_entries, cl_platform_id *platforms, cl_uint *num_platforms) +int hc_hipStreamSynchronize (hashcat_ctx_t *hashcat_ctx, HIPstream hStream) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl; + HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip; - const cl_int CL_err = ocl->clGetPlatformIDs (num_entries, platforms, num_platforms); + const HIPresult HIP_err = hip->hipStreamSynchronize (hStream); - if (CL_err != CL_SUCCESS) + if (HIP_err != HIP_SUCCESS) { - event_log_error (hashcat_ctx, "clGetPlatformIDs(): %s", val2cstr_cl (CL_err)); + const char *pStr = NULL; + + if (hip->hipGetErrorString (HIP_err, &pStr) == HIP_SUCCESS) + { + event_log_error (hashcat_ctx, "hipStreamSynchronize(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "hipStreamSynchronize(): %d", HIP_err); + } return -1; } @@ -2392,17 +2757,26 @@ int hc_clGetPlatformIDs (hashcat_ctx_t *hashcat_ctx, cl_uint num_entries, cl_pla return 0; } -int hc_clGetPlatformInfo (hashcat_ctx_t *hashcat_ctx, cl_platform_id platform, cl_platform_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret) +int hc_cuLaunchKernel (hashcat_ctx_t *hashcat_ctx, CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl; + CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - const cl_int CL_err = ocl->clGetPlatformInfo (platform, param_name, param_value_size, param_value, param_value_size_ret); + const CUresult CU_err = cuda->cuLaunchKernel (f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams, extra); - if (CL_err != CL_SUCCESS) + if (CU_err != CUDA_SUCCESS) { - event_log_error (hashcat_ctx, "clGetPlatformInfo(): %s", val2cstr_cl (CL_err)); + const char *pStr = NULL; + + if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + { + event_log_error (hashcat_ctx, "cuLaunchKernel(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, 
"cuLaunchKernel(): %d", CU_err); + } return -1; } @@ -2410,17 +2784,26 @@ int hc_clGetPlatformInfo (hashcat_ctx_t *hashcat_ctx, cl_platform_id platform, c return 0; } -int hc_clGetDeviceIDs (hashcat_ctx_t *hashcat_ctx, cl_platform_id platform, cl_device_type device_type, cl_uint num_entries, cl_device_id *devices, cl_uint *num_devices) +int hc_hipLaunchKernel (hashcat_ctx_t *hashcat_ctx, HIPfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, HIPstream hStream, void **kernelParams, void **extra) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl; + HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip; - const cl_int CL_err = ocl->clGetDeviceIDs (platform, device_type, num_entries, devices, num_devices); + const HIPresult HIP_err = hip->hipLaunchKernel (f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams, extra); - if (CL_err != CL_SUCCESS) + if (HIP_err != HIP_SUCCESS) { - event_log_error (hashcat_ctx, "clGetDeviceIDs(): %s", val2cstr_cl (CL_err)); + const char *pStr = NULL; + + if (hip->hipGetErrorString (HIP_err, &pStr) == HIP_SUCCESS) + { + event_log_error (hashcat_ctx, "hipLaunchKernel(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "hipLaunchKernel(): %d", HIP_err); + } return -1; } @@ -2428,17 +2811,26 @@ int hc_clGetDeviceIDs (hashcat_ctx_t *hashcat_ctx, cl_platform_id platform, cl_d return 0; } -int hc_clGetDeviceInfo (hashcat_ctx_t *hashcat_ctx, cl_device_id device, cl_device_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret) +int hc_cuCtxSynchronize (hashcat_ctx_t *hashcat_ctx) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl; + CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - const cl_int CL_err = ocl->clGetDeviceInfo (device, param_name, param_value_size, param_value, param_value_size_ret); + const CUresult CU_err = cuda->cuCtxSynchronize (); - if (CL_err != CL_SUCCESS) + if (CU_err != CUDA_SUCCESS) { - event_log_error (hashcat_ctx, "clGetDeviceInfo(): %s", val2cstr_cl (CL_err)); + const char *pStr = NULL; + + if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + { + event_log_error (hashcat_ctx, "cuCtxSynchronize(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "cuCtxSynchronize(): %d", CU_err); + } return -1; } @@ -2446,19 +2838,26 @@ int hc_clGetDeviceInfo (hashcat_ctx_t *hashcat_ctx, cl_device_id device, cl_devi return 0; } -int hc_clCreateContext (hashcat_ctx_t *hashcat_ctx, const cl_context_properties *properties, cl_uint num_devices, const cl_device_id *devices, void (CL_CALLBACK *pfn_notify) (const char *errinfo, const void *private_info, size_t cb, void *user_data), void *user_data, cl_context *context) +int hc_hipCtxSynchronize (hashcat_ctx_t *hashcat_ctx) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl; - - cl_int CL_err; + HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip; - *context = ocl->clCreateContext (properties, num_devices, devices, pfn_notify, user_data, &CL_err); + const HIPresult HIP_err = hip->hipCtxSynchronize (); - if (CL_err != CL_SUCCESS) + if (HIP_err != HIP_SUCCESS) { - event_log_error (hashcat_ctx, "clCreateContext(): %s", val2cstr_cl (CL_err)); + const char *pStr = NULL; + + if (hip->hipGetErrorString (HIP_err, &pStr) == HIP_SUCCESS) + { + 
event_log_error (hashcat_ctx, "hipCtxSynchronize(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "hipCtxSynchronize(): %d", HIP_err); + } return -1; } @@ -2466,19 +2865,26 @@ int hc_clCreateContext (hashcat_ctx_t *hashcat_ctx, const cl_context_properties return 0; } -int hc_clCreateCommandQueue (hashcat_ctx_t *hashcat_ctx, cl_context context, cl_device_id device, cl_command_queue_properties properties, cl_command_queue *command_queue) +int hc_cuEventCreate (hashcat_ctx_t *hashcat_ctx, CUevent *phEvent, unsigned int Flags) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl; + CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - cl_int CL_err; - - *command_queue = ocl->clCreateCommandQueue (context, device, properties, &CL_err); + const CUresult CU_err = cuda->cuEventCreate (phEvent, Flags); - if (CL_err != CL_SUCCESS) + if (CU_err != CUDA_SUCCESS) { - event_log_error (hashcat_ctx, "clCreateCommandQueue(): %s", val2cstr_cl (CL_err)); + const char *pStr = NULL; + + if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + { + event_log_error (hashcat_ctx, "cuEventCreate(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "cuEventCreate(): %d", CU_err); + } return -1; } @@ -2486,19 +2892,26 @@ int hc_clCreateCommandQueue (hashcat_ctx_t *hashcat_ctx, cl_context context, cl_ return 0; } -int hc_clCreateBuffer (hashcat_ctx_t *hashcat_ctx, cl_context context, cl_mem_flags flags, size_t size, void *host_ptr, cl_mem *mem) +int hc_hipEventCreate (hashcat_ctx_t *hashcat_ctx, HIPevent *phEvent, unsigned int Flags) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl; - - cl_int CL_err; + HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip; - *mem = ocl->clCreateBuffer (context, flags, size, host_ptr, &CL_err); + const HIPresult HIP_err = hip->hipEventCreate (phEvent, Flags); - if (CL_err != CL_SUCCESS) + if (HIP_err != HIP_SUCCESS) { - event_log_error (hashcat_ctx, "clCreateBuffer(): %s", val2cstr_cl (CL_err)); + const char *pStr = NULL; + + if (hip->hipGetErrorString (HIP_err, &pStr) == HIP_SUCCESS) + { + event_log_error (hashcat_ctx, "hipEventCreate(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "hipEventCreate(): %d", HIP_err); + } return -1; } @@ -2506,19 +2919,26 @@ int hc_clCreateBuffer (hashcat_ctx_t *hashcat_ctx, cl_context context, cl_mem_fl return 0; } -int hc_clCreateProgramWithSource (hashcat_ctx_t *hashcat_ctx, cl_context context, cl_uint count, const char **strings, const size_t *lengths, cl_program *program) +int hc_cuEventDestroy (hashcat_ctx_t *hashcat_ctx, CUevent hEvent) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl; - - cl_int CL_err; + CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - *program = ocl->clCreateProgramWithSource (context, count, strings, lengths, &CL_err); + const CUresult CU_err = cuda->cuEventDestroy (hEvent); - if (CL_err != CL_SUCCESS) + if (CU_err != CUDA_SUCCESS) { - event_log_error (hashcat_ctx, "clCreateProgramWithSource(): %s", val2cstr_cl (CL_err)); + const char *pStr = NULL; + + if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + { + event_log_error (hashcat_ctx, "cuEventDestroy(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "cuEventDestroy(): %d", CU_err); + } return -1; } @@ -2526,19 +2946,26 @@ int hc_clCreateProgramWithSource (hashcat_ctx_t *hashcat_ctx, cl_context context return 0; } -int hc_clCreateProgramWithBinary (hashcat_ctx_t 
*hashcat_ctx, cl_context context, cl_uint num_devices, const cl_device_id *device_list, const size_t *lengths, const unsigned char **binaries, cl_int *binary_status, cl_program *program) +int hc_hipEventDestroy (hashcat_ctx_t *hashcat_ctx, HIPevent hEvent) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl; - - cl_int CL_err; + HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip; - *program = ocl->clCreateProgramWithBinary (context, num_devices, device_list, lengths, binaries, binary_status, &CL_err); + const HIPresult HIP_err = hip->hipEventDestroy (hEvent); - if (CL_err != CL_SUCCESS) + if (HIP_err != HIP_SUCCESS) { - event_log_error (hashcat_ctx, "clCreateProgramWithBinary(): %s", val2cstr_cl (CL_err)); + const char *pStr = NULL; + + if (hip->hipGetErrorString (HIP_err, &pStr) == HIP_SUCCESS) + { + event_log_error (hashcat_ctx, "hipEventDestroy(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "hipEventDestroy(): %d", HIP_err); + } return -1; } @@ -2546,17 +2973,26 @@ int hc_clCreateProgramWithBinary (hashcat_ctx_t *hashcat_ctx, cl_context context return 0; } -int hc_clBuildProgram (hashcat_ctx_t *hashcat_ctx, cl_program program, cl_uint num_devices, const cl_device_id *device_list, const char *options, void (CL_CALLBACK *pfn_notify) (cl_program program, void *user_data), void *user_data) +int hc_cuEventElapsedTime (hashcat_ctx_t *hashcat_ctx, float *pMilliseconds, CUevent hStart, CUevent hEnd) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl; + CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - const cl_int CL_err = ocl->clBuildProgram (program, num_devices, device_list, options, pfn_notify, user_data); + const CUresult CU_err = cuda->cuEventElapsedTime (pMilliseconds, hStart, hEnd); - if (CL_err != CL_SUCCESS) + if (CU_err != CUDA_SUCCESS) { - event_log_error (hashcat_ctx, "clBuildProgram(): %s", val2cstr_cl (CL_err)); + const char *pStr = NULL; + + if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + { + event_log_error (hashcat_ctx, "cuEventElapsedTime(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "cuEventElapsedTime(): %d", CU_err); + } return -1; } @@ -2564,19 +3000,26 @@ int hc_clBuildProgram (hashcat_ctx_t *hashcat_ctx, cl_program program, cl_uint n return 0; } -int hc_clCreateKernel (hashcat_ctx_t *hashcat_ctx, cl_program program, const char *kernel_name, cl_kernel *kernel) +int hc_hipEventElapsedTime (hashcat_ctx_t *hashcat_ctx, float *pMilliseconds, HIPevent hStart, HIPevent hEnd) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl; - - cl_int CL_err; + HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip; - *kernel = ocl->clCreateKernel (program, kernel_name, &CL_err); + const HIPresult HIP_err = hip->hipEventElapsedTime (pMilliseconds, hStart, hEnd); - if (CL_err != CL_SUCCESS) + if (HIP_err != HIP_SUCCESS) { - event_log_error (hashcat_ctx, "clCreateKernel(): %s", val2cstr_cl (CL_err)); + const char *pStr = NULL; + + if (hip->hipGetErrorString (HIP_err, &pStr) == HIP_SUCCESS) + { + event_log_error (hashcat_ctx, "hipEventElapsedTime(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "hipEventElapsedTime(): %d", HIP_err); + } return -1; } @@ -2584,17 +3027,26 @@ int hc_clCreateKernel (hashcat_ctx_t *hashcat_ctx, cl_program program, const cha return 0; } -int hc_clReleaseMemObject (hashcat_ctx_t *hashcat_ctx, cl_mem mem) +int hc_cuEventQuery (hashcat_ctx_t *hashcat_ctx, CUevent hEvent) { 
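/*
 * A sketch of timing a launch with the HIP event wrappers (create/elapsed
 * above, record/synchronize just below), matching how the CUDA path measures
 * kernel runtime for its speed counters. Helper name is illustrative.
 */

static int hip_time_span_sketch (hashcat_ctx_t *hashcat_ctx, HIPstream stream, float *ms)
{
  HIPevent evt_start;
  HIPevent evt_stop;

  if (hc_hipEventCreate (hashcat_ctx, &evt_start, 0) == -1) return -1;
  if (hc_hipEventCreate (hashcat_ctx, &evt_stop,  0) == -1) return -1;

  if (hc_hipEventRecord (hashcat_ctx, evt_start, stream) == -1) return -1;

  // ... enqueue the kernel on `stream` here ...

  if (hc_hipEventRecord (hashcat_ctx, evt_stop, stream) == -1) return -1;

  if (hc_hipEventSynchronize (hashcat_ctx, evt_stop) == -1) return -1;

  if (hc_hipEventElapsedTime (hashcat_ctx, ms, evt_start, evt_stop) == -1) return -1;

  if (hc_hipEventDestroy (hashcat_ctx, evt_start) == -1) return -1;

  return hc_hipEventDestroy (hashcat_ctx, evt_stop);
}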
backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl; + CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - const cl_int CL_err = ocl->clReleaseMemObject (mem); + const CUresult CU_err = cuda->cuEventQuery (hEvent); - if (CL_err != CL_SUCCESS) + if (CU_err != CUDA_SUCCESS) { - event_log_error (hashcat_ctx, "clReleaseMemObject(): %s", val2cstr_cl (CL_err)); + const char *pStr = NULL; + + if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + { + event_log_error (hashcat_ctx, "cuEventQuery(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "cuEventQuery(): %d", CU_err); + } return -1; } @@ -2602,17 +3054,26 @@ int hc_clReleaseMemObject (hashcat_ctx_t *hashcat_ctx, cl_mem mem) return 0; } -int hc_clReleaseKernel (hashcat_ctx_t *hashcat_ctx, cl_kernel kernel) +int hc_hipEventQuery (hashcat_ctx_t *hashcat_ctx, HIPevent hEvent) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl; + HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip; - const cl_int CL_err = ocl->clReleaseKernel (kernel); + const HIPresult HIP_err = hip->hipEventQuery (hEvent); - if (CL_err != CL_SUCCESS) + if (HIP_err != HIP_SUCCESS) { - event_log_error (hashcat_ctx, "clReleaseKernel(): %s", val2cstr_cl (CL_err)); + const char *pStr = NULL; + + if (hip->hipGetErrorString (HIP_err, &pStr) == HIP_SUCCESS) + { + event_log_error (hashcat_ctx, "hipEventQuery(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "hipEventQuery(): %d", HIP_err); + } return -1; } @@ -2620,17 +3081,26 @@ int hc_clReleaseKernel (hashcat_ctx_t *hashcat_ctx, cl_kernel kernel) return 0; } -int hc_clReleaseProgram (hashcat_ctx_t *hashcat_ctx, cl_program program) +int hc_cuEventRecord (hashcat_ctx_t *hashcat_ctx, CUevent hEvent, CUstream hStream) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl; + CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - const cl_int CL_err = ocl->clReleaseProgram (program); + const CUresult CU_err = cuda->cuEventRecord (hEvent, hStream); - if (CL_err != CL_SUCCESS) + if (CU_err != CUDA_SUCCESS) { - event_log_error (hashcat_ctx, "clReleaseProgram(): %s", val2cstr_cl (CL_err)); + const char *pStr = NULL; + + if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + { + event_log_error (hashcat_ctx, "cuEventRecord(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "cuEventRecord(): %d", CU_err); + } return -1; } @@ -2638,17 +3108,26 @@ int hc_clReleaseProgram (hashcat_ctx_t *hashcat_ctx, cl_program program) return 0; } -int hc_clReleaseCommandQueue (hashcat_ctx_t *hashcat_ctx, cl_command_queue command_queue) +int hc_hipEventRecord (hashcat_ctx_t *hashcat_ctx, HIPevent hEvent, HIPstream hStream) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl; + HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip; - const cl_int CL_err = ocl->clReleaseCommandQueue (command_queue); + const HIPresult HIP_err = hip->hipEventRecord (hEvent, hStream); - if (CL_err != CL_SUCCESS) + if (HIP_err != HIP_SUCCESS) { - event_log_error (hashcat_ctx, "clReleaseCommandQueue(): %s", val2cstr_cl (CL_err)); + const char *pStr = NULL; + + if (hip->hipGetErrorString (HIP_err, &pStr) == HIP_SUCCESS) + { + event_log_error (hashcat_ctx, "hipEventRecord(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "hipEventRecord(): %d", HIP_err); + } return -1; } @@ -2656,17 +3135,26 @@ int hc_clReleaseCommandQueue (hashcat_ctx_t *hashcat_ctx, 
cl_command_queue comma return 0; } -int hc_clReleaseContext (hashcat_ctx_t *hashcat_ctx, cl_context context) +int hc_cuEventSynchronize (hashcat_ctx_t *hashcat_ctx, CUevent hEvent) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl; + CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - const cl_int CL_err = ocl->clReleaseContext (context); + const CUresult CU_err = cuda->cuEventSynchronize (hEvent); - if (CL_err != CL_SUCCESS) + if (CU_err != CUDA_SUCCESS) { - event_log_error (hashcat_ctx, "clReleaseContext(): %s", val2cstr_cl (CL_err)); + const char *pStr = NULL; + + if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + { + event_log_error (hashcat_ctx, "cuEventSynchronize(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "cuEventSynchronize(): %d", CU_err); + } return -1; } @@ -2674,19 +3162,26 @@ int hc_clReleaseContext (hashcat_ctx_t *hashcat_ctx, cl_context context) return 0; } -int hc_clEnqueueMapBuffer (hashcat_ctx_t *hashcat_ctx, cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_map, cl_map_flags map_flags, size_t offset, size_t size, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event, void **buf) +int hc_hipEventSynchronize (hashcat_ctx_t *hashcat_ctx, HIPevent hEvent) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl; - - cl_int CL_err; + HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip; - *buf = ocl->clEnqueueMapBuffer (command_queue, buffer, blocking_map, map_flags, offset, size, num_events_in_wait_list, event_wait_list, event, &CL_err); + const HIPresult HIP_err = hip->hipEventSynchronize (hEvent); - if (CL_err != CL_SUCCESS) + if (HIP_err != HIP_SUCCESS) { - event_log_error (hashcat_ctx, "clEnqueueMapBuffer(): %s", val2cstr_cl (CL_err)); + const char *pStr = NULL; + + if (hip->hipGetErrorString (HIP_err, &pStr) == HIP_SUCCESS) + { + event_log_error (hashcat_ctx, "hipEventSynchronize(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "hipEventSynchronize(): %d", HIP_err); + } return -1; } @@ -2694,17 +3189,26 @@ int hc_clEnqueueMapBuffer (hashcat_ctx_t *hashcat_ctx, cl_command_queue command_ return 0; } -int hc_clEnqueueUnmapMemObject (hashcat_ctx_t *hashcat_ctx, cl_command_queue command_queue, cl_mem memobj, void *mapped_ptr, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event) +int hc_cuCtxSetCacheConfig (hashcat_ctx_t *hashcat_ctx, CUfunc_cache config) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl; + CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - const cl_int CL_err = ocl->clEnqueueUnmapMemObject (command_queue, memobj, mapped_ptr, num_events_in_wait_list, event_wait_list, event); + const CUresult CU_err = cuda->cuCtxSetCacheConfig (config); - if (CL_err != CL_SUCCESS) + if (CU_err != CUDA_SUCCESS) { - event_log_error (hashcat_ctx, "clEnqueueUnmapMemObject(): %s", val2cstr_cl (CL_err)); + const char *pStr = NULL; + + if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + { + event_log_error (hashcat_ctx, "cuCtxSetCacheConfig(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "cuCtxSetCacheConfig(): %d", CU_err); + } return -1; } @@ -2712,17 +3216,26 @@ int hc_clEnqueueUnmapMemObject (hashcat_ctx_t *hashcat_ctx, cl_command_queue com return 0; } -int hc_clGetKernelWorkGroupInfo (hashcat_ctx_t *hashcat_ctx, cl_kernel kernel, cl_device_id device, cl_kernel_work_group_info param_name, size_t 
param_value_size, void *param_value, size_t *param_value_size_ret) +int hc_hipCtxSetCacheConfig (hashcat_ctx_t *hashcat_ctx, HIPfunc_cache config) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl; + HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip; - const cl_int CL_err = ocl->clGetKernelWorkGroupInfo (kernel, device, param_name, param_value_size, param_value, param_value_size_ret); + const HIPresult HIP_err = hip->hipCtxSetCacheConfig (config); - if (CL_err != CL_SUCCESS) + if (HIP_err != HIP_SUCCESS) { - event_log_error (hashcat_ctx, "clGetKernelWorkGroupInfo(): %s", val2cstr_cl (CL_err)); + const char *pStr = NULL; + + if (hip->hipGetErrorString (HIP_err, &pStr) == HIP_SUCCESS) + { + event_log_error (hashcat_ctx, "hipCtxSetCacheConfig(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "hipCtxSetCacheConfig(): %d", HIP_err); + } return -1; } @@ -2730,17 +3243,26 @@ int hc_clGetKernelWorkGroupInfo (hashcat_ctx_t *hashcat_ctx, cl_kernel kernel, c return 0; } -int hc_clGetProgramBuildInfo (hashcat_ctx_t *hashcat_ctx, cl_program program, cl_device_id device, cl_program_build_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret) +int hc_cuCtxPushCurrent (hashcat_ctx_t *hashcat_ctx, CUcontext ctx) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl; + CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - const cl_int CL_err = ocl->clGetProgramBuildInfo (program, device, param_name, param_value_size, param_value, param_value_size_ret); + const CUresult CU_err = cuda->cuCtxPushCurrent (ctx); - if (CL_err != CL_SUCCESS) + if (CU_err != CUDA_SUCCESS) { - event_log_error (hashcat_ctx, "clGetProgramBuildInfo(): %s", val2cstr_cl (CL_err)); + const char *pStr = NULL; + + if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + { + event_log_error (hashcat_ctx, "cuCtxPushCurrent(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "cuCtxPushCurrent(): %d", CU_err); + } return -1; } @@ -2748,17 +3270,26 @@ int hc_clGetProgramBuildInfo (hashcat_ctx_t *hashcat_ctx, cl_program program, cl return 0; } -int hc_clGetProgramInfo (hashcat_ctx_t *hashcat_ctx, cl_program program, cl_program_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret) +int hc_hipCtxPushCurrent (hashcat_ctx_t *hashcat_ctx, HIPcontext ctx) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl; + HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip; - const cl_int CL_err = ocl->clGetProgramInfo (program, param_name, param_value_size, param_value, param_value_size_ret); + const HIPresult HIP_err = hip->hipCtxPushCurrent (ctx); - if (CL_err != CL_SUCCESS) + if (HIP_err != HIP_SUCCESS) { - event_log_error (hashcat_ctx, "clGetProgramInfo(): %s", val2cstr_cl (CL_err)); + const char *pStr = NULL; + + if (hip->hipGetErrorString (HIP_err, &pStr) == HIP_SUCCESS) + { + event_log_error (hashcat_ctx, "hipCtxPushCurrent(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "hipCtxPushCurrent(): %d", HIP_err); + } return -1; } @@ -2766,17 +3297,26 @@ int hc_clGetProgramInfo (hashcat_ctx_t *hashcat_ctx, cl_program program, cl_prog return 0; } -int hc_clWaitForEvents (hashcat_ctx_t *hashcat_ctx, cl_uint num_events, const cl_event *event_list) +int hc_cuCtxPopCurrent (hashcat_ctx_t *hashcat_ctx, CUcontext *pctx) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl; + 
CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - const cl_int CL_err = ocl->clWaitForEvents (num_events, event_list); + const CUresult CU_err = cuda->cuCtxPopCurrent (pctx); - if (CL_err != CL_SUCCESS) + if (CU_err != CUDA_SUCCESS) { - event_log_error (hashcat_ctx, "clWaitForEvents(): %s", val2cstr_cl (CL_err)); + const char *pStr = NULL; + + if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + { + event_log_error (hashcat_ctx, "cuCtxPopCurrent(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "cuCtxPopCurrent(): %d", CU_err); + } return -1; } @@ -2784,17 +3324,26 @@ int hc_clWaitForEvents (hashcat_ctx_t *hashcat_ctx, cl_uint num_events, const cl return 0; } -int hc_clGetEventProfilingInfo (hashcat_ctx_t *hashcat_ctx, cl_event event, cl_profiling_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret) +int hc_hipCtxPopCurrent (hashcat_ctx_t *hashcat_ctx, HIPcontext *pctx) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl; + HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip; - const cl_int CL_err = ocl->clGetEventProfilingInfo (event, param_name, param_value_size, param_value, param_value_size_ret); + const HIPresult HIP_err = hip->hipCtxPopCurrent (pctx); - if (CL_err != CL_SUCCESS) + if (HIP_err != HIP_SUCCESS) { - event_log_error (hashcat_ctx, "clGetEventProfilingInfo(): %s", val2cstr_cl (CL_err)); + const char *pStr = NULL; + + if (hip->hipGetErrorString (HIP_err, &pStr) == HIP_SUCCESS) + { + event_log_error (hashcat_ctx, "hipCtxPopCurrent(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "hipCtxPopCurrent(): %d", HIP_err); + } return -1; } @@ -2802,17 +3351,26 @@ int hc_clGetEventProfilingInfo (hashcat_ctx_t *hashcat_ctx, cl_event event, cl_p return 0; } -int hc_clReleaseEvent (hashcat_ctx_t *hashcat_ctx, cl_event event) +int hc_cuLinkCreate (hashcat_ctx_t *hashcat_ctx, unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl; + CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - const cl_int CL_err = ocl->clReleaseEvent (event); + const CUresult CU_err = cuda->cuLinkCreate (numOptions, options, optionValues, stateOut); - if (CL_err != CL_SUCCESS) + if (CU_err != CUDA_SUCCESS) { - event_log_error (hashcat_ctx, "clReleaseEvent(): %s", val2cstr_cl (CL_err)); + const char *pStr = NULL; + + if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + { + event_log_error (hashcat_ctx, "cuLinkCreate(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "cuLinkCreate(): %d", CU_err); + } return -1; } @@ -2820,5998 +3378,8899 @@ int hc_clReleaseEvent (hashcat_ctx_t *hashcat_ctx, cl_event event) return 0; } -// Backend - -int gidd_to_pw_t (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u64 gidd, pw_t *pw) +int hc_hipLinkCreate (hashcat_ctx_t *hashcat_ctx, unsigned int numOptions, HIPjit_option *options, void **optionValues, HIPlinkState *stateOut) { - pw_idx_t pw_idx; + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - pw_idx.off = 0; - pw_idx.cnt = 0; - pw_idx.len = 0; + HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip; - if (device_param->is_cuda == true) - { - if (hc_cuCtxPushCurrent (hashcat_ctx, device_param->cuda_context) == -1) return -1; + const HIPresult HIP_err = hip->hipLinkCreate (numOptions, options, optionValues, stateOut); - if (hc_cuMemcpyDtoH (hashcat_ctx, &pw_idx, device_param->cuda_d_pws_idx + 
(gidd * sizeof (pw_idx_t)), sizeof (pw_idx_t)) == -1) return -1; + if (HIP_err != HIP_SUCCESS) + { + const char *pStr = NULL; - if (hc_cuCtxPopCurrent (hashcat_ctx, &device_param->cuda_context) == -1) return -1; - } + if (hip->hipGetErrorString (HIP_err, &pStr) == HIP_SUCCESS) + { + event_log_error (hashcat_ctx, "hipLinkCreate(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "hipLinkCreate(): %d", HIP_err); + } - if (device_param->is_opencl == true) - { - if (hc_clEnqueueReadBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_idx, CL_TRUE, gidd * sizeof (pw_idx_t), sizeof (pw_idx_t), &pw_idx, 0, NULL, NULL) == -1) return -1; + return -1; } - const u32 off = pw_idx.off; - const u32 cnt = pw_idx.cnt; - const u32 len = pw_idx.len; + return 0; +} - if (device_param->is_cuda == true) - { - if (cnt > 0) - { - if (hc_cuCtxPushCurrent (hashcat_ctx, device_param->cuda_context) == -1) return -1; +int hc_cuLinkAddData (hashcat_ctx_t *hashcat_ctx, CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name, unsigned int numOptions, CUjit_option *options, void **optionValues) +{ + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - if (hc_cuMemcpyDtoH (hashcat_ctx,pw->i, device_param->cuda_d_pws_comp_buf + (off * sizeof (u32)), cnt * sizeof (u32)) == -1) return -1; + CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - if (hc_cuCtxPopCurrent (hashcat_ctx, &device_param->cuda_context) == -1) return -1; - } - } + const CUresult CU_err = cuda->cuLinkAddData (state, type, data, size, name, numOptions, options, optionValues); - if (device_param->is_opencl == true) + if (CU_err != CUDA_SUCCESS) { - if (cnt > 0) + const char *pStr = NULL; + + if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) { - if (hc_clEnqueueReadBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_comp_buf, CL_TRUE, off * sizeof (u32), cnt * sizeof (u32), pw->i, 0, NULL, NULL) == -1) return -1; + event_log_error (hashcat_ctx, "cuLinkAddData(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "cuLinkAddData(): %d", CU_err); } - } - for (u32 i = cnt; i < 64; i++) - { - pw->i[i] = 0; + return -1; } - pw->pw_len = len; - return 0; } -int choose_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u32 highest_pw_len, const u64 pws_cnt, const u32 fast_iteration, const u32 salt_pos) +int hc_hipLinkAddData (hashcat_ctx_t *hashcat_ctx, HIPlinkState state, HIPjitInputType type, void *data, size_t size, const char *name, unsigned int numOptions, HIPjit_option *options, void **optionValues) { - hashconfig_t *hashconfig = hashcat_ctx->hashconfig; - hashes_t *hashes = hashcat_ctx->hashes; - module_ctx_t *module_ctx = hashcat_ctx->module_ctx; - status_ctx_t *status_ctx = hashcat_ctx->status_ctx; - user_options_t *user_options = hashcat_ctx->user_options; + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - if (user_options->stdout_flag == true) - { - return process_stdout (hashcat_ctx, device_param, pws_cnt); - } + HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip; - if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL) + const HIPresult HIP_err = hip->hipLinkAddData (state, type, data, size, name, numOptions, options, optionValues); + + if (HIP_err != HIP_SUCCESS) { - if (user_options->attack_mode == ATTACK_MODE_BF) + const char *pStr = NULL; + + if (hip->hipGetErrorString (HIP_err, &pStr) == HIP_SUCCESS) { - if (user_options->slow_candidates == true) - { - } - else - { - if (hashconfig->opts_type & 
OPTS_TYPE_TM_KERNEL) - { - const u32 size_tm = device_param->size_tm; + event_log_error (hashcat_ctx, "hipLinkAddData(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "hipLinkAddData(): %d", HIP_err); + } - if (device_param->is_cuda == true) - { - if (run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_tm_c, size_tm) == -1) return -1; - } + return -1; + } - if (device_param->is_opencl == true) - { - if (run_opencl_kernel_bzero (hashcat_ctx, device_param, device_param->opencl_d_tm_c, size_tm) == -1) return -1; - } + return 0; +} - if (run_kernel_tm (hashcat_ctx, device_param) == -1) return -1; +int hc_cuLinkDestroy (hashcat_ctx_t *hashcat_ctx, CUlinkState state) +{ + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - if (device_param->is_cuda == true) - { - if (hc_cuMemcpyDtoD (hashcat_ctx, device_param->cuda_d_bfs_c, device_param->cuda_d_tm_c, size_tm) == -1) return -1; - } + CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - if (device_param->is_opencl == true) - { - if (hc_clEnqueueCopyBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_tm_c, device_param->opencl_d_bfs_c, 0, 0, size_tm, 0, NULL, NULL) == -1) return -1; - } - } - } - } + const CUresult CU_err = cuda->cuLinkDestroy (state); - if (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL) + if (CU_err != CUDA_SUCCESS) + { + const char *pStr = NULL; + + if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) { - if (highest_pw_len < 16) - { - if (run_kernel (hashcat_ctx, device_param, KERN_RUN_1, pws_cnt, true, fast_iteration) == -1) return -1; - } - else if (highest_pw_len < 32) - { - if (run_kernel (hashcat_ctx, device_param, KERN_RUN_2, pws_cnt, true, fast_iteration) == -1) return -1; - } - else - { - if (run_kernel (hashcat_ctx, device_param, KERN_RUN_3, pws_cnt, true, fast_iteration) == -1) return -1; - } + event_log_error (hashcat_ctx, "cuLinkDestroy(): %s", pStr); } else { - if (run_kernel (hashcat_ctx, device_param, KERN_RUN_4, pws_cnt, true, fast_iteration) == -1) return -1; + event_log_error (hashcat_ctx, "cuLinkDestroy(): %d", CU_err); } - } - else - { - bool run_init = true; - bool run_loop = true; - bool run_comp = true; - if (run_init == true) - { - if (device_param->is_cuda == true) - { - if (hc_cuMemcpyDtoD (hashcat_ctx, device_param->cuda_d_pws_buf, device_param->cuda_d_pws_amp_buf, pws_cnt * sizeof (pw_t)) == -1) return -1; - } + return -1; + } - if (device_param->is_opencl == true) - { - if (hc_clEnqueueCopyBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_amp_buf, device_param->opencl_d_pws_buf, 0, 0, pws_cnt * sizeof (pw_t), 0, NULL, NULL) == -1) return -1; - } + return 0; +} - if (user_options->slow_candidates == true) - { - } - else - { - if (run_kernel_amp (hashcat_ctx, device_param, pws_cnt) == -1) return -1; - } +int hc_hipLinkDestroy (hashcat_ctx_t *hashcat_ctx, HIPlinkState state) +{ + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - if (run_kernel (hashcat_ctx, device_param, KERN_RUN_1, pws_cnt, false, 0) == -1) return -1; + HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip; - if (hashconfig->opts_type & OPTS_TYPE_HOOK12) - { - if (run_kernel (hashcat_ctx, device_param, KERN_RUN_12, pws_cnt, false, 0) == -1) return -1; + const HIPresult HIP_err = hip->hipLinkDestroy (state); - if (device_param->is_cuda == true) - { - if (hc_cuMemcpyDtoH (hashcat_ctx, device_param->hooks_buf, device_param->cuda_d_hooks, pws_cnt * hashconfig->hook_size) == -1) return -1; - } + if (HIP_err != HIP_SUCCESS) + { + const 
char *pStr = NULL; - if (device_param->is_opencl == true) - { - if (hc_clEnqueueReadBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_hooks, CL_TRUE, 0, pws_cnt * hashconfig->hook_size, device_param->hooks_buf, 0, NULL, NULL) == -1) return -1; - } + if (hip->hipGetErrorString (HIP_err, &pStr) == HIP_SUCCESS) + { + event_log_error (hashcat_ctx, "hipLinkDestroy(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "hipLinkDestroy(): %d", HIP_err); + } - const int hook_threads = (int) user_options->hook_threads; + return -1; + } - hook_thread_param_t *hook_threads_param = (hook_thread_param_t *) hccalloc (hook_threads, sizeof (hook_thread_param_t)); + return 0; +} - for (int i = 0; i < hook_threads; i++) - { - hook_thread_param_t *hook_thread_param = hook_threads_param + i; +int hc_cuLinkComplete (hashcat_ctx_t *hashcat_ctx, CUlinkState state, void **cubinOut, size_t *sizeOut) +{ + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - hook_thread_param->tid = i; - hook_thread_param->tsz = hook_threads; + CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - hook_thread_param->module_ctx = module_ctx; - hook_thread_param->status_ctx = status_ctx; + const CUresult CU_err = cuda->cuLinkComplete (state, cubinOut, sizeOut); - hook_thread_param->device_param = device_param; + if (CU_err != CUDA_SUCCESS) + { + const char *pStr = NULL; - hook_thread_param->hook_salts_buf = hashes->hook_salts_buf; + if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + { + event_log_error (hashcat_ctx, "cuLinkComplete(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "cuLinkComplete(): %d", CU_err); + } - hook_thread_param->salt_pos = salt_pos; + return -1; + } - hook_thread_param->pws_cnt = pws_cnt; - } + return 0; +} - hc_thread_t *c_threads = (hc_thread_t *) hccalloc (hook_threads, sizeof (hc_thread_t)); +int hc_hipLinkComplete (hashcat_ctx_t *hashcat_ctx, HIPlinkState state, void **hipbinOut, size_t *sizeOut) +{ + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - for (int i = 0; i < hook_threads; i++) - { - hook_thread_param_t *hook_thread_param = hook_threads_param + i; + HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip; - hc_thread_create (c_threads[i], hook12_thread, hook_thread_param); - } + const HIPresult HIP_err = hip->hipLinkComplete (state, hipbinOut, sizeOut); - hc_thread_wait (hook_threads, c_threads); + if (HIP_err != HIP_SUCCESS) + { + const char *pStr = NULL; - hcfree (c_threads); + if (hip->hipGetErrorString (HIP_err, &pStr) == HIP_SUCCESS) + { + event_log_error (hashcat_ctx, "hipLinkComplete(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "hipLinkComplete(): %d", HIP_err); + } - hcfree (hook_threads_param); + return -1; + } - if (device_param->is_cuda == true) - { - if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_hooks, device_param->hooks_buf, pws_cnt * hashconfig->hook_size) == -1) return -1; - } + return 0; +} - if (device_param->is_opencl == true) - { - if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_hooks, CL_TRUE, 0, pws_cnt * hashconfig->hook_size, device_param->hooks_buf, 0, NULL, NULL) == -1) return -1; - } - } - } +// OpenCL - if (run_loop == true) - { - u32 iter = hashes->salts_buf[salt_pos].salt_iter; +int ocl_init (hashcat_ctx_t *hashcat_ctx) +{ + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - u32 loop_step = device_param->kernel_loops; + OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl; - for (u32 loop_pos = 0, slow_iteration = 0; loop_pos < iter; 
loop_pos += loop_step, slow_iteration++) - { - u32 loop_left = iter - loop_pos; + memset (ocl, 0, sizeof (OCL_PTR)); - loop_left = MIN (loop_left, loop_step); + #if defined (_WIN) + ocl->lib = hc_dlopen ("OpenCL"); + #elif defined (__APPLE__) + ocl->lib = hc_dlopen ("/System/Library/Frameworks/OpenCL.framework/OpenCL"); + #elif defined (__CYGWIN__) + ocl->lib = hc_dlopen ("opencl.dll"); - device_param->kernel_params_buf32[28] = loop_pos; - device_param->kernel_params_buf32[29] = loop_left; + if (ocl->lib == NULL) ocl->lib = hc_dlopen ("cygOpenCL-1.dll"); + #else + ocl->lib = hc_dlopen ("libOpenCL.so"); - if (run_kernel (hashcat_ctx, device_param, KERN_RUN_2, pws_cnt, true, slow_iteration) == -1) return -1; + if (ocl->lib == NULL) ocl->lib = hc_dlopen ("libOpenCL.so.1"); + #endif - if (hashconfig->opts_type & OPTS_TYPE_LOOP_EXTENDED) - { - if (run_kernel (hashcat_ctx, device_param, KERN_RUN_2E, pws_cnt, true, slow_iteration) == -1) return -1; - } + if (ocl->lib == NULL) return -1; - //bug? - //while (status_ctx->run_thread_level2 == false) break; - if (status_ctx->run_thread_level2 == false) break; + HC_LOAD_FUNC (ocl, clBuildProgram, OCL_CLBUILDPROGRAM, OpenCL, 1); + HC_LOAD_FUNC (ocl, clCreateBuffer, OCL_CLCREATEBUFFER, OpenCL, 1); + HC_LOAD_FUNC (ocl, clCreateCommandQueue, OCL_CLCREATECOMMANDQUEUE, OpenCL, 1); + HC_LOAD_FUNC (ocl, clCreateContext, OCL_CLCREATECONTEXT, OpenCL, 1); + HC_LOAD_FUNC (ocl, clCreateKernel, OCL_CLCREATEKERNEL, OpenCL, 1); + HC_LOAD_FUNC (ocl, clCreateProgramWithBinary, OCL_CLCREATEPROGRAMWITHBINARY, OpenCL, 1); + HC_LOAD_FUNC (ocl, clCreateProgramWithSource, OCL_CLCREATEPROGRAMWITHSOURCE, OpenCL, 1); + HC_LOAD_FUNC (ocl, clEnqueueCopyBuffer, OCL_CLENQUEUECOPYBUFFER, OpenCL, 1); + HC_LOAD_FUNC (ocl, clEnqueueMapBuffer, OCL_CLENQUEUEMAPBUFFER, OpenCL, 1); + HC_LOAD_FUNC (ocl, clEnqueueNDRangeKernel, OCL_CLENQUEUENDRANGEKERNEL, OpenCL, 1); + HC_LOAD_FUNC (ocl, clEnqueueReadBuffer, OCL_CLENQUEUEREADBUFFER, OpenCL, 1); + HC_LOAD_FUNC (ocl, clEnqueueUnmapMemObject, OCL_CLENQUEUEUNMAPMEMOBJECT, OpenCL, 1); + HC_LOAD_FUNC (ocl, clEnqueueWriteBuffer, OCL_CLENQUEUEWRITEBUFFER, OpenCL, 1); + HC_LOAD_FUNC (ocl, clFinish, OCL_CLFINISH, OpenCL, 1); + HC_LOAD_FUNC (ocl, clFlush, OCL_CLFLUSH, OpenCL, 1); + HC_LOAD_FUNC (ocl, clGetDeviceIDs, OCL_CLGETDEVICEIDS, OpenCL, 1); + HC_LOAD_FUNC (ocl, clGetDeviceInfo, OCL_CLGETDEVICEINFO, OpenCL, 1); + HC_LOAD_FUNC (ocl, clGetEventInfo, OCL_CLGETEVENTINFO, OpenCL, 1); + HC_LOAD_FUNC (ocl, clGetKernelWorkGroupInfo, OCL_CLGETKERNELWORKGROUPINFO, OpenCL, 1); + HC_LOAD_FUNC (ocl, clGetPlatformIDs, OCL_CLGETPLATFORMIDS, OpenCL, 1); + HC_LOAD_FUNC (ocl, clGetPlatformInfo, OCL_CLGETPLATFORMINFO, OpenCL, 1); + HC_LOAD_FUNC (ocl, clGetProgramBuildInfo, OCL_CLGETPROGRAMBUILDINFO, OpenCL, 1); + HC_LOAD_FUNC (ocl, clGetProgramInfo, OCL_CLGETPROGRAMINFO, OpenCL, 1); + HC_LOAD_FUNC (ocl, clReleaseCommandQueue, OCL_CLRELEASECOMMANDQUEUE, OpenCL, 1); + HC_LOAD_FUNC (ocl, clReleaseContext, OCL_CLRELEASECONTEXT, OpenCL, 1); + HC_LOAD_FUNC (ocl, clReleaseKernel, OCL_CLRELEASEKERNEL, OpenCL, 1); + HC_LOAD_FUNC (ocl, clReleaseMemObject, OCL_CLRELEASEMEMOBJECT, OpenCL, 1); + HC_LOAD_FUNC (ocl, clReleaseProgram, OCL_CLRELEASEPROGRAM, OpenCL, 1); + HC_LOAD_FUNC (ocl, clSetKernelArg, OCL_CLSETKERNELARG, OpenCL, 1); + HC_LOAD_FUNC (ocl, clWaitForEvents, OCL_CLWAITFOREVENTS, OpenCL, 1); + HC_LOAD_FUNC (ocl, clGetEventProfilingInfo, OCL_CLGETEVENTPROFILINGINFO, OpenCL, 1); + HC_LOAD_FUNC (ocl, clReleaseEvent, OCL_CLRELEASEEVENT, OpenCL, 1); - /** - * speed - */ + 
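+
+  // if we reach this point, HC_LOAD_FUNC resolved every required OpenCL symbol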
return 0; +} - const float iter_part = (float) (loop_pos + loop_left) / iter; +void ocl_close (hashcat_ctx_t *hashcat_ctx) +{ + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - const u64 perf_sum_all = (u64) (pws_cnt * iter_part); + OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl; - double speed_msec = hc_timer_get (device_param->timer_speed); + if (ocl) + { + if (ocl->lib) + { + hc_dlclose (ocl->lib); + } - const u32 speed_pos = device_param->speed_pos; + hcfree (backend_ctx->ocl); - device_param->speed_cnt[speed_pos] = perf_sum_all; + backend_ctx->ocl = NULL; + } +} - device_param->speed_msec[speed_pos] = speed_msec; +int hc_clEnqueueNDRangeKernel (hashcat_ctx_t *hashcat_ctx, cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *global_work_offset, const size_t *global_work_size, const size_t *local_work_size, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event) +{ + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - if (user_options->speed_only == true) - { - if (speed_msec > 4000) - { - device_param->outerloop_multi *= (double) iter / (double) (loop_pos + loop_left); + OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl; - device_param->speed_pos = 1; + const cl_int CL_err = ocl->clEnqueueNDRangeKernel (command_queue, kernel, work_dim, global_work_offset, global_work_size, local_work_size, num_events_in_wait_list, event_wait_list, event); - device_param->speed_only_finish = true; + if (CL_err != CL_SUCCESS) + { + event_log_error (hashcat_ctx, "clEnqueueNDRangeKernel(): %s", val2cstr_cl (CL_err)); - return 0; - } - } - } + return -1; + } - if (hashconfig->opts_type & OPTS_TYPE_HOOK23) - { - if (run_kernel (hashcat_ctx, device_param, KERN_RUN_23, pws_cnt, false, 0) == -1) return -1; + return 0; +} - if (device_param->is_cuda == true) - { - if (hc_cuMemcpyDtoH (hashcat_ctx, device_param->hooks_buf, device_param->cuda_d_hooks, pws_cnt * hashconfig->hook_size) == -1) return -1; - } +int hc_clGetEventInfo (hashcat_ctx_t *hashcat_ctx, cl_event event, cl_event_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret) +{ + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - if (device_param->is_opencl == true) - { - if (hc_clEnqueueReadBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_hooks, CL_TRUE, 0, pws_cnt * hashconfig->hook_size, device_param->hooks_buf, 0, NULL, NULL) == -1) return -1; - } + OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl; - const int hook_threads = (int) user_options->hook_threads; + const cl_int CL_err = ocl->clGetEventInfo (event, param_name, param_value_size, param_value, param_value_size_ret); - hook_thread_param_t *hook_threads_param = (hook_thread_param_t *) hccalloc (hook_threads, sizeof (hook_thread_param_t)); + if (CL_err != CL_SUCCESS) + { + event_log_error (hashcat_ctx, "clGetEventInfo(): %s", val2cstr_cl (CL_err)); - for (int i = 0; i < hook_threads; i++) - { - hook_thread_param_t *hook_thread_param = hook_threads_param + i; + return -1; + } - hook_thread_param->tid = i; - hook_thread_param->tsz = hook_threads; + return 0; +} - hook_thread_param->module_ctx = module_ctx; - hook_thread_param->status_ctx = status_ctx; +int hc_clFlush (hashcat_ctx_t *hashcat_ctx, cl_command_queue command_queue) +{ + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - hook_thread_param->device_param = device_param; + OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl; - hook_thread_param->hook_salts_buf = hashes->hook_salts_buf; + const cl_int CL_err = 
ocl->clFlush (command_queue); - hook_thread_param->salt_pos = salt_pos; + if (CL_err != CL_SUCCESS) { event_log_error (hashcat_ctx, "clFlush(): %s", val2cstr_cl (CL_err)); - hook_thread_param->pws_cnt = pws_cnt; - } + return -1; + } - hc_thread_t *c_threads = (hc_thread_t *) hccalloc (hook_threads, sizeof (hc_thread_t)); + return 0; +} - for (int i = 0; i < hook_threads; i++) - { - hook_thread_param_t *hook_thread_param = hook_threads_param + i; +int hc_clFinish (hashcat_ctx_t *hashcat_ctx, cl_command_queue command_queue) +{ + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - hc_thread_create (c_threads[i], hook23_thread, hook_thread_param); - } + OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl; - hc_thread_wait (hook_threads, c_threads); + const cl_int CL_err = ocl->clFinish (command_queue); - hcfree (c_threads); + if (CL_err != CL_SUCCESS) + { + event_log_error (hashcat_ctx, "clFinish(): %s", val2cstr_cl (CL_err)); - hcfree (hook_threads_param); + return -1; + } - if (device_param->is_cuda == true) - { - if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_hooks, device_param->hooks_buf, pws_cnt * hashconfig->hook_size) == -1) return -1; - } + return 0; +} - if (device_param->is_opencl == true) - { - if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_hooks, CL_TRUE, 0, pws_cnt * hashconfig->hook_size, device_param->hooks_buf, 0, NULL, NULL) == -1) return -1; - } - } - } +int hc_clSetKernelArg (hashcat_ctx_t *hashcat_ctx, cl_kernel kernel, cl_uint arg_index, size_t arg_size, const void *arg_value) +{ + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - // init2 and loop2 are kind of special, we use run_loop for them, too + OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl; - if (run_loop == true) - { - // note: they also do not influence the performance screen - // in case you want to use this, this can make sense only if your input data comes out of tmps[] + const cl_int CL_err = ocl->clSetKernelArg (kernel, arg_index, arg_size, arg_value); - if (hashconfig->opts_type & OPTS_TYPE_INIT2) - { - if (run_kernel (hashcat_ctx, device_param, KERN_RUN_INIT2, pws_cnt, false, 0) == -1) return -1; - } + if (CL_err != CL_SUCCESS) + { + event_log_error (hashcat_ctx, "clSetKernelArg(): %s", val2cstr_cl (CL_err)); - if (hashconfig->opts_type & OPTS_TYPE_LOOP2) - { - u32 iter = hashes->salts_buf[salt_pos].salt_iter2; + return -1; + } - u32 loop_step = device_param->kernel_loops; + return 0; +} - for (u32 loop_pos = 0, slow_iteration = 0; loop_pos < iter; loop_pos += loop_step, slow_iteration++) - { - u32 loop_left = iter - loop_pos; +int hc_clEnqueueWriteBuffer (hashcat_ctx_t *hashcat_ctx, cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_write, size_t offset, size_t size, const void *ptr, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event) +{ + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - loop_left = MIN (loop_left, loop_step); + OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl; - device_param->kernel_params_buf32[28] = loop_pos; - device_param->kernel_params_buf32[29] = loop_left; + const cl_int CL_err = ocl->clEnqueueWriteBuffer (command_queue, buffer, blocking_write, offset, size, ptr, num_events_in_wait_list, event_wait_list, event); - if (run_kernel (hashcat_ctx, device_param, KERN_RUN_LOOP2, pws_cnt, true, slow_iteration) == -1) return -1; + if (CL_err != CL_SUCCESS) + { + event_log_error (hashcat_ctx, "clEnqueueWriteBuffer(): %s", val2cstr_cl (CL_err)); - //bug? 
- //while (status_ctx->run_thread_level2 == false) break; - if (status_ctx->run_thread_level2 == false) break; - } - } - } + return -1; + } - if (run_comp == true) - { - if (hashconfig->opts_type & OPTS_TYPE_DEEP_COMP_KERNEL) - { - const u32 loops_cnt = hashes->salts_buf[salt_pos].digests_cnt; + return 0; +} - for (u32 loops_pos = 0; loops_pos < loops_cnt; loops_pos++) - { - device_param->kernel_params_buf32[28] = loops_pos; - device_param->kernel_params_buf32[29] = loops_cnt; +int hc_clEnqueueCopyBuffer (hashcat_ctx_t *hashcat_ctx, cl_command_queue command_queue, cl_mem src_buffer, cl_mem dst_buffer, size_t src_offset, size_t dst_offset, size_t size, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event) +{ + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - const u32 deep_comp_kernel = module_ctx->module_deep_comp_kernel (hashes, salt_pos, loops_pos); + OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl; - if (run_kernel (hashcat_ctx, device_param, deep_comp_kernel, pws_cnt, false, 0) == -1) return -1; + const cl_int CL_err = ocl->clEnqueueCopyBuffer (command_queue, src_buffer, dst_buffer, src_offset, dst_offset, size, num_events_in_wait_list, event_wait_list, event); - if (status_ctx->run_thread_level2 == false) break; - } - } - else - { - if (run_kernel (hashcat_ctx, device_param, KERN_RUN_3, pws_cnt, false, 0) == -1) return -1; - } - } + if (CL_err != CL_SUCCESS) + { + event_log_error (hashcat_ctx, "clEnqueueCopyBuffer(): %s", val2cstr_cl (CL_err)); - /* - * maybe we should add this zero of temporary buffers - * however it drops the performance from 7055338 to 7010621 + return -1; + } - if (device_param->is_cuda == true) - { - if (run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_tmps, device_param->size_tmps) == -1) return -1; - } + return 0; +} - if (device_param->is_opencl == true) - { - if (run_opencl_kernel_bzero (hashcat_ctx, device_param, device_param->opencl_d_tmps, device_param->size_tmps) == -1) return -1; - } - */ +int hc_clEnqueueReadBuffer (hashcat_ctx_t *hashcat_ctx, cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_read, size_t offset, size_t size, void *ptr, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event) +{ + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - if ((hashconfig->opts_type & OPTS_TYPE_HOOK12) || (hashconfig->opts_type & OPTS_TYPE_HOOK23)) - { - if (device_param->is_cuda == true) - { - if (run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_hooks, pws_cnt * hashconfig->hook_size) == -1) return -1; - } + OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl; - if (device_param->is_opencl == true) - { - if (run_opencl_kernel_bzero (hashcat_ctx, device_param, device_param->opencl_d_hooks, pws_cnt * hashconfig->hook_size) == -1) return -1; - } - } + const cl_int CL_err = ocl->clEnqueueReadBuffer (command_queue, buffer, blocking_read, offset, size, ptr, num_events_in_wait_list, event_wait_list, event); + + if (CL_err != CL_SUCCESS) + { + event_log_error (hashcat_ctx, "clEnqueueReadBuffer(): %s", val2cstr_cl (CL_err)); + + return -1; } return 0; } -void rebuild_pws_compressed_append (hc_device_param_t *device_param, const u64 pws_cnt, const u8 chr) +int hc_clGetPlatformIDs (hashcat_ctx_t *hashcat_ctx, cl_uint num_entries, cl_platform_id *platforms, cl_uint *num_platforms) { - // this function is used if we have to modify the compressed pws buffer in order to - // append some data to each password candidate + backend_ctx_t *backend_ctx = 
hashcat_ctx->backend_ctx; - // this function is used if we have to modify the compressed pws buffer in order to - // append some data to each password candidate + OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl; - u32 *tmp_pws_comp = (u32 *) hcmalloc (device_param->size_pws_comp); - pw_idx_t *tmp_pws_idx = (pw_idx_t *) hcmalloc (device_param->size_pws_idx); + const cl_int CL_err = ocl->clGetPlatformIDs (num_entries, platforms, num_platforms); - for (u32 i = 0; i < pws_cnt; i++) - { - pw_idx_t *pw_idx_src = device_param->pws_idx + i; - pw_idx_t *pw_idx_dst = tmp_pws_idx + i; + if (CL_err != CL_SUCCESS) + { + event_log_error (hashcat_ctx, "clGetPlatformIDs(): %s", val2cstr_cl (CL_err)); - const u32 src_off = pw_idx_src->off; - const u32 src_len = pw_idx_src->len; + return -1; + } - u8 buf[256]; + return 0; +} - memcpy (buf, device_param->pws_comp + src_off, src_len); +int hc_clGetPlatformInfo (hashcat_ctx_t *hashcat_ctx, cl_platform_id platform, cl_platform_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret) +{ + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - buf[src_len] = chr; + OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl; - const u32 dst_len = src_len + 1; + const cl_int CL_err = ocl->clGetPlatformInfo (platform, param_name, param_value_size, param_value, param_value_size_ret); - const u32 dst_pw_len4 = (dst_len + 3) & ~3; // round up to multiple of 4 + if (CL_err != CL_SUCCESS) + { + event_log_error (hashcat_ctx, "clGetPlatformInfo(): %s", val2cstr_cl (CL_err)); - const u32 dst_pw_len4_cnt = dst_pw_len4 / 4; + return -1; + } - pw_idx_dst->cnt = dst_pw_len4_cnt; - pw_idx_dst->len = src_len; // this is intentional! src_len can not be dst_len, we don't want the kernel to think 0x80 is part of the password + return 0; +} - u8 *dst = (u8 *) (tmp_pws_comp + pw_idx_dst->off); +int hc_clGetDeviceIDs (hashcat_ctx_t *hashcat_ctx, cl_platform_id platform, cl_device_type device_type, cl_uint num_entries, cl_device_id *devices, cl_uint *num_devices) +{ + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - memcpy (dst, buf, dst_len); + OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl; - memset (dst + dst_len, 0, dst_pw_len4 - dst_len); + const cl_int CL_err = ocl->clGetDeviceIDs (platform, device_type, num_entries, devices, num_devices); - // prepare next element + if (CL_err != CL_SUCCESS) + { + event_log_error (hashcat_ctx, "clGetDeviceIDs(): %s", val2cstr_cl (CL_err)); - pw_idx_t *pw_idx_dst_next = pw_idx_dst + 1; + return -1; } - pw_idx_dst_next->off = pw_idx_dst->off + pw_idx_dst->cnt; - } - memcpy (device_param->pws_comp, tmp_pws_comp, device_param->size_pws_comp); - memcpy (device_param->pws_idx, tmp_pws_idx, device_param->size_pws_idx); - - hcfree (tmp_pws_comp); - hcfree (tmp_pws_idx); + return 0; } -int run_cuda_kernel_atinit (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, CUdeviceptr buf, const u64 num) +int hc_clGetDeviceInfo (hashcat_ctx_t *hashcat_ctx, cl_device_id device, cl_device_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret) { - u64 num_elements = num; - - device_param->kernel_params_atinit[0] = (void *) &buf; - device_param->kernel_params_atinit_buf64[1] = num_elements; - - const u64 kernel_threads = device_param->kernel_wgs_atinit; + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - num_elements = CEILDIV (num_elements, kernel_threads); + OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl; - CUfunction function = device_param->cuda_function_atinit; + const cl_int CL_err = ocl->clGetDeviceInfo (device, param_name, param_value_size, param_value, param_value_size_ret); - if (hc_cuLaunchKernel (hashcat_ctx, function, 
num_elements, 1, 1, kernel_threads, 1, 1, 0, device_param->cuda_stream, device_param->kernel_params_atinit, NULL) == -1) return -1; + if (CL_err != CL_SUCCESS) + { + event_log_error (hashcat_ctx, "clGetDeviceInfo(): %s", val2cstr_cl (CL_err)); - if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1; + return -1; + } return 0; } -int run_cuda_kernel_memset (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, CUdeviceptr buf, const u32 value, const u64 size) +int hc_clCreateContext (hashcat_ctx_t *hashcat_ctx, const cl_context_properties *properties, cl_uint num_devices, const cl_device_id *devices, void (CL_CALLBACK *pfn_notify) (const char *errinfo, const void *private_info, size_t cb, void *user_data), void *user_data, cl_context *context) { - const u64 num16d = size / 16; - const u64 num16m = size % 16; - - if (num16d) - { - device_param->kernel_params_memset[0] = (void *) &buf; - device_param->kernel_params_memset_buf32[1] = value; - device_param->kernel_params_memset_buf64[2] = num16d; + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - const u64 kernel_threads = device_param->kernel_wgs_memset; + OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl; - u64 num_elements = num16d; + cl_int CL_err; - num_elements = CEILDIV (num_elements, kernel_threads); + *context = ocl->clCreateContext (properties, num_devices, devices, pfn_notify, user_data, &CL_err); - CUfunction function = device_param->cuda_function_memset; + if (CL_err != CL_SUCCESS) + { + event_log_error (hashcat_ctx, "clCreateContext(): %s", val2cstr_cl (CL_err)); - //CU_rc = hc_clSetKernelArg (hashcat_ctx, kernel, 0, sizeof (cl_mem), (void *) &buf); if (CU_rc == -1) return -1; - //CU_rc = hc_clSetKernelArg (hashcat_ctx, kernel, 1, sizeof (cl_uint), device_param->kernel_params_memset[1]); if (CU_rc == -1) return -1; - //CU_rc = hc_clSetKernelArg (hashcat_ctx, kernel, 2, sizeof (cl_ulong), device_param->kernel_params_memset[2]); if (CU_rc == -1) return -1; + return -1; + } - //const size_t global_work_size[3] = { num_elements, 1, 1 }; - //const size_t local_work_size[3] = { kernel_threads, 1, 1 }; + return 0; +} - if (hc_cuLaunchKernel (hashcat_ctx, function, num_elements, 1, 1, kernel_threads, 1, 1, 0, device_param->cuda_stream, device_param->kernel_params_memset, NULL) == -1) return -1; +int hc_clCreateCommandQueue (hashcat_ctx_t *hashcat_ctx, cl_context context, cl_device_id device, cl_command_queue_properties properties, cl_command_queue *command_queue) +{ + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1; - } + OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl; - if (num16m) - { - u32 tmp[4]; + cl_int CL_err; - tmp[0] = value; - tmp[1] = value; - tmp[2] = value; - tmp[3] = value; + *command_queue = ocl->clCreateCommandQueue (context, device, properties, &CL_err); - // Apparently we are allowed to do this: https://devtalk.nvidia.com/default/topic/761515/how-to-copy-to-device-memory-with-offset-/ + if (CL_err != CL_SUCCESS) + { + event_log_error (hashcat_ctx, "clCreateCommandQueue(): %s", val2cstr_cl (CL_err)); - if (hc_cuMemcpyHtoD (hashcat_ctx, buf + (num16d * 16), tmp, num16m) == -1) return -1; + return -1; } return 0; } -int run_cuda_kernel_bzero (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, CUdeviceptr buf, const u64 size) +int hc_clCreateBuffer (hashcat_ctx_t *hashcat_ctx, cl_context context, cl_mem_flags flags, size_t size, void *host_ptr, cl_mem *mem) { - return 
run_cuda_kernel_memset (hashcat_ctx, device_param, buf, 0, size); -} + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; -int run_opencl_kernel_atinit (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, cl_mem buf, const u64 num) -{ - u64 num_elements = num; + OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl; - device_param->kernel_params_atinit_buf64[1] = num_elements; + cl_int CL_err; - const u64 kernel_threads = device_param->kernel_wgs_atinit; + *mem = ocl->clCreateBuffer (context, flags, size, host_ptr, &CL_err); - num_elements = round_up_multiple_64 (num_elements, kernel_threads); + if (CL_err != CL_SUCCESS) + { + event_log_error (hashcat_ctx, "clCreateBuffer(): %s", val2cstr_cl (CL_err)); - cl_kernel kernel = device_param->opencl_kernel_atinit; + return -1; + } - const size_t global_work_size[3] = { num_elements, 1, 1 }; - const size_t local_work_size[3] = { kernel_threads, 1, 1 }; + return 0; +} - if (hc_clSetKernelArg (hashcat_ctx, kernel, 0, sizeof (cl_mem), (void *) &buf) == -1) return -1; +int hc_clCreateProgramWithSource (hashcat_ctx_t *hashcat_ctx, cl_context context, cl_uint count, const char **strings, const size_t *lengths, cl_program *program) +{ + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - if (hc_clSetKernelArg (hashcat_ctx, kernel, 1, sizeof (cl_ulong), device_param->kernel_params_atinit[1]) == -1) return -1; + OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl; - if (hc_clEnqueueNDRangeKernel (hashcat_ctx, device_param->opencl_command_queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL) == -1) return -1; + cl_int CL_err; - if (hc_clFlush (hashcat_ctx, device_param->opencl_command_queue) == -1) return -1; + *program = ocl->clCreateProgramWithSource (context, count, strings, lengths, &CL_err); - if (hc_clFinish (hashcat_ctx, device_param->opencl_command_queue) == -1) return -1; + if (CL_err != CL_SUCCESS) + { + event_log_error (hashcat_ctx, "clCreateProgramWithSource(): %s", val2cstr_cl (CL_err)); + + return -1; + } return 0; } -int run_opencl_kernel_memset (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, cl_mem buf, const u32 value, const u64 size) +int hc_clCreateProgramWithBinary (hashcat_ctx_t *hashcat_ctx, cl_context context, cl_uint num_devices, const cl_device_id *device_list, const size_t *lengths, const unsigned char **binaries, cl_int *binary_status, cl_program *program) { - const u64 num16d = size / 16; - const u64 num16m = size % 16; - - if (num16d) - { - device_param->kernel_params_memset_buf32[1] = value; - device_param->kernel_params_memset_buf64[2] = num16d; + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - const u64 kernel_threads = device_param->kernel_wgs_memset; + OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl; - u64 num_elements = num16d; + cl_int CL_err; - num_elements = round_up_multiple_64 (num_elements, kernel_threads); + *program = ocl->clCreateProgramWithBinary (context, num_devices, device_list, lengths, binaries, binary_status, &CL_err); - cl_kernel kernel = device_param->opencl_kernel_memset; + if (CL_err != CL_SUCCESS) + { + event_log_error (hashcat_ctx, "clCreateProgramWithBinary(): %s", val2cstr_cl (CL_err)); - if (hc_clSetKernelArg (hashcat_ctx, kernel, 0, sizeof (cl_mem), (void *) &buf) == -1) return -1; - if (hc_clSetKernelArg (hashcat_ctx, kernel, 1, sizeof (cl_uint), device_param->kernel_params_memset[1]) == -1) return -1; - if (hc_clSetKernelArg (hashcat_ctx, kernel, 2, sizeof (cl_ulong), device_param->kernel_params_memset[2]) == -1) return -1; + return -1; + } - const size_t 
global_work_size[3] = { num_elements, 1, 1 }; - const size_t local_work_size[3] = { kernel_threads, 1, 1 }; + return 0; +} - if (hc_clEnqueueNDRangeKernel (hashcat_ctx, device_param->opencl_command_queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL) == -1) return -1; +int hc_clBuildProgram (hashcat_ctx_t *hashcat_ctx, cl_program program, cl_uint num_devices, const cl_device_id *device_list, const char *options, void (CL_CALLBACK *pfn_notify) (cl_program program, void *user_data), void *user_data) +{ + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - if (hc_clFlush (hashcat_ctx, device_param->opencl_command_queue) == -1) return -1; + OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl; - if (hc_clFinish (hashcat_ctx, device_param->opencl_command_queue) == -1) return -1; - } + const cl_int CL_err = ocl->clBuildProgram (program, num_devices, device_list, options, pfn_notify, user_data); - if (num16m) + if (CL_err != CL_SUCCESS) { - u32 tmp[4]; - - tmp[0] = value; - tmp[1] = value; - tmp[2] = value; - tmp[3] = value; + event_log_error (hashcat_ctx, "clBuildProgram(): %s", val2cstr_cl (CL_err)); - if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, buf, CL_TRUE, num16d * 16, num16m, tmp, 0, NULL, NULL) == -1) return -1; + return -1; } return 0; } -int run_opencl_kernel_bzero (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, cl_mem buf, const u64 size) -{ - return run_opencl_kernel_memset (hashcat_ctx, device_param, buf, 0, size); -} - -int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u32 kern_run, const u64 num, const u32 event_update, const u32 iteration) +int hc_clCreateKernel (hashcat_ctx_t *hashcat_ctx, cl_program program, const char *kernel_name, cl_kernel *kernel) { - const hashconfig_t *hashconfig = hashcat_ctx->hashconfig; - const status_ctx_t *status_ctx = hashcat_ctx->status_ctx; + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - u64 kernel_threads = 0; - u64 dynamic_shared_mem = 0; + OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl; - switch (kern_run) - { - case KERN_RUN_1: - kernel_threads = device_param->kernel_wgs1; - dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size1; - break; - case KERN_RUN_12: - kernel_threads = device_param->kernel_wgs12; - dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size12; - break; - case KERN_RUN_2: - kernel_threads = device_param->kernel_wgs2; - dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size2; - break; - case KERN_RUN_2E: - kernel_threads = device_param->kernel_wgs2e; - dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size2e; - break; - case KERN_RUN_23: - kernel_threads = device_param->kernel_wgs23; - dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size23; - break; - case KERN_RUN_3: - kernel_threads = device_param->kernel_wgs3; - dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size3; - break; - case KERN_RUN_4: - kernel_threads = device_param->kernel_wgs4; - dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size4; - break; - case KERN_RUN_INIT2: - kernel_threads = device_param->kernel_wgs_init2; - dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size_init2; - break; - case KERN_RUN_LOOP2: - kernel_threads = device_param->kernel_wgs_loop2; - dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size_loop2; - break; - case KERN_RUN_AUX1: - kernel_threads = device_param->kernel_wgs_aux1; - dynamic_shared_mem = 
device_param->kernel_dynamic_local_mem_size_aux1; - break; - case KERN_RUN_AUX2: - kernel_threads = device_param->kernel_wgs_aux2; - dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size_aux2; - break; - case KERN_RUN_AUX3: - kernel_threads = device_param->kernel_wgs_aux3; - dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size_aux3; - break; - case KERN_RUN_AUX4: - kernel_threads = device_param->kernel_wgs_aux4; - dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size_aux4; - break; - } + cl_int CL_err; - if ((hashconfig->opts_type & OPTS_TYPE_DYNAMIC_SHARED) == 0) - { - dynamic_shared_mem = 0; - } + *kernel = ocl->clCreateKernel (program, kernel_name, &CL_err); - if (device_param->is_cuda == true) + if (CL_err != CL_SUCCESS) { - if ((device_param->kernel_dynamic_local_mem_size_memset % device_param->device_local_mem_size) == 0) - { - // this is the case Compute Capability 7.5 - // there is also Compute Capability 7.0 which offers a larger dynamic local size access - // however, if it's an exact multiple the driver can optimize this for us more efficient + event_log_error (hashcat_ctx, "clCreateKernel(): %s", val2cstr_cl (CL_err)); - dynamic_shared_mem = 0; - } + return -1; } - kernel_threads = MIN (kernel_threads, device_param->kernel_threads); + return 0; +} - device_param->kernel_params_buf64[34] = num; +int hc_clReleaseMemObject (hashcat_ctx_t *hashcat_ctx, cl_mem mem) +{ + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - u64 num_elements = num; + OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl; - if (device_param->is_cuda == true) + const cl_int CL_err = ocl->clReleaseMemObject (mem); + + if (CL_err != CL_SUCCESS) { - CUfunction cuda_function = NULL; + event_log_error (hashcat_ctx, "clReleaseMemObject(): %s", val2cstr_cl (CL_err)); - if (device_param->is_cuda == true) - { - switch (kern_run) - { - case KERN_RUN_1: cuda_function = device_param->cuda_function1; break; - case KERN_RUN_12: cuda_function = device_param->cuda_function12; break; - case KERN_RUN_2: cuda_function = device_param->cuda_function2; break; - case KERN_RUN_2E: cuda_function = device_param->cuda_function2e; break; - case KERN_RUN_23: cuda_function = device_param->cuda_function23; break; - case KERN_RUN_3: cuda_function = device_param->cuda_function3; break; - case KERN_RUN_4: cuda_function = device_param->cuda_function4; break; - case KERN_RUN_INIT2: cuda_function = device_param->cuda_function_init2; break; - case KERN_RUN_LOOP2: cuda_function = device_param->cuda_function_loop2; break; - case KERN_RUN_AUX1: cuda_function = device_param->cuda_function_aux1; break; - case KERN_RUN_AUX2: cuda_function = device_param->cuda_function_aux2; break; - case KERN_RUN_AUX3: cuda_function = device_param->cuda_function_aux3; break; - case KERN_RUN_AUX4: cuda_function = device_param->cuda_function_aux4; break; - } + return -1; + } - if (hc_cuFuncSetAttribute (hashcat_ctx, cuda_function, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, dynamic_shared_mem) == -1) return -1; - } + return 0; +} - if (kernel_threads == 0) kernel_threads = 1; +int hc_clReleaseKernel (hashcat_ctx_t *hashcat_ctx, cl_kernel kernel) +{ + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - num_elements = CEILDIV (num_elements, kernel_threads); + OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl; - if (kern_run == KERN_RUN_1) - { - if (hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_SIMD_INIT) - { - num_elements = CEILDIV (num_elements, device_param->vector_width); - } - } - else if (kern_run == KERN_RUN_2) - { - if 
(hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_SIMD_LOOP) - { - num_elements = CEILDIV (num_elements, device_param->vector_width); - } - } - else if (kern_run == KERN_RUN_3) - { - if (hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_SIMD_COMP) - { - num_elements = CEILDIV (num_elements, device_param->vector_width); - } - } + const cl_int CL_err = ocl->clReleaseKernel (kernel); - if (hc_cuEventRecord (hashcat_ctx, device_param->cuda_event1, device_param->cuda_stream) == -1) return -1; + if (CL_err != CL_SUCCESS) + { + event_log_error (hashcat_ctx, "clReleaseKernel(): %s", val2cstr_cl (CL_err)); - if (hc_cuLaunchKernel (hashcat_ctx, cuda_function, num_elements, 1, 1, kernel_threads, 1, 1, dynamic_shared_mem, device_param->cuda_stream, device_param->kernel_params, NULL) == -1) return -1; + return -1; + } - if (hc_cuEventRecord (hashcat_ctx, device_param->cuda_event2, device_param->cuda_stream) == -1) return -1; + return 0; +} - if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1; +int hc_clReleaseProgram (hashcat_ctx_t *hashcat_ctx, cl_program program) +{ + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - if (hc_cuEventSynchronize (hashcat_ctx, device_param->cuda_event2) == -1) return -1; + OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl; - float exec_ms; + const cl_int CL_err = ocl->clReleaseProgram (program); - if (hc_cuEventElapsedTime (hashcat_ctx, &exec_ms, device_param->cuda_event1, device_param->cuda_event2) == -1) return -1; + if (CL_err != CL_SUCCESS) + { + event_log_error (hashcat_ctx, "clReleaseProgram(): %s", val2cstr_cl (CL_err)); - if (event_update) - { - u32 exec_pos = device_param->exec_pos; + return -1; + } - device_param->exec_msec[exec_pos] = exec_ms; + return 0; +} - exec_pos++; +int hc_clReleaseCommandQueue (hashcat_ctx_t *hashcat_ctx, cl_command_queue command_queue) +{ + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - if (exec_pos == EXEC_CACHE) - { - exec_pos = 0; - } + OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl; - device_param->exec_pos = exec_pos; - } - } + const cl_int CL_err = ocl->clReleaseCommandQueue (command_queue); - if (device_param->is_opencl == true) + if (CL_err != CL_SUCCESS) { - cl_kernel opencl_kernel = NULL; - - if (device_param->is_opencl == true) - { - switch (kern_run) - { - case KERN_RUN_1: opencl_kernel = device_param->opencl_kernel1; break; - case KERN_RUN_12: opencl_kernel = device_param->opencl_kernel12; break; - case KERN_RUN_2: opencl_kernel = device_param->opencl_kernel2; break; - case KERN_RUN_2E: opencl_kernel = device_param->opencl_kernel2e; break; - case KERN_RUN_23: opencl_kernel = device_param->opencl_kernel23; break; - case KERN_RUN_3: opencl_kernel = device_param->opencl_kernel3; break; - case KERN_RUN_4: opencl_kernel = device_param->opencl_kernel4; break; - case KERN_RUN_INIT2: opencl_kernel = device_param->opencl_kernel_init2; break; - case KERN_RUN_LOOP2: opencl_kernel = device_param->opencl_kernel_loop2; break; - case KERN_RUN_AUX1: opencl_kernel = device_param->opencl_kernel_aux1; break; - case KERN_RUN_AUX2: opencl_kernel = device_param->opencl_kernel_aux2; break; - case KERN_RUN_AUX3: opencl_kernel = device_param->opencl_kernel_aux3; break; - case KERN_RUN_AUX4: opencl_kernel = device_param->opencl_kernel_aux4; break; - } - } - - for (u32 i = 0; i <= 23; i++) - { - if (hc_clSetKernelArg (hashcat_ctx, opencl_kernel, i, sizeof (cl_mem), device_param->kernel_params[i]) == -1) return -1; - } + event_log_error (hashcat_ctx, "clReleaseCommandQueue(): %s", val2cstr_cl (CL_err)); - for (u32 i = 
-    {
-      if (hc_clSetKernelArg (hashcat_ctx, opencl_kernel, i, sizeof (cl_uint), device_param->kernel_params[i]) == -1) return -1;
-    }
+    return -1;
+  }
-    for (u32 i = 34; i <= 34; i++)
-    {
-      if (hc_clSetKernelArg (hashcat_ctx, opencl_kernel, i, sizeof (cl_ulong), device_param->kernel_params[i]) == -1) return -1;
-    }
+  return 0;
+}
-    num_elements = round_up_multiple_64 (num_elements, kernel_threads);
+int hc_clReleaseContext (hashcat_ctx_t *hashcat_ctx, cl_context context)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
-    cl_event opencl_event;
+  OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl;
-    if (kern_run == KERN_RUN_1)
-    {
-      if (hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_SIMD_INIT)
-      {
-        num_elements = CEILDIV (num_elements, device_param->vector_width);
-      }
-    }
-    else if (kern_run == KERN_RUN_2)
-    {
-      if (hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_SIMD_LOOP)
-      {
-        num_elements = CEILDIV (num_elements, device_param->vector_width);
-      }
-    }
-    else if (kern_run == KERN_RUN_3)
-    {
-      if (hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_SIMD_COMP)
-      {
-        num_elements = CEILDIV (num_elements, device_param->vector_width);
-      }
-    }
+  const cl_int CL_err = ocl->clReleaseContext (context);
-    num_elements = round_up_multiple_64 (num_elements, kernel_threads);
+  if (CL_err != CL_SUCCESS)
+  {
+    event_log_error (hashcat_ctx, "clReleaseContext(): %s", val2cstr_cl (CL_err));
-    const size_t global_work_size[3] = { num_elements,   1, 1 };
-    const size_t local_work_size[3]  = { kernel_threads, 1, 1 };
+    return -1;
+  }
-    if (hc_clEnqueueNDRangeKernel (hashcat_ctx, device_param->opencl_command_queue, opencl_kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, &opencl_event) == -1) return -1;
+  return 0;
+}
-    if (hc_clFlush (hashcat_ctx, device_param->opencl_command_queue) == -1) return -1;
+int hc_clEnqueueMapBuffer (hashcat_ctx_t *hashcat_ctx, cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_map, cl_map_flags map_flags, size_t offset, size_t size, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event, void **buf)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
-    // spin damper section
+  OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl;
-    const u32 iterationm = iteration % EXPECTED_ITERATIONS;
+  cl_int CL_err;
-    if (device_param->spin_damp > 0)
-    {
-      cl_int opencl_event_status;
+  *buf = ocl->clEnqueueMapBuffer (command_queue, buffer, blocking_map, map_flags, offset, size, num_events_in_wait_list, event_wait_list, event, &CL_err);
-      size_t param_value_size_ret;
+  if (CL_err != CL_SUCCESS)
+  {
+    event_log_error (hashcat_ctx, "clEnqueueMapBuffer(): %s", val2cstr_cl (CL_err));
-      if (hc_clGetEventInfo (hashcat_ctx, opencl_event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof (opencl_event_status), &opencl_event_status, &param_value_size_ret) == -1) return -1;
+    return -1;
+  }
-      double spin_total = device_param->spin_damp;
+  return 0;
+}
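All of the hc_cl* functions added in this hunk follow a single convention: resolve the entry point through the dynamically loaded OCL_PTR dispatch table, log a readable message via event_log_error and val2cstr_cl on failure, and collapse the status code to 0/-1 so call sites can use the `== -1) return -1` idiom. Note that hc_clEnqueueMapBuffer additionally returns the mapped pointer through the extra `buf` out-parameter, because the underlying OpenCL call reports its error through the trailing errcode_ret argument instead of the return value. A minimal standalone sketch of the convention, with a stubbed API standing in for the real dispatch table (all names here are illustrative, not hashcat declarations):

  #include <stdio.h>

  typedef int fake_err_t; // stand-in for cl_int / CUresult / hipError_t

  #define FAKE_SUCCESS 0

  static fake_err_t fakeReleaseThing (void *thing) { (void) thing; return FAKE_SUCCESS; }

  static const char *val2cstr_fake (const fake_err_t err)
  {
    return (err == FAKE_SUCCESS) ? "SUCCESS" : "FAILURE";
  }

  // the wrapper pattern: call through, log on failure, normalize to 0 / -1
  static int hc_fakeReleaseThing (void *thing)
  {
    const fake_err_t err = fakeReleaseThing (thing);

    if (err != FAKE_SUCCESS)
    {
      fprintf (stderr, "fakeReleaseThing(): %s\n", val2cstr_fake (err));

      return -1;
    }

    return 0;
  }

  int main (void)
  {
    int dummy = 0;

    if (hc_fakeReleaseThing (&dummy) == -1) return 1; // the call-site idiom used throughout backend.c

    return 0;
  }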
-      while (opencl_event_status != CL_COMPLETE)
-      {
-        if (status_ctx->devices_status == STATUS_RUNNING)
-        {
-          switch (kern_run)
-          {
-            case KERN_RUN_1:     if (device_param->exec_us_prev1[iterationm]      > 0) usleep ((useconds_t) (device_param->exec_us_prev1[iterationm]      * device_param->spin_damp)); break;
-            case KERN_RUN_2:     if (device_param->exec_us_prev2[iterationm]      > 0) usleep ((useconds_t) (device_param->exec_us_prev2[iterationm]      * device_param->spin_damp)); break;
-            case KERN_RUN_2E:    if (device_param->exec_us_prev2e[iterationm]     > 0) usleep ((useconds_t) (device_param->exec_us_prev2e[iterationm]     * device_param->spin_damp)); break;
-            case KERN_RUN_3:     if (device_param->exec_us_prev3[iterationm]      > 0) usleep ((useconds_t) (device_param->exec_us_prev3[iterationm]      * device_param->spin_damp)); break;
-            case KERN_RUN_4:     if (device_param->exec_us_prev4[iterationm]      > 0) usleep ((useconds_t) (device_param->exec_us_prev4[iterationm]      * device_param->spin_damp)); break;
-            case KERN_RUN_INIT2: if (device_param->exec_us_prev_init2[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_init2[iterationm] * device_param->spin_damp)); break;
-            case KERN_RUN_LOOP2: if (device_param->exec_us_prev_loop2[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_loop2[iterationm] * device_param->spin_damp)); break;
-            case KERN_RUN_AUX1:  if (device_param->exec_us_prev_aux1[iterationm]  > 0) usleep ((useconds_t) (device_param->exec_us_prev_aux1[iterationm]  * device_param->spin_damp)); break;
-            case KERN_RUN_AUX2:  if (device_param->exec_us_prev_aux2[iterationm]  > 0) usleep ((useconds_t) (device_param->exec_us_prev_aux2[iterationm]  * device_param->spin_damp)); break;
-            case KERN_RUN_AUX3:  if (device_param->exec_us_prev_aux3[iterationm]  > 0) usleep ((useconds_t) (device_param->exec_us_prev_aux3[iterationm]  * device_param->spin_damp)); break;
-            case KERN_RUN_AUX4:  if (device_param->exec_us_prev_aux4[iterationm]  > 0) usleep ((useconds_t) (device_param->exec_us_prev_aux4[iterationm]  * device_param->spin_damp)); break;
-          }
-        }
-        else
-        {
-          // we were told to be nice
+int hc_clEnqueueUnmapMemObject (hashcat_ctx_t *hashcat_ctx, cl_command_queue command_queue, cl_mem memobj, void *mapped_ptr, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
-          sleep (0);
-        }
+  OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl;
-        if (hc_clGetEventInfo (hashcat_ctx, opencl_event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof (opencl_event_status), &opencl_event_status, &param_value_size_ret) == -1) return -1;
+  const cl_int CL_err = ocl->clEnqueueUnmapMemObject (command_queue, memobj, mapped_ptr, num_events_in_wait_list, event_wait_list, event);
-        spin_total += device_param->spin_damp;
+  if (CL_err != CL_SUCCESS)
+  {
+    event_log_error (hashcat_ctx, "clEnqueueUnmapMemObject(): %s", val2cstr_cl (CL_err));
-        if (spin_total > 1) break;
-      }
-    }
+    return -1;
+  }
-    if (hc_clWaitForEvents (hashcat_ctx, 1, &opencl_event) == -1) return -1;
+  return 0;
+}
-    cl_ulong time_start;
-    cl_ulong time_end;
+int hc_clGetKernelWorkGroupInfo (hashcat_ctx_t *hashcat_ctx, cl_kernel kernel, cl_device_id device, cl_kernel_work_group_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
-    if (hc_clGetEventProfilingInfo (hashcat_ctx, opencl_event, CL_PROFILING_COMMAND_START, sizeof (time_start), &time_start, NULL) == -1) return -1;
-    if (hc_clGetEventProfilingInfo (hashcat_ctx, opencl_event, CL_PROFILING_COMMAND_END,   sizeof (time_end),   &time_end,   NULL) == -1) return -1;
+  OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl;
-    const double exec_us = (double) (time_end - time_start) / 1000;
+  const cl_int CL_err = ocl->clGetKernelWorkGroupInfo (kernel, device, param_name, param_value_size, param_value, param_value_size_ret);
-    if (device_param->spin_damp > 0)
-    {
-      if (status_ctx->devices_status == STATUS_RUNNING)
-      {
-        switch (kern_run)
-        {
-          case KERN_RUN_1:     device_param->exec_us_prev1[iterationm]      = exec_us; break;
-          case KERN_RUN_2:     device_param->exec_us_prev2[iterationm]      = exec_us; break;
-          case KERN_RUN_2E:    device_param->exec_us_prev2e[iterationm]     = exec_us; break;
-          case KERN_RUN_3:     device_param->exec_us_prev3[iterationm]      = exec_us; break;
-          case KERN_RUN_4:     device_param->exec_us_prev4[iterationm]      = exec_us; break;
-          case KERN_RUN_INIT2: device_param->exec_us_prev_init2[iterationm] = exec_us; break;
-          case KERN_RUN_LOOP2: device_param->exec_us_prev_loop2[iterationm] = exec_us; break;
-          case KERN_RUN_AUX1:  device_param->exec_us_prev_aux1[iterationm]  = exec_us; break;
-          case KERN_RUN_AUX2:  device_param->exec_us_prev_aux2[iterationm]  = exec_us; break;
-          case KERN_RUN_AUX3:  device_param->exec_us_prev_aux3[iterationm]  = exec_us; break;
-          case KERN_RUN_AUX4:  device_param->exec_us_prev_aux4[iterationm]  = exec_us; break;
-        }
-      }
-    }
+  if (CL_err != CL_SUCCESS)
+  {
+    event_log_error (hashcat_ctx, "clGetKernelWorkGroupInfo(): %s", val2cstr_cl (CL_err));
-    if (event_update)
-    {
-      u32 exec_pos = device_param->exec_pos;
+    return -1;
+  }
-      device_param->exec_msec[exec_pos] = exec_us / 1000;
+  return 0;
+}
-      exec_pos++;
+int hc_clGetProgramBuildInfo (hashcat_ctx_t *hashcat_ctx, cl_program program, cl_device_id device, cl_program_build_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
-      if (exec_pos == EXEC_CACHE)
-      {
-        exec_pos = 0;
-      }
+  OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl;
-      device_param->exec_pos = exec_pos;
-    }
+  const cl_int CL_err = ocl->clGetProgramBuildInfo (program, device, param_name, param_value_size, param_value, param_value_size_ret);
-    if (hc_clReleaseEvent (hashcat_ctx, opencl_event) == -1) return -1;
+  if (CL_err != CL_SUCCESS)
+  {
+    event_log_error (hashcat_ctx, "clGetProgramBuildInfo(): %s", val2cstr_cl (CL_err));
-    if (hc_clFinish (hashcat_ctx, device_param->opencl_command_queue) == -1) return -1;
+    return -1;
   }
 
   return 0;
 }
 
-int run_kernel_mp (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u32 kern_run, const u64 num)
+int hc_clGetProgramInfo (hashcat_ctx_t *hashcat_ctx, cl_program program, cl_program_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
 {
-  u64 kernel_threads = 0;
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
-  switch (kern_run)
-  {
-    case KERN_RUN_MP:   kernel_threads = device_param->kernel_wgs_mp;   break;
-    case KERN_RUN_MP_R: kernel_threads = device_param->kernel_wgs_mp_r; break;
-    case KERN_RUN_MP_L: kernel_threads = device_param->kernel_wgs_mp_l; break;
-  }
+  OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl;
-  u64 num_elements = num;
+  const cl_int CL_err = ocl->clGetProgramInfo (program, param_name, param_value_size, param_value, param_value_size_ret);
-  switch (kern_run)
+  if (CL_err != CL_SUCCESS)
   {
-    case KERN_RUN_MP:   device_param->kernel_params_mp_buf64[8]   = num; break;
-    case KERN_RUN_MP_R: device_param->kernel_params_mp_r_buf64[8] = num; break;
-    case KERN_RUN_MP_L: device_param->kernel_params_mp_l_buf64[9] = num; break;
+    event_log_error (hashcat_ctx, "clGetProgramInfo(): %s", val2cstr_cl (CL_err));
+
+    return -1;
   }
-  if (device_param->is_cuda == true)
-  {
-    CUfunction cuda_function = NULL;
+  return 0;
+}
-    void **cuda_args = NULL;
+int hc_clWaitForEvents (hashcat_ctx_t *hashcat_ctx, cl_uint num_events, const cl_event *event_list)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
-    switch (kern_run)
-    {
-      case KERN_RUN_MP:   cuda_function = device_param->cuda_function_mp;
-                          cuda_args     = device_param->kernel_params_mp;
-                          break;
-      case KERN_RUN_MP_R: cuda_function = device_param->cuda_function_mp_r;
-                          cuda_args     = device_param->kernel_params_mp_r;
-                          break;
-      case KERN_RUN_MP_L: cuda_function = device_param->cuda_function_mp_l;
-                          cuda_args     = device_param->kernel_params_mp_l;
-                          break;
-    }
-
-    num_elements = CEILDIV (num_elements, kernel_threads);
-
-    if (hc_cuLaunchKernel (hashcat_ctx, cuda_function, num_elements, 1, 1, kernel_threads, 1, 1, 0, device_param->cuda_stream, cuda_args, NULL) == -1) return -1;
+  OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl;
-    if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1;
-  }
+  const cl_int CL_err = ocl->clWaitForEvents (num_events, event_list);
-  if (device_param->is_opencl == true)
+  if (CL_err != CL_SUCCESS)
   {
-    cl_kernel opencl_kernel = NULL;
+    event_log_error (hashcat_ctx, "clWaitForEvents(): %s", val2cstr_cl (CL_err));
-    switch (kern_run)
-    {
-      case KERN_RUN_MP:   opencl_kernel = device_param->opencl_kernel_mp;   break;
-      case KERN_RUN_MP_R: opencl_kernel = device_param->opencl_kernel_mp_r; break;
-      case KERN_RUN_MP_L: opencl_kernel = device_param->opencl_kernel_mp_l; break;
-    }
+    return -1;
+  }
-    switch (kern_run)
-    {
-      case KERN_RUN_MP:   if (hc_clSetKernelArg (hashcat_ctx, opencl_kernel, 3, sizeof (cl_ulong), device_param->kernel_params_mp[3]) == -1) return -1;
-                          if (hc_clSetKernelArg (hashcat_ctx, opencl_kernel, 4, sizeof (cl_uint),  device_param->kernel_params_mp[4]) == -1) return -1;
-                          if (hc_clSetKernelArg (hashcat_ctx, opencl_kernel, 5, sizeof (cl_uint),  device_param->kernel_params_mp[5]) == -1) return -1;
-                          if (hc_clSetKernelArg (hashcat_ctx, opencl_kernel, 6, sizeof (cl_uint),  device_param->kernel_params_mp[6]) == -1) return -1;
-                          if (hc_clSetKernelArg (hashcat_ctx, opencl_kernel, 7, sizeof (cl_uint),  device_param->kernel_params_mp[7]) == -1) return -1;
-                          if (hc_clSetKernelArg (hashcat_ctx, opencl_kernel, 8, sizeof (cl_ulong), device_param->kernel_params_mp[8]) == -1) return -1;
-                          break;
-      case KERN_RUN_MP_R: if (hc_clSetKernelArg (hashcat_ctx, opencl_kernel, 3, sizeof (cl_ulong), device_param->kernel_params_mp_r[3]) == -1) return -1;
-                          if (hc_clSetKernelArg (hashcat_ctx, opencl_kernel, 4, sizeof (cl_uint),  device_param->kernel_params_mp_r[4]) == -1) return -1;
-                          if (hc_clSetKernelArg (hashcat_ctx, opencl_kernel, 5, sizeof (cl_uint),  device_param->kernel_params_mp_r[5]) == -1) return -1;
-                          if (hc_clSetKernelArg (hashcat_ctx, opencl_kernel, 6, sizeof (cl_uint),  device_param->kernel_params_mp_r[6]) == -1) return -1;
-                          if (hc_clSetKernelArg (hashcat_ctx, opencl_kernel, 7, sizeof (cl_uint),  device_param->kernel_params_mp_r[7]) == -1) return -1;
-                          if (hc_clSetKernelArg (hashcat_ctx, opencl_kernel, 8, sizeof (cl_ulong), device_param->kernel_params_mp_r[8]) == -1) return -1;
-                          break;
-      case KERN_RUN_MP_L: if (hc_clSetKernelArg (hashcat_ctx, opencl_kernel, 3, sizeof (cl_ulong), device_param->kernel_params_mp_l[3]) == -1) return -1;
-                          if (hc_clSetKernelArg (hashcat_ctx, opencl_kernel, 4, sizeof (cl_uint),  device_param->kernel_params_mp_l[4]) == -1) return -1;
-                          if (hc_clSetKernelArg (hashcat_ctx, opencl_kernel, 5, sizeof (cl_uint),  device_param->kernel_params_mp_l[5]) == -1) return -1;
-                          if (hc_clSetKernelArg (hashcat_ctx, opencl_kernel, 6, sizeof (cl_uint),  device_param->kernel_params_mp_l[6]) == -1) return -1;
-                          if (hc_clSetKernelArg (hashcat_ctx, opencl_kernel, 7, sizeof (cl_uint),  device_param->kernel_params_mp_l[7]) == -1) return -1;
-                          if (hc_clSetKernelArg (hashcat_ctx, opencl_kernel, 8, sizeof (cl_uint),  device_param->kernel_params_mp_l[8]) == -1) return -1;
-                          if (hc_clSetKernelArg (hashcat_ctx, opencl_kernel, 9, sizeof (cl_ulong), device_param->kernel_params_mp_l[9]) == -1) return -1;
-                          break;
-    }
+  return 0;
+}
-    num_elements = round_up_multiple_64 (num_elements, kernel_threads);
+int hc_clGetEventProfilingInfo (hashcat_ctx_t *hashcat_ctx, cl_event event, cl_profiling_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
-    const size_t global_work_size[3] = { num_elements,   1, 1 };
-    const size_t local_work_size[3]  = { kernel_threads, 1, 1 };
+  OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl;
-    if (hc_clEnqueueNDRangeKernel (hashcat_ctx, device_param->opencl_command_queue, opencl_kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL) == -1) return -1;
+  const cl_int CL_err = ocl->clGetEventProfilingInfo (event, param_name, param_value_size, param_value, param_value_size_ret);
-    if (hc_clFlush (hashcat_ctx, device_param->opencl_command_queue) == -1) return -1;
+  if (CL_err != CL_SUCCESS)
+  {
+    event_log_error (hashcat_ctx, "clGetEventProfilingInfo(): %s", val2cstr_cl (CL_err));
-    if (hc_clFinish (hashcat_ctx, device_param->opencl_command_queue) == -1) return -1;
+    return -1;
   }
 
   return 0;
 }
 
-int run_kernel_tm (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param)
+int hc_clReleaseEvent (hashcat_ctx_t *hashcat_ctx, cl_event event)
 {
-  const u64 num_elements = 1024; // fixed
-
-  const u64 kernel_threads = MIN (num_elements, device_param->kernel_wgs_tm);
-
-  if (device_param->is_cuda == true)
-  {
-    CUfunction cuda_function = device_param->cuda_function_tm;
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
-    if (hc_cuLaunchKernel (hashcat_ctx, cuda_function, num_elements / kernel_threads, 1, 1, kernel_threads, 1, 1, 0, device_param->cuda_stream, device_param->kernel_params_tm, NULL) == -1) return -1;
+  OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl;
-    if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1;
-  }
+  const cl_int CL_err = ocl->clReleaseEvent (event);
-  if (device_param->is_opencl == true)
+  if (CL_err != CL_SUCCESS)
   {
-    cl_kernel cuda_kernel = device_param->opencl_kernel_tm;
-
-    const size_t global_work_size[3] = { num_elements,   1, 1 };
-    const size_t local_work_size[3]  = { kernel_threads, 1, 1 };
-
-    if (hc_clEnqueueNDRangeKernel (hashcat_ctx, device_param->opencl_command_queue, cuda_kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL) == -1) return -1;
-
-    if (hc_clFlush (hashcat_ctx, device_param->opencl_command_queue) == -1) return -1;
+    event_log_error (hashcat_ctx, "clReleaseEvent(): %s", val2cstr_cl (CL_err));
-    if (hc_clFinish (hashcat_ctx, device_param->opencl_command_queue) == -1) return -1;
+    return -1;
   }
 
   return 0;
 }
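Two different work-size conventions meet in the launch code above and below: CUDA and HIP launches take a grid size in blocks, so the element count is divided by the thread count (CEILDIV), while OpenCL's clEnqueueNDRangeKernel takes a global work size in work-items, so the count is rounded up to a multiple of the work-group size (round_up_multiple_64). A self-contained sketch of the two helpers as used in this file; the real definitions live elsewhere in the hashcat tree, so treat these as assumptions:

  #include <stdint.h>
  #include <stdio.h>

  typedef uint64_t u64;

  // CUDA/HIP style: number of blocks needed to cover num elements
  #define CEILDIV(num,div) (((num) + (div) - 1) / (div))

  // OpenCL style: global work size must be a multiple of the local size
  static u64 round_up_multiple_64 (const u64 v, const u64 m)
  {
    if (m == 0) return v;

    const u64 r = v % m;

    return (r == 0) ? v : v + (m - r);
  }

  int main (void)
  {
    const u64 num_elements   = 1000;
    const u64 kernel_threads = 256;

    // 4 blocks of 256 threads for CUDA/HIP ...
    printf ("blocks = %llu\n", (unsigned long long) CEILDIV (num_elements, kernel_threads));

    // ... versus a padded global size of 1024 work-items for OpenCL
    printf ("global = %llu\n", (unsigned long long) round_up_multiple_64 (num_elements, kernel_threads));

    return 0;
  }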
-int run_kernel_amp (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u64 num)
-{
-  device_param->kernel_params_amp_buf64[6] = num;
+// Backend
-  u64 num_elements = num;
+int gidd_to_pw_t (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u64 gidd, pw_t *pw)
+{
+  pw_idx_t pw_idx;
-  const u64 kernel_threads = device_param->kernel_wgs_amp;
+  pw_idx.off = 0;
+  pw_idx.cnt = 0;
+  pw_idx.len = 0;
 
   if (device_param->is_cuda == true)
   {
-    num_elements = CEILDIV (num_elements, kernel_threads);
-
-    CUfunction cuda_function = device_param->cuda_function_amp;
+    if (hc_cuCtxPushCurrent (hashcat_ctx, device_param->cuda_context) == -1) return -1;
-    if (hc_cuLaunchKernel (hashcat_ctx, cuda_function, num_elements, 1, 1, kernel_threads, 1, 1, 0, device_param->cuda_stream, device_param->kernel_params_amp, NULL) == -1) return -1;
+    if (hc_cuMemcpyDtoH (hashcat_ctx, &pw_idx, device_param->cuda_d_pws_idx + (gidd * sizeof (pw_idx_t)), sizeof (pw_idx_t)) == -1) return -1;
-    if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1;
+    if (hc_cuCtxPopCurrent (hashcat_ctx, &device_param->cuda_context) == -1) return -1;
   }
 
-  if (device_param->is_opencl == true)
+  if (device_param->is_hip == true)
   {
-    num_elements = round_up_multiple_64 (num_elements, kernel_threads);
-
-    cl_kernel opencl_kernel = device_param->opencl_kernel_amp;
-
-    if (hc_clSetKernelArg (hashcat_ctx, opencl_kernel, 6, sizeof (cl_ulong), device_param->kernel_params_amp[6]) == -1) return -1;
+    if (hc_hipCtxPushCurrent (hashcat_ctx, device_param->hip_context) == -1) return -1;
-    const size_t global_work_size[3] = { num_elements,   1, 1 };
-    const size_t local_work_size[3]  = { kernel_threads, 1, 1 };
+    if (hc_hipMemcpyDtoH (hashcat_ctx, &pw_idx, device_param->hip_d_pws_idx + (gidd * sizeof (pw_idx_t)), sizeof (pw_idx_t)) == -1) return -1;
-    if (hc_clEnqueueNDRangeKernel (hashcat_ctx, device_param->opencl_command_queue, opencl_kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL) == -1) return -1;
+    if (hc_hipCtxPopCurrent (hashcat_ctx, &device_param->hip_context) == -1) return -1;
   }
 
-  if (hc_clFlush (hashcat_ctx, device_param->opencl_command_queue) == -1) return -1;
-
-  if (hc_clFinish (hashcat_ctx, device_param->opencl_command_queue) == -1) return -1;
+  if (device_param->is_opencl == true)
+  {
+    if (hc_clEnqueueReadBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_idx, CL_TRUE, gidd * sizeof (pw_idx_t), sizeof (pw_idx_t), &pw_idx, 0, NULL, NULL) == -1) return -1;
+  }
-  return 0;
-}
-
-int run_kernel_decompress (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u64 num)
-{
-  device_param->kernel_params_decompress_buf64[3] = num;
-
-  u64 num_elements = num;
+  const u32 off = pw_idx.off;
+  const u32 cnt = pw_idx.cnt;
+  const u32 len = pw_idx.len;
-  const u64 kernel_threads = device_param->kernel_wgs_decompress;
 
   if (device_param->is_cuda == true)
   {
-    num_elements = CEILDIV (num_elements, kernel_threads);
-
-    CUfunction cuda_function = device_param->cuda_function_decompress;
+    if (cnt > 0)
+    {
+      if (hc_cuCtxPushCurrent (hashcat_ctx, device_param->cuda_context) == -1) return -1;
-    if (hc_cuLaunchKernel (hashcat_ctx, cuda_function, num_elements, 1, 1, kernel_threads, 1, 1, 0, device_param->cuda_stream, device_param->kernel_params_decompress, NULL) == -1) return -1;
+      if (hc_cuMemcpyDtoH (hashcat_ctx, pw->i, device_param->cuda_d_pws_comp_buf + (off * sizeof (u32)), cnt * sizeof (u32)) == -1) return -1;
-    if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1;
+      if (hc_cuCtxPopCurrent (hashcat_ctx, &device_param->cuda_context) == -1) return -1;
+    }
   }
 
-  if (device_param->is_opencl == true)
+  if (device_param->is_hip == true)
   {
-    num_elements = round_up_multiple_64 (num_elements, kernel_threads);
-
-    cl_kernel opencl_kernel = device_param->opencl_kernel_decompress;
-
-    const size_t global_work_size[3] = { num_elements,   1, 1 };
-    const size_t local_work_size[3]  = { kernel_threads, 1, 1 };
+    if (cnt > 0)
+    {
+      if (hc_hipCtxPushCurrent (hashcat_ctx, device_param->hip_context) == -1) return -1;
-    if (hc_clSetKernelArg (hashcat_ctx, opencl_kernel, 3, sizeof (cl_ulong), device_param->kernel_params_decompress[3]) == -1) return -1;
+      if (hc_hipMemcpyDtoH (hashcat_ctx, pw->i, device_param->hip_d_pws_comp_buf + (off * sizeof (u32)), cnt * sizeof (u32)) == -1) return -1;
-    if (hc_clEnqueueNDRangeKernel (hashcat_ctx, device_param->opencl_command_queue, opencl_kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL) == -1) return -1;
+      if (hc_hipCtxPopCurrent (hashcat_ctx, &device_param->hip_context) == -1) return -1;
+    }
+  }
-    if (hc_clFlush (hashcat_ctx, device_param->opencl_command_queue) == -1) return -1;
+  if (device_param->is_opencl == true)
+  {
+    if (cnt > 0)
+    {
+      if (hc_clEnqueueReadBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_comp_buf, CL_TRUE, off * sizeof (u32), cnt * sizeof (u32), pw->i, 0, NULL, NULL) == -1) return -1;
+    }
+  }
-    if (hc_clFinish (hashcat_ctx, device_param->opencl_command_queue) == -1) return -1;
+  for (u32 i = cnt; i < 64; i++)
+  {
+    pw->i[i] = 0;
   }
 
+  pw->pw_len = len;
+
   return 0;
 }
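gidd_to_pw_t reads one pw_idx_t record {off, cnt, len} from d_pws_idx, copies cnt packed u32 words from d_pws_comp_buf starting at word offset off (hence the sizeof (u32) scaling in the copies), zero-fills the remaining words of the pw_t so stale data never leaks into the kernel, and reports the byte length via len. The same unpacking done host-side, with simplified stand-in types (the real pw_idx_t/pw_t live in hashcat's headers):

  #include <stdint.h>
  #include <string.h>

  typedef uint32_t u32;

  // simplified stand-ins for hashcat's pw_idx_t / pw_t
  typedef struct { u32 off; u32 cnt; u32 len; } pw_idx_t;
  typedef struct { u32 i[64]; u32 pw_len; } pw_t;

  // unpack candidate gidd from the compressed buffer, mirroring gidd_to_pw_t
  static void decompress_pw (const u32 *pws_comp, const pw_idx_t *pws_idx, const u32 gidd, pw_t *pw)
  {
    const pw_idx_t *pw_idx = pws_idx + gidd;

    // copy cnt packed u32 words starting at word offset off
    memcpy (pw->i, pws_comp + pw_idx->off, pw_idx->cnt * sizeof (u32));

    // zero the remainder so stale words never reach the kernel
    for (u32 i = pw_idx->cnt; i < 64; i++) pw->i[i] = 0;

    pw->pw_len = pw_idx->len;
  }

  int main (void)
  {
    const u32 pws_comp[] = { 0x64636261, 0x00000065 }; // "abcde", packed little-endian

    const pw_idx_t pws_idx[] = { { .off = 0, .cnt = 2, .len = 5 } };

    pw_t pw;

    decompress_pw (pws_comp, pws_idx, 0, &pw);

    return (pw.pw_len == 5) ? 0 : 1;
  }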
-int run_copy (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u64 pws_cnt)
+int choose_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u32 highest_pw_len, const u64 pws_cnt, const u32 fast_iteration, const u32 salt_pos)
 {
-  combinator_ctx_t     *combinator_ctx     = hashcat_ctx->combinator_ctx;
-  hashconfig_t         *hashconfig         = hashcat_ctx->hashconfig;
-  user_options_t       *user_options       = hashcat_ctx->user_options;
-  user_options_extra_t *user_options_extra = hashcat_ctx->user_options_extra;
-
-  // init speed timer
+  hashconfig_t   *hashconfig   = hashcat_ctx->hashconfig;
+  hashes_t       *hashes       = hashcat_ctx->hashes;
+  module_ctx_t   *module_ctx   = hashcat_ctx->module_ctx;
+  status_ctx_t   *status_ctx   = hashcat_ctx->status_ctx;
+  user_options_t *user_options = hashcat_ctx->user_options;
-  #if defined (_WIN)
-  if (device_param->timer_speed.QuadPart == 0)
-  {
-    hc_timer_set (&device_param->timer_speed);
-  }
-  #else
-  if (device_param->timer_speed.tv_sec == 0)
+  if (user_options->stdout_flag == true)
   {
-    hc_timer_set (&device_param->timer_speed);
+    return process_stdout (hashcat_ctx, device_param, pws_cnt);
   }
-  #endif
 
-  if (user_options->slow_candidates == true)
+  if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
   {
-    if (device_param->is_cuda == true)
+    if (user_options->attack_mode == ATTACK_MODE_BF)
     {
-      if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_pws_idx, device_param->pws_idx, pws_cnt * sizeof (pw_idx_t)) == -1) return -1;
+      if (user_options->slow_candidates == true)
+      {
+      }
+      else
+      {
+        if (hashconfig->opts_type & OPTS_TYPE_TM_KERNEL)
+        {
+          const u32 size_tm = device_param->size_tm;
-      const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt;
+          if (device_param->is_cuda == true)
+          {
+            if (run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_tm_c, size_tm) == -1) return -1;
+          }
-      const u32 off = pw_idx->off;
+          if (device_param->is_hip == true)
+          {
+            if (run_hip_kernel_bzero (hashcat_ctx, device_param, device_param->hip_d_tm_c, size_tm) == -1) return -1;
+          }
-      if (off)
-      {
-        if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_pws_comp_buf, device_param->pws_comp, off * sizeof (u32)) == -1) return -1;
-      }
-    }
+          if (device_param->is_opencl == true)
+          {
+            if (run_opencl_kernel_bzero (hashcat_ctx, device_param, device_param->opencl_d_tm_c, size_tm) == -1) return -1;
+          }
-    if (device_param->is_opencl == true)
-    {
-      if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_idx, CL_TRUE, 0, pws_cnt * sizeof (pw_idx_t), device_param->pws_idx, 0, NULL, NULL) == -1) return -1;
+          if (run_kernel_tm (hashcat_ctx, device_param) == -1) return -1;
-      const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt;
+          if (device_param->is_cuda == true)
+          {
+            if (hc_cuMemcpyDtoD (hashcat_ctx, device_param->cuda_d_bfs_c, device_param->cuda_d_tm_c, size_tm) == -1) return -1;
+          }
-      const u32 off = pw_idx->off;
+          if (device_param->is_hip == true)
+          {
+            if (hc_hipMemcpyDtoD (hashcat_ctx, device_param->hip_d_bfs_c, device_param->hip_d_tm_c, size_tm) == -1) return -1;
+          }
-      if (off)
-      {
-        if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_comp_buf, CL_TRUE, 0, off * sizeof (u32), device_param->pws_comp, 0, NULL, NULL) == -1) return -1;
+          if (device_param->is_opencl == true)
+          {
+            if (hc_clEnqueueCopyBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_tm_c, device_param->opencl_d_bfs_c, 0, 0, size_tm, 0, NULL, NULL) == -1) return -1;
+          }
+        }
       }
     }
 
-    if (run_kernel_decompress (hashcat_ctx, device_param, pws_cnt) == -1) return -1;
+    if (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL)
+    {
+      if (highest_pw_len < 16)
+      {
+        if (run_kernel (hashcat_ctx, device_param, KERN_RUN_1, pws_cnt, true, fast_iteration) == -1) return -1;
+      }
+      else if (highest_pw_len < 32)
+      {
+        if (run_kernel (hashcat_ctx, device_param, KERN_RUN_2, pws_cnt, true, fast_iteration) == -1) return -1;
+      }
+      else
+      {
+        if (run_kernel (hashcat_ctx, device_param, KERN_RUN_3, pws_cnt, true, fast_iteration) == -1) return -1;
+      }
+    }
+    else
+    {
+      if (run_kernel (hashcat_ctx, device_param, KERN_RUN_4, pws_cnt, true, fast_iteration) == -1) return -1;
+    }
   }
   else
   {
-    if (user_options_extra->attack_kern == ATTACK_KERN_STRAIGHT)
+    bool run_init = true;
+    bool run_loop = true;
+    bool run_comp = true;
+
+    if (run_init == true)
     {
       if (device_param->is_cuda == true)
       {
-        if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_pws_idx, device_param->pws_idx, pws_cnt * sizeof (pw_idx_t)) == -1) return -1;
-
-        const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt;
-
-        const u32 off = pw_idx->off;
+        if (hc_cuMemcpyDtoD (hashcat_ctx, device_param->cuda_d_pws_buf, device_param->cuda_d_pws_amp_buf, pws_cnt * sizeof (pw_t)) == -1) return -1;
+      }
-        if (off)
-        {
-          if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_pws_comp_buf, device_param->pws_comp, off * sizeof (u32)) == -1) return -1;
-        }
+      if (device_param->is_hip == true)
+      {
+        if (hc_hipMemcpyDtoD (hashcat_ctx, device_param->hip_d_pws_buf, device_param->hip_d_pws_amp_buf, pws_cnt * sizeof (pw_t)) == -1) return -1;
       }
 
       if (device_param->is_opencl == true)
       {
-        if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_idx, CL_TRUE, 0, pws_cnt * sizeof (pw_idx_t), device_param->pws_idx, 0, NULL, NULL) == -1) return -1;
-
-        const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt;
-
-        const u32 off = pw_idx->off;
+        if (hc_clEnqueueCopyBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_amp_buf, device_param->opencl_d_pws_buf, 0, 0, pws_cnt * sizeof (pw_t), 0, NULL, NULL) == -1) return -1;
+      }
-        if (off)
-        {
-          if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_comp_buf, CL_TRUE, 0, off * sizeof (u32), device_param->pws_comp, 0, NULL, NULL) == -1) return -1;
-        }
+      if (user_options->slow_candidates == true)
+      {
+      }
+      else
+      {
+        if (run_kernel_amp (hashcat_ctx, device_param, pws_cnt) == -1) return -1;
       }
 
-      if (run_kernel_decompress (hashcat_ctx, device_param, pws_cnt) == -1) return -1;
-    }
-    else if (user_options_extra->attack_kern == ATTACK_KERN_COMBI)
-    {
-      if (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL)
+      if (run_kernel (hashcat_ctx, device_param, KERN_RUN_1, pws_cnt, false, 0) == -1) return -1;
+
+      if (hashconfig->opts_type & OPTS_TYPE_HOOK12)
       {
-        if (user_options->attack_mode == ATTACK_MODE_COMBI)
-        {
-          if (combinator_ctx->combs_mode == COMBINATOR_MODE_BASE_RIGHT)
-          {
-            if (hashconfig->opts_type & OPTS_TYPE_PT_ADD01)
-            {
-              rebuild_pws_compressed_append (device_param, pws_cnt, 0x01);
-            }
-            else if (hashconfig->opts_type & OPTS_TYPE_PT_ADD06)
-            {
-              rebuild_pws_compressed_append (device_param, pws_cnt, 0x06);
-            }
-            else if (hashconfig->opts_type & OPTS_TYPE_PT_ADD80)
-            {
-              rebuild_pws_compressed_append (device_param, pws_cnt, 0x80);
-            }
-          }
-        }
-        else if (user_options->attack_mode == ATTACK_MODE_HYBRID2)
-        {
-          if (hashconfig->opts_type & OPTS_TYPE_PT_ADD01)
-          {
-            rebuild_pws_compressed_append (device_param, pws_cnt, 0x01);
-          }
-          else if (hashconfig->opts_type & OPTS_TYPE_PT_ADD06)
-          {
-            rebuild_pws_compressed_append (device_param, pws_cnt, 0x06);
-          }
-          else if (hashconfig->opts_type & OPTS_TYPE_PT_ADD80)
-          {
-            rebuild_pws_compressed_append (device_param, pws_cnt, 0x80);
-          }
-        }
+        if (run_kernel (hashcat_ctx, device_param, KERN_RUN_12, pws_cnt, false, 0) == -1) return -1;
 
         if (device_param->is_cuda == true)
         {
-          if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_pws_idx, device_param->pws_idx, pws_cnt * sizeof (pw_idx_t)) == -1) return -1;
-
-          const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt;
-
-          const u32 off = pw_idx->off;
+          if (hc_cuMemcpyDtoH (hashcat_ctx, device_param->hooks_buf, device_param->cuda_d_hooks, pws_cnt * hashconfig->hook_size) == -1) return -1;
+        }
-          if (off)
-          {
-            if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_pws_comp_buf, device_param->pws_comp, off * sizeof (u32)) == -1) return -1;
-          }
+        if (device_param->is_hip == true)
+        {
+          if (hc_hipMemcpyDtoH (hashcat_ctx, device_param->hooks_buf, device_param->hip_d_hooks, pws_cnt * hashconfig->hook_size) == -1) return -1;
         }
 
         if (device_param->is_opencl == true)
         {
-          if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_idx, CL_TRUE, 0, pws_cnt * sizeof (pw_idx_t), device_param->pws_idx, 0, NULL, NULL) == -1) return -1;
-
-          const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt;
-
-          const u32 off = pw_idx->off;
-
-          if (off)
-          {
-            if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_comp_buf, CL_TRUE, 0, off * sizeof (u32), device_param->pws_comp, 0, NULL, NULL) == -1) return -1;
-          }
+          if (hc_clEnqueueReadBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_hooks, CL_TRUE, 0, pws_cnt * hashconfig->hook_size, device_param->hooks_buf, 0, NULL, NULL) == -1) return -1;
         }
 
-        if (run_kernel_decompress (hashcat_ctx, device_param, pws_cnt) == -1) return -1;
+        const int hook_threads = (int) user_options->hook_threads;
-      }
-      else
-      {
+        hook_thread_param_t *hook_threads_param = (hook_thread_param_t *) hccalloc (hook_threads, sizeof (hook_thread_param_t));
-        if (user_options->attack_mode == ATTACK_MODE_COMBI)
-        {
-          if (device_param->is_cuda == true)
-          {
-            if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_pws_idx, device_param->pws_idx, pws_cnt * sizeof (pw_idx_t)) == -1) return -1;
+        for (int i = 0; i < hook_threads; i++)
-            const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt;
+        {
-            const u32 off = pw_idx->off;
+          hook_thread_param_t *hook_thread_param = hook_threads_param + i;
-            if (off)
-            {
-              if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_pws_comp_buf, device_param->pws_comp, off * sizeof (u32)) == -1) return -1;
-            }
-          }
+          hook_thread_param->tid = i;
+          hook_thread_param->tsz = hook_threads;
-          if (device_param->is_opencl == true)
-          {
-            if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_idx, CL_TRUE, 0, pws_cnt * sizeof (pw_idx_t), device_param->pws_idx, 0, NULL, NULL) == -1) return -1;
+          hook_thread_param->module_ctx = module_ctx;
+          hook_thread_param->status_ctx = status_ctx;
-            const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt;
+          hook_thread_param->device_param = device_param;
-            const u32 off = pw_idx->off;
+          hook_thread_param->hook_salts_buf = hashes->hook_salts_buf;
-            if (off)
-            {
-              if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_comp_buf, CL_TRUE, 0, off * sizeof (u32), device_param->pws_comp, 0, NULL, NULL) == -1) return -1;
-            }
-          }
+          hook_thread_param->salt_pos = salt_pos;
-          if (run_kernel_decompress (hashcat_ctx, device_param, pws_cnt) == -1) return -1;
+          hook_thread_param->pws_cnt = pws_cnt;
+        }
-        else if (user_options->attack_mode == ATTACK_MODE_HYBRID1)
-        {
-          if (device_param->is_cuda == true)
-          {
-            if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_pws_idx, device_param->pws_idx, pws_cnt * sizeof (pw_idx_t)) == -1) return -1;
-            const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt;
-
-            const u32 off = pw_idx->off;
+        hc_thread_t *c_threads = (hc_thread_t *) hccalloc (hook_threads, sizeof (hc_thread_t));
-            if (off)
-            {
-              if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_pws_comp_buf, device_param->pws_comp, off * sizeof (u32)) == -1) return -1;
-            }
-          }
+        for (int i = 0; i < hook_threads; i++)
+        {
+          hook_thread_param_t *hook_thread_param = hook_threads_param + i;
-          if (device_param->is_opencl == true)
-          {
-            if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_idx, CL_TRUE, 0, pws_cnt * sizeof (pw_idx_t), device_param->pws_idx, 0, NULL, NULL) == -1) return -1;
+          hc_thread_create (c_threads[i], hook12_thread, hook_thread_param);
+        }
-            const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt;
+        hc_thread_wait (hook_threads, c_threads);
-            const u32 off = pw_idx->off;
+        hcfree (c_threads);
-            if (off)
-            {
-              if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_comp_buf, CL_TRUE, 0, off * sizeof (u32), device_param->pws_comp, 0, NULL, NULL) == -1) return -1;
-            }
-          }
+        hcfree (hook_threads_param);
-          if (run_kernel_decompress (hashcat_ctx, device_param, pws_cnt) == -1) return -1;
-        }
-        else if (user_options->attack_mode == ATTACK_MODE_HYBRID2)
+        if (device_param->is_cuda == true)
         {
-          const u64 off = device_param->words_off;
+          if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_hooks, device_param->hooks_buf, pws_cnt * hashconfig->hook_size) == -1) return -1;
+        }
-          device_param->kernel_params_mp_buf64[3] = off;
+        if (device_param->is_hip == true)
+        {
+          if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_hooks, device_param->hooks_buf, pws_cnt * hashconfig->hook_size) == -1) return -1;
+        }
-          if (run_kernel_mp (hashcat_ctx, device_param, KERN_RUN_MP, pws_cnt) == -1) return -1;
+        if (device_param->is_opencl == true)
+        {
+          if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_hooks, CL_TRUE, 0, pws_cnt * hashconfig->hook_size, device_param->hooks_buf, 0, NULL, NULL) == -1) return -1;
+        }
+      }
+    }
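The OPTS_TYPE_HOOK12 path just added is a full round trip: KERN_RUN_12 fills d_hooks on the device, the buffer is copied to the host, user_options->hook_threads worker threads post-process it in parallel, and the result is written back before the loop phase continues. Each worker receives its index (tid) and the worker count (tsz); how the range of pws_cnt candidates is split between workers is up to the module's hook function. A minimal sketch of that fan-out using plain POSIX threads and an assumed blocked partition (the real code uses hashcat's hc_thread_* wrappers and module-defined hook12/hook23 workers, so everything below is illustrative):

  #include <pthread.h>
  #include <stdlib.h>

  typedef struct
  {
    int tid; // this worker's index
    int tsz; // total number of workers

    unsigned long long pws_cnt;
  } hook_param_t;

  static void *hook_worker (void *arg)
  {
    hook_param_t *p = (hook_param_t *) arg;

    // assumed blocked partition: worker tid handles candidates [lo, hi)
    const unsigned long long chunk = (p->pws_cnt + p->tsz - 1) / p->tsz;
    const unsigned long long lo    = chunk * p->tid;

    unsigned long long hi = lo + chunk;

    if (hi > p->pws_cnt) hi = p->pws_cnt;

    for (unsigned long long i = lo; i < hi; i++)
    {
      // per-candidate hook work on the host-side hooks buffer would go here
    }

    return NULL;
  }

  int main (void)
  {
    const int hook_threads = 4;

    hook_param_t *params  = (hook_param_t *) calloc (hook_threads, sizeof (hook_param_t));
    pthread_t    *threads = (pthread_t *)    calloc (hook_threads, sizeof (pthread_t));

    for (int i = 0; i < hook_threads; i++)
    {
      params[i].tid     = i;
      params[i].tsz     = hook_threads;
      params[i].pws_cnt = 1000;

      pthread_create (&threads[i], NULL, hook_worker, &params[i]);
    }

    for (int i = 0; i < hook_threads; i++) pthread_join (threads[i], NULL);

    free (threads);
    free (params);

    return 0;
  }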
-    else if (user_options_extra->attack_kern == ATTACK_KERN_BF)
+
+    if (run_loop == true)
     {
-      const u64 off = device_param->words_off;
+      u32 iter = hashes->salts_buf[salt_pos].salt_iter;
-      device_param->kernel_params_mp_l_buf64[3] = off;
+      u32 loop_step = device_param->kernel_loops;
-      if (run_kernel_mp (hashcat_ctx, device_param, KERN_RUN_MP_L, pws_cnt) == -1) return -1;
-    }
-  }
+      for (u32 loop_pos = 0, slow_iteration = 0; loop_pos < iter; loop_pos += loop_step, slow_iteration++)
+      {
+        u32 loop_left = iter - loop_pos;
-  return 0;
-}
+        loop_left = MIN (loop_left, loop_step);
-int run_cracker (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u64 pws_cnt)
-{
-  combinator_ctx_t     *combinator_ctx     = hashcat_ctx->combinator_ctx;
-  hashconfig_t         *hashconfig         = hashcat_ctx->hashconfig;
-  hashes_t             *hashes             = hashcat_ctx->hashes;
-  mask_ctx_t           *mask_ctx           = hashcat_ctx->mask_ctx;
-  status_ctx_t         *status_ctx         = hashcat_ctx->status_ctx;
-  straight_ctx_t       *straight_ctx       = hashcat_ctx->straight_ctx;
-  user_options_t       *user_options       = hashcat_ctx->user_options;
-  user_options_extra_t *user_options_extra = hashcat_ctx->user_options_extra;
+        device_param->kernel_params_buf32[28] = loop_pos;
+        device_param->kernel_params_buf32[29] = loop_left;
-  // do the on-the-fly combinator mode encoding
+        if (run_kernel (hashcat_ctx, device_param, KERN_RUN_2, pws_cnt, true, slow_iteration) == -1) return -1;
-  bool iconv_enabled = false;
+        if (hashconfig->opts_type & OPTS_TYPE_LOOP_EXTENDED)
+        {
+          if (run_kernel (hashcat_ctx, device_param, KERN_RUN_2E, pws_cnt, true, slow_iteration) == -1) return -1;
+        }
-  iconv_t iconv_ctx = NULL;
+        //bug?
+        //while (status_ctx->run_thread_level2 == false) break;
+        if (status_ctx->run_thread_level2 == false) break;
-  char *iconv_tmp = NULL;
+        /**
+         * speed
+         */
-  if (strcmp (user_options->encoding_from, user_options->encoding_to) != 0)
-  {
-    iconv_enabled = true;
+        const float iter_part = (float) (loop_pos + loop_left) / iter;
-    iconv_ctx = iconv_open (user_options->encoding_to, user_options->encoding_from);
+        const u64 perf_sum_all = (u64) (pws_cnt * iter_part);
-    if (iconv_ctx == (iconv_t) -1) return -1;
+        double speed_msec = hc_timer_get (device_param->timer_speed);
-    iconv_tmp = (char *) hcmalloc (HCBUFSIZ_TINY);
-  }
-
-  // find higest password length, this is for optimization stuff
-
-  u32 highest_pw_len = 0;
-
-  if (user_options->slow_candidates == true)
-  {
-    /*
-    for (u64 pws_idx = 0; pws_idx < pws_cnt; pws_idx++)
-    {
-      pw_idx_t *pw_idx = device_param->pws_idx + pws_idx;
+        const u32 speed_pos = device_param->speed_pos;
-      highest_pw_len = MAX (highest_pw_len, pw_idx->len);
-    }
-    */
-  }
-  else
-  {
-    if (user_options_extra->attack_kern == ATTACK_KERN_STRAIGHT)
-    {
-    }
-    else if (user_options_extra->attack_kern == ATTACK_KERN_COMBI)
-    {
-    }
-    else if (user_options_extra->attack_kern == ATTACK_KERN_BF)
-    {
-      highest_pw_len = device_param->kernel_params_mp_l_buf32[4]
-                     + device_param->kernel_params_mp_l_buf32[5];
-    }
-  }
+        device_param->speed_cnt[speed_pos] = perf_sum_all;
-  // we make use of this in status view
+        device_param->speed_msec[speed_pos] = speed_msec;
-  device_param->outerloop_multi = 1;
-  device_param->outerloop_msec  = 0;
-  device_param->outerloop_pos   = 0;
-  device_param->outerloop_left  = pws_cnt;
+        if (user_options->speed_only == true)
+        {
+          if (speed_msec > 4000)
+          {
+            device_param->outerloop_multi *= (double) iter / (double) (loop_pos + loop_left);
-  // we ignore the time to copy data over pci bus in this case
+            device_param->speed_pos = 1;
-  if (user_options->speed_only == true)
-  {
-    hc_timer_set (&device_param->timer_speed);
-  }
+            device_param->speed_only_finish = true;
-  // loop start: most outer loop = salt iteration, then innerloops (if multi)
+            return 0;
+          }
+        }
+      }
-  for (u32 salt_pos = 0; salt_pos < hashes->salts_cnt; salt_pos++)
-  {
-    while (status_ctx->devices_status == STATUS_PAUSED) sleep (1);
+      if (hashconfig->opts_type & OPTS_TYPE_HOOK23)
+      {
+        if (run_kernel (hashcat_ctx, device_param, KERN_RUN_23, pws_cnt, false, 0) == -1) return -1;
-    salt_t *salt_buf = &hashes->salts_buf[salt_pos];
+        if (device_param->is_cuda == true)
+        {
+          if (hc_cuMemcpyDtoH (hashcat_ctx, device_param->hooks_buf, device_param->cuda_d_hooks, pws_cnt * hashconfig->hook_size) == -1) return -1;
+        }
-    device_param->kernel_params_buf32[27] = salt_pos;
-    device_param->kernel_params_buf32[31] = salt_buf->digests_cnt;
-    device_param->kernel_params_buf32[32] = salt_buf->digests_offset;
+        if (device_param->is_hip == true)
+        {
+          if (hc_hipMemcpyDtoH (hashcat_ctx, device_param->hooks_buf, device_param->hip_d_hooks, pws_cnt * hashconfig->hook_size) == -1) return -1;
+        }
-    HCFILE *combs_fp = &device_param->combs_fp;
+        if (device_param->is_opencl == true)
+        {
+          if (hc_clEnqueueReadBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_hooks, CL_TRUE, 0, pws_cnt * hashconfig->hook_size, device_param->hooks_buf, 0, NULL, NULL) == -1) return -1;
+        }
-    if (user_options->slow_candidates == true)
-    {
-    }
-    else
-    {
-      if ((user_options->attack_mode == ATTACK_MODE_COMBI) || (((hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL) == 0) && (user_options->attack_mode == ATTACK_MODE_HYBRID2)))
-      {
-        hc_rewind (combs_fp);
-      }
-    }
+        const int hook_threads = (int) user_options->hook_threads;
-    // iteration type
+        hook_thread_param_t *hook_threads_param = (hook_thread_param_t *) hccalloc (hook_threads, sizeof (hook_thread_param_t));
-    u32 innerloop_step = 0;
-    u32 innerloop_cnt  = 0;
+        for (int i = 0; i < hook_threads; i++)
+        {
+          hook_thread_param_t *hook_thread_param = hook_threads_param + i;
-    if (user_options->slow_candidates == true)
-    {
-      innerloop_step = 1;
-      innerloop_cnt  = 1;
-    }
-    else
-    {
-      if   (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL) innerloop_step = device_param->kernel_loops;
-      else                                                        innerloop_step = 1;
+          hook_thread_param->tid = i;
+          hook_thread_param->tsz = hook_threads;
-      if      (user_options_extra->attack_kern == ATTACK_KERN_STRAIGHT) innerloop_cnt = straight_ctx->kernel_rules_cnt;
-      else if (user_options_extra->attack_kern == ATTACK_KERN_COMBI)    innerloop_cnt = (u32) combinator_ctx->combs_cnt;
-      else if (user_options_extra->attack_kern == ATTACK_KERN_BF)       innerloop_cnt = (u32) mask_ctx->bfs_cnt;
-    }
+          hook_thread_param->module_ctx = module_ctx;
+          hook_thread_param->status_ctx = status_ctx;
-    // innerloops
+          hook_thread_param->device_param = device_param;
-    for (u32 innerloop_pos = 0; innerloop_pos < innerloop_cnt; innerloop_pos += innerloop_step)
-    {
-      while (status_ctx->devices_status == STATUS_PAUSED) sleep (1);
+          hook_thread_param->hook_salts_buf = hashes->hook_salts_buf;
-      u32 fast_iteration = 0;
+          hook_thread_param->salt_pos = salt_pos;
-      u32 innerloop_left = innerloop_cnt - innerloop_pos;
+          hook_thread_param->pws_cnt = pws_cnt;
+        }
-      if (innerloop_left > innerloop_step)
-      {
-        innerloop_left = innerloop_step;
+        hc_thread_t *c_threads = (hc_thread_t *) hccalloc (hook_threads, sizeof (hc_thread_t));
-        fast_iteration = 1;
-      }
+        for (int i = 0; i < hook_threads; i++)
+        {
+          hook_thread_param_t *hook_thread_param = hook_threads_param + i;
-      hc_thread_mutex_lock (status_ctx->mux_display);
+          hc_thread_create (c_threads[i], hook23_thread, hook_thread_param);
+        }
-      device_param->innerloop_pos  = innerloop_pos;
-      device_param->innerloop_left = innerloop_left;
+        hc_thread_wait (hook_threads, c_threads);
-      device_param->kernel_params_buf32[30] = innerloop_left;
+        hcfree (c_threads);
-      device_param->outerloop_multi = (double) innerloop_cnt / (double) (innerloop_pos + innerloop_left);
+        hcfree (hook_threads_param);
-      hc_thread_mutex_unlock (status_ctx->mux_display);
+        if (device_param->is_cuda == true)
+        {
+          if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_hooks, device_param->hooks_buf, pws_cnt * hashconfig->hook_size) == -1) return -1;
+        }
-      if (hashes->salts_shown[salt_pos] == 1)
-      {
-        status_ctx->words_progress_done[salt_pos] += pws_cnt * innerloop_left;
+        if (device_param->is_hip == true)
+        {
+          if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_hooks, device_param->hooks_buf, pws_cnt * hashconfig->hook_size) == -1) return -1;
+        }
-        continue;
+        if (device_param->is_opencl == true)
+        {
+          if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_hooks, CL_TRUE, 0, pws_cnt * hashconfig->hook_size, device_param->hooks_buf, 0, NULL, NULL) == -1) return -1;
+        }
       }
+    }
-      // initialize and copy amplifiers
+    // init2 and loop2 are kind of special, we use run_loop for them, too
-      if (user_options->slow_candidates == true)
+    if (run_loop == true)
+    {
+      // note: they also do not influence the performance screen
+      // in case you want to use this, this can make sense only if your input data comes out of tmps[]
+
+      if (hashconfig->opts_type & OPTS_TYPE_INIT2)
       {
+        if (run_kernel (hashcat_ctx, device_param, KERN_RUN_INIT2, pws_cnt, false, 0) == -1) return -1;
       }
-      else
+
+      if (hashconfig->opts_type & OPTS_TYPE_LOOP2)
       {
-        if (user_options_extra->attack_kern == ATTACK_KERN_STRAIGHT)
-        {
-          if (device_param->is_cuda == true)
-          {
-            if (hc_cuMemcpyDtoD (hashcat_ctx, device_param->cuda_d_rules_c, device_param->cuda_d_rules + (innerloop_pos * sizeof (kernel_rule_t)), innerloop_left * sizeof (kernel_rule_t)) == -1) return -1;
-          }
+        u32 iter = hashes->salts_buf[salt_pos].salt_iter2;
-          if (device_param->is_opencl == true)
-          {
-            if (hc_clEnqueueCopyBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_rules, device_param->opencl_d_rules_c, innerloop_pos * sizeof (kernel_rule_t), 0, innerloop_left * sizeof (kernel_rule_t), 0, NULL, NULL) == -1) return -1;
-          }
-        }
-        else if (user_options_extra->attack_kern == ATTACK_KERN_COMBI)
+        u32 loop_step = device_param->kernel_loops;
+
+        for (u32 loop_pos = 0, slow_iteration = 0; loop_pos < iter; loop_pos += loop_step, slow_iteration++)
         {
-          if (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL)
-          {
-            if (user_options->attack_mode == ATTACK_MODE_COMBI)
-            {
-              char *line_buf = device_param->scratch_buf;
+          u32 loop_left = iter - loop_pos;
-              u32 i = 0;
+          loop_left = MIN (loop_left, loop_step);
-              while (i < innerloop_left)
-              {
-                if (hc_feof (combs_fp)) break;
+          device_param->kernel_params_buf32[28] = loop_pos;
+          device_param->kernel_params_buf32[29] = loop_left;
-                size_t line_len = fgetl (combs_fp, line_buf, HCBUFSIZ_LARGE);
+          if (run_kernel (hashcat_ctx, device_param, KERN_RUN_LOOP2, pws_cnt, true, slow_iteration) == -1) return -1;
-                line_len = convert_from_hex (hashcat_ctx, line_buf, line_len);
+          //bug?
+          //while (status_ctx->run_thread_level2 == false) break;
+          if (status_ctx->run_thread_level2 == false) break;
+        }
+      }
+    }
-
-                if (line_len > PW_MAX) continue;
+    if (run_comp == true)
+    {
+      if (hashconfig->opts_type & OPTS_TYPE_DEEP_COMP_KERNEL)
+      {
+        const u32 loops_cnt = hashes->salts_buf[salt_pos].digests_cnt;
-                char *line_buf_new = line_buf;
+        for (u32 loops_pos = 0; loops_pos < loops_cnt; loops_pos++)
+        {
+          device_param->kernel_params_buf32[28] = loops_pos;
+          device_param->kernel_params_buf32[29] = loops_cnt;
-                char rule_buf_out[RP_PASSWORD_SIZE];
+          const u32 deep_comp_kernel = module_ctx->module_deep_comp_kernel (hashes, salt_pos, loops_pos);
-                if (run_rule_engine (user_options_extra->rule_len_r, user_options->rule_buf_r))
-                {
-                  if (line_len >= RP_PASSWORD_SIZE) continue;
+          if (run_kernel (hashcat_ctx, device_param, deep_comp_kernel, pws_cnt, false, 0) == -1) return -1;
-                  memset (rule_buf_out, 0, sizeof (rule_buf_out));
+          if (status_ctx->run_thread_level2 == false) break;
+        }
+      }
+      else
+      {
+        if (run_kernel (hashcat_ctx, device_param, KERN_RUN_3, pws_cnt, false, 0) == -1) return -1;
+      }
+    }
-                  const int rule_len_out = _old_apply_rule (user_options->rule_buf_r, user_options_extra->rule_len_r, line_buf, (u32) line_len, rule_buf_out);
+    /*
+     * maybe we should add this zeroing of the temporary buffers
+     * however it drops the performance from 7055338 to 7010621
-                  if (rule_len_out < 0)
-                  {
-                    status_ctx->words_progress_rejected[salt_pos] += pws_cnt;
+    if (device_param->is_cuda == true)
+    {
+      if (run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_tmps, device_param->size_tmps) == -1) return -1;
+    }
-                    continue;
-                  }
+    if (device_param->is_hip == true)
+    {
+      if (run_hip_kernel_bzero (hashcat_ctx, device_param, device_param->hip_d_tmps, device_param->size_tmps) == -1) return -1;
+    }
-                  line_len = rule_len_out;
+    if (device_param->is_opencl == true)
+    {
+      if (run_opencl_kernel_bzero (hashcat_ctx, device_param, device_param->opencl_d_tmps, device_param->size_tmps) == -1) return -1;
+    }
+    */
-                  line_buf_new = rule_buf_out;
-                }
-
-                // do the on-the-fly encoding
-
-                if (iconv_enabled == true)
-                {
-                  char  *iconv_ptr = iconv_tmp;
-                  size_t iconv_sz  = HCBUFSIZ_TINY;
+    if ((hashconfig->opts_type & OPTS_TYPE_HOOK12) || (hashconfig->opts_type & OPTS_TYPE_HOOK23))
+    {
+      if (device_param->is_cuda == true)
+      {
+        if (run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_hooks, pws_cnt * hashconfig->hook_size) == -1) return -1;
+      }
-                  if (iconv (iconv_ctx, &line_buf_new, &line_len, &iconv_ptr, &iconv_sz) == (size_t) -1) continue;
+      if (device_param->is_hip == true)
+      {
+        if (run_hip_kernel_bzero (hashcat_ctx, device_param, device_param->hip_d_hooks, pws_cnt * hashconfig->hook_size) == -1) return -1;
+      }
-                  line_buf_new = iconv_tmp;
-                  line_len     = HCBUFSIZ_TINY - iconv_sz;
-                }
+      if (device_param->is_opencl == true)
+      {
+        if (run_opencl_kernel_bzero (hashcat_ctx, device_param, device_param->opencl_d_hooks, pws_cnt * hashconfig->hook_size) == -1) return -1;
+      }
+    }
+  }
-                line_len = MIN (line_len, PW_MAX);
+  return 0;
+}
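Every multi-pass phase of choose_kernel is driven by the same chunking scheme: the remaining iterations are cut into pieces of at most device_param->kernel_loops, the chunk's start and size are handed to the kernel through kernel_params_buf32[28] and [29], and the loop bails out early once run_thread_level2 goes false. The main KERN_RUN_2 loop, the LOOP2 loop, and the DEEP_COMP digest loop all reuse it. The arithmetic in isolation (constants below are made-up example values):

  #include <stdio.h>

  typedef unsigned int u32;

  #define MIN(a,b) (((a) < (b)) ? (a) : (b))

  int main (void)
  {
    const u32 iter      = 10000; // e.g. salts_buf[salt_pos].salt_iter
    const u32 loop_step = 1024;  // e.g. device_param->kernel_loops

    for (u32 loop_pos = 0, slow_iteration = 0; loop_pos < iter; loop_pos += loop_step, slow_iteration++)
    {
      u32 loop_left = iter - loop_pos;

      loop_left = MIN (loop_left, loop_step);

      // these two values land in kernel_params_buf32[28] and [29]
      printf ("chunk %u: pos=%u left=%u\n", slow_iteration, loop_pos, loop_left);
    }

    return 0;
  }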
-                u8 *ptr = (u8 *) device_param->combs_buf[i].i;
+void rebuild_pws_compressed_append (hc_device_param_t *device_param, const u64 pws_cnt, const u8 chr)
+{
+  // this function is used if we have to modify the compressed pws buffer in order to
+  // append some data to each password candidate
-                memcpy (ptr, line_buf_new, line_len);
+  u32      *tmp_pws_comp = (u32 *)      hcmalloc (device_param->size_pws_comp);
+  pw_idx_t *tmp_pws_idx  = (pw_idx_t *) hcmalloc (device_param->size_pws_idx);
-                memset (ptr + line_len, 0, PW_MAX - line_len);
+  for (u32 i = 0; i < pws_cnt; i++)
+  {
+    pw_idx_t *pw_idx_src = device_param->pws_idx + i;
+    pw_idx_t *pw_idx_dst = tmp_pws_idx + i;
-                if (hashconfig->opts_type & OPTS_TYPE_PT_UPPER)
-                {
-                  uppercase (ptr, line_len);
-                }
+    const u32 src_off = pw_idx_src->off;
+    const u32 src_len = pw_idx_src->len;
-                if (combinator_ctx->combs_mode == COMBINATOR_MODE_BASE_LEFT)
-                {
-                  if (hashconfig->opts_type & OPTS_TYPE_PT_ADD80)
-                  {
-                    ptr[line_len] = 0x80;
-                  }
+    u8 buf[256];
-                  if (hashconfig->opts_type & OPTS_TYPE_PT_ADD06)
-                  {
-                    ptr[line_len] = 0x06;
-                  }
+    memcpy (buf, device_param->pws_comp + src_off, src_len);
-                  if (hashconfig->opts_type & OPTS_TYPE_PT_ADD01)
-                  {
-                    ptr[line_len] = 0x01;
-                  }
-                }
+    buf[src_len] = chr;
+
+    const u32 dst_len = src_len + 1;
+
+    const u32 dst_pw_len4 = (dst_len + 3) & ~3; // round up to multiple of 4
+
+    const u32 dst_pw_len4_cnt = dst_pw_len4 / 4;
+
+    pw_idx_dst->cnt = dst_pw_len4_cnt;
+    pw_idx_dst->len = src_len; // this is intentional! len must stay src_len, not dst_len; we don't want the kernel to treat the appended 0x80 as part of the password
+
+    u8 *dst = (u8 *) (tmp_pws_comp + pw_idx_dst->off);
-                device_param->combs_buf[i].pw_len = (u32) line_len;
+    memcpy (dst, buf, dst_len);
-                i++;
-              }
+    memset (dst + dst_len, 0, dst_pw_len4 - dst_len);
-              for (u32 j = i; j < innerloop_left; j++)
-              {
-                memset (&device_param->combs_buf[j], 0, sizeof (pw_t));
-              }
+    // prepare next element
-              innerloop_left = i;
+    pw_idx_t *pw_idx_dst_next = pw_idx_dst + 1;
-              if (device_param->is_cuda == true)
-              {
-                if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_combs_c, device_param->combs_buf, innerloop_left * sizeof (pw_t)) == -1) return -1;
-              }
+    pw_idx_dst_next->off = pw_idx_dst->off + pw_idx_dst->cnt;
+  }
-              if (device_param->is_opencl == true)
-              {
-                if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_combs_c, CL_TRUE, 0, innerloop_left * sizeof (pw_t), device_param->combs_buf, 0, NULL, NULL) == -1) return -1;
-              }
-            }
-            else if (user_options->attack_mode == ATTACK_MODE_HYBRID1)
-            {
-              u64 off = innerloop_pos;
+  memcpy (device_param->pws_comp, tmp_pws_comp, device_param->size_pws_comp);
+  memcpy (device_param->pws_idx,  tmp_pws_idx,  device_param->size_pws_idx);
-              device_param->kernel_params_mp_buf64[3] = off;
+  hcfree (tmp_pws_comp);
+  hcfree (tmp_pws_idx);
+}
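rebuild_pws_compressed_append widens each compressed candidate by exactly one byte (the 0x01/0x06/0x80 marker some optimized kernels expect), re-rounds the new byte length up to whole u32 words for cnt, but deliberately keeps len at src_len so the kernel never treats the marker as password material. The length bookkeeping on its own:

  #include <stdio.h>

  typedef unsigned int u32;

  int main (void)
  {
    const u32 src_len = 5; // candidate is 5 bytes, e.g. "abcde"

    const u32 dst_len = src_len + 1; // one marker byte (0x80/0x06/0x01) appended

    const u32 dst_pw_len4 = (dst_len + 3) & ~3u; // round up to a multiple of 4 bytes

    const u32 dst_pw_len4_cnt = dst_pw_len4 / 4; // cnt is stored in u32 words

    printf ("len=%u (unchanged), cnt=%u words, padded to %u bytes\n", src_len, dst_pw_len4_cnt, dst_pw_len4);

    return 0;
  }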
-              if (run_kernel_mp (hashcat_ctx, device_param, KERN_RUN_MP, innerloop_left) == -1) return -1;
+int run_cuda_kernel_atinit (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, CUdeviceptr buf, const u64 num)
+{
+  u64 num_elements = num;
-              if (device_param->is_cuda == true)
-              {
-                if (hc_cuMemcpyDtoD (hashcat_ctx, device_param->cuda_d_combs_c, device_param->cuda_d_combs, innerloop_left * sizeof (pw_t)) == -1) return -1;
-              }
+  device_param->kernel_params_atinit[0]       = (void *) &buf;
+  device_param->kernel_params_atinit_buf64[1] = num_elements;
-              if (device_param->is_opencl == true)
-              {
-                if (hc_clEnqueueCopyBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_combs, device_param->opencl_d_combs_c, 0, 0, innerloop_left * sizeof (pw_t), 0, NULL, NULL) == -1) return -1;
-              }
-            }
-          }
-          else
-          {
-            if ((user_options->attack_mode == ATTACK_MODE_COMBI) || (user_options->attack_mode == ATTACK_MODE_HYBRID2))
-            {
-              char *line_buf = device_param->scratch_buf;
+  const u64 kernel_threads = device_param->kernel_wgs_atinit;
-              u32 i = 0;
+  num_elements = CEILDIV (num_elements, kernel_threads);
-              while (i < innerloop_left)
-              {
-                if (hc_feof (combs_fp)) break;
+  CUfunction function = device_param->cuda_function_atinit;
-                size_t line_len = fgetl (combs_fp, line_buf, HCBUFSIZ_LARGE);
+  if (hc_cuLaunchKernel (hashcat_ctx, function, num_elements, 1, 1, kernel_threads, 1, 1, 0, device_param->cuda_stream, device_param->kernel_params_atinit, NULL) == -1) return -1;
-                line_len = convert_from_hex (hashcat_ctx, line_buf, line_len);
+  if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1;
-                if (line_len > PW_MAX) continue;
+  return 0;
+}
-                char *line_buf_new = line_buf;
+int run_hip_kernel_atinit (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, HIPdeviceptr buf, const u64 num)
+{
+  u64 num_elements = num;
-                char rule_buf_out[RP_PASSWORD_SIZE];
-
-                if (run_rule_engine (user_options_extra->rule_len_r, user_options->rule_buf_r))
-                {
-                  if (line_len >= RP_PASSWORD_SIZE) continue;
+  device_param->kernel_params_atinit[0]       = (void *) &buf;
+  device_param->kernel_params_atinit_buf64[1] = num_elements;
-                  memset (rule_buf_out, 0, sizeof (rule_buf_out));
+  const u64 kernel_threads = device_param->kernel_wgs_atinit;
-                  const int rule_len_out = _old_apply_rule (user_options->rule_buf_r, user_options_extra->rule_len_r, line_buf, (u32) line_len, rule_buf_out);
+  num_elements = CEILDIV (num_elements, kernel_threads);
-                  if (rule_len_out < 0)
-                  {
-                    status_ctx->words_progress_rejected[salt_pos] += pws_cnt;
+  HIPfunction function = device_param->hip_function_atinit;
-                    continue;
-                  }
+  if (hc_hipLaunchKernel (hashcat_ctx, function, num_elements, 1, 1, kernel_threads, 1, 1, 0, device_param->hip_stream, device_param->kernel_params_atinit, NULL) == -1) return -1;
-                  line_len = rule_len_out;
+  if (hc_hipStreamSynchronize (hashcat_ctx, device_param->hip_stream) == -1) return -1;
-                  line_buf_new = rule_buf_out;
-                }
+  return 0;
+}
-                // do the on-the-fly encoding
+int run_cuda_kernel_memset (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, CUdeviceptr buf, const u32 value, const u64 size)
+{
+  const u64 num16d = size / 16;
+  const u64 num16m = size % 16;
-                if (iconv_enabled == true)
-                {
-                  char  *iconv_ptr = iconv_tmp;
-                  size_t iconv_sz  = HCBUFSIZ_TINY;
+  if (num16d)
+  {
+    device_param->kernel_params_memset[0]       = (void *) &buf;
+    device_param->kernel_params_memset_buf32[1] = value;
+    device_param->kernel_params_memset_buf64[2] = num16d;
-                  if (iconv (iconv_ctx, &line_buf_new, &line_len, &iconv_ptr, &iconv_sz) == (size_t) -1) continue;
+    const u64 kernel_threads = device_param->kernel_wgs_memset;
-                  line_buf_new = iconv_tmp;
-                  line_len     = HCBUFSIZ_TINY - iconv_sz;
-                }
+    u64 num_elements = num16d;
-                line_len = MIN (line_len, PW_MAX);
+    num_elements = CEILDIV (num_elements, kernel_threads);
-                u8 *ptr = (u8 *) device_param->combs_buf[i].i;
+    CUfunction function = device_param->cuda_function_memset;
-                memcpy (ptr, line_buf_new, line_len);
+    //CU_rc = hc_clSetKernelArg (hashcat_ctx, kernel, 0, sizeof (cl_mem),   (void *) &buf);                          if (CU_rc == -1) return -1;
+    //CU_rc = hc_clSetKernelArg (hashcat_ctx, kernel, 1, sizeof (cl_uint),  device_param->kernel_params_memset[1]); if (CU_rc == -1) return -1;
+    //CU_rc = hc_clSetKernelArg (hashcat_ctx, kernel, 2, sizeof (cl_ulong), device_param->kernel_params_memset[2]); if (CU_rc == -1) return -1;
-                memset (ptr + line_len, 0, PW_MAX - line_len);
+    //const size_t global_work_size[3] = { num_elements,   1, 1 };
+    //const size_t local_work_size[3]  = { kernel_threads, 1, 1 };
-                if (hashconfig->opts_type & OPTS_TYPE_PT_UPPER)
-                {
-                  uppercase (ptr, line_len);
-                }
+    if (hc_cuLaunchKernel (hashcat_ctx, function, num_elements, 1, 1, kernel_threads, 1, 1, 0, device_param->cuda_stream, device_param->kernel_params_memset, NULL) == -1) return -1;
-                /*
-                if (combinator_ctx->combs_mode == COMBINATOR_MODE_BASE_LEFT)
-                {
-                  if (hashconfig->opts_type & OPTS_TYPE_PT_ADD80)
-                  {
-                    ptr[line_len] = 0x80;
-                  }
+    if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1;
+  }
-                  if (hashconfig->opts_type & OPTS_TYPE_PT_ADD06)
-                  {
-                    ptr[line_len] = 0x06;
-                  }
+  if (num16m)
+  {
+    u32 tmp[4];
-                  if (hashconfig->opts_type & OPTS_TYPE_PT_ADD01)
-                  {
-                    ptr[line_len] = 0x01;
-                  }
-                }
-                */
+    tmp[0] = value;
+    tmp[1] = value;
+    tmp[2] = value;
+    tmp[3] = value;
-                device_param->combs_buf[i].pw_len = (u32) line_len;
+    // Apparently we are allowed to do this: https://devtalk.nvidia.com/default/topic/761515/how-to-copy-to-device-memory-with-offset-/
-                i++;
-              }
+    if (hc_cuMemcpyHtoD (hashcat_ctx, buf + (num16d * 16), tmp, num16m) == -1) return -1;
+  }
-              for (u32 j = i; j < innerloop_left; j++)
-              {
-                memset (&device_param->combs_buf[j], 0, sizeof (pw_t));
-              }
+  return 0;
+}
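run_cuda_kernel_memset (and the HIP twin that follows) splits the fill in two: whole 16-byte blocks are written by the memset kernel (num16d = size / 16), and the ragged tail of size % 16 bytes is patched with a small host-to-device copy of a replicated 4-word value at byte offset num16d * 16, relying on byte-offset pointer arithmetic on the device pointer being permitted. The split computation by itself:

  #include <stdio.h>

  typedef unsigned long long u64;

  int main (void)
  {
    const u64 size = 1000; // bytes to fill

    const u64 num16d = size / 16; // 62 full 16-byte blocks handled by the kernel
    const u64 num16m = size % 16; // 8 tail bytes handled by a small HtoD copy

    // the tail copy targets buf + (num16d * 16), i.e. a byte offset into the allocation
    printf ("kernel blocks=%llu, tail bytes=%llu, tail offset=%llu\n", num16d, num16m, num16d * 16);

    return 0;
  }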
-    innerloop_left = i;
+int run_hip_kernel_memset (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, HIPdeviceptr buf, const u32 value, const u64 size)
+{
+  const u64 num16d = size / 16;
+  const u64 num16m = size % 16;
-    if (device_param->is_cuda == true)
-    {
-      if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_combs_c, device_param->combs_buf, innerloop_left * sizeof (pw_t)) == -1) return -1;
-    }
-
-    if (device_param->is_opencl == true)
-    {
-      if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_combs_c, CL_TRUE, 0, innerloop_left * sizeof (pw_t), device_param->combs_buf, 0, NULL, NULL) == -1) return -1;
-    }
-  }
-  else if (user_options->attack_mode == ATTACK_MODE_HYBRID1)
-  {
-    u64 off = innerloop_pos;
-
-    device_param->kernel_params_mp_buf64[3] = off;
-
-    if (run_kernel_mp (hashcat_ctx, device_param, KERN_RUN_MP, innerloop_left) == -1) return -1;
-
-    if (device_param->is_cuda == true)
-    {
-      if (hc_cuMemcpyDtoD (hashcat_ctx, device_param->cuda_d_combs_c, device_param->cuda_d_combs, innerloop_left * sizeof (pw_t)) == -1) return -1;
-    }
-
-    if (device_param->is_opencl == true)
-    {
-      if (hc_clEnqueueCopyBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_combs, device_param->opencl_d_combs_c, 0, 0, innerloop_left * sizeof (pw_t), 0, NULL, NULL) == -1) return -1;
-    }
-  }
-  }
-  }
-  else if (user_options_extra->attack_kern == ATTACK_KERN_BF)
-  {
-    u64 off = innerloop_pos;
-
-    device_param->kernel_params_mp_r_buf64[3] = off;
-
-    if (run_kernel_mp (hashcat_ctx, device_param, KERN_RUN_MP_R, innerloop_left) == -1) return -1;
+  if (num16d)
+  {
+    device_param->kernel_params_memset[0] = (void *) &buf;
+    device_param->kernel_params_memset_buf32[1] = value;
+    device_param->kernel_params_memset_buf64[2] = num16d;
-    if (device_param->is_cuda == true)
-    {
-      if (hc_cuMemcpyDtoD (hashcat_ctx, device_param->cuda_d_bfs_c, device_param->cuda_d_bfs, innerloop_left * sizeof (bf_t)) == -1) return -1;
-    }
+    const u64 kernel_threads = device_param->kernel_wgs_memset;
-    if (device_param->is_opencl == true)
-    {
-      if (hc_clEnqueueCopyBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_bfs, device_param->opencl_d_bfs_c, 0, 0, innerloop_left * sizeof (bf_t), 0, NULL, NULL) == -1) return -1;
-    }
-  }
-  }
+    u64 num_elements = num16d;
-  if (choose_kernel (hashcat_ctx, device_param, highest_pw_len, pws_cnt, fast_iteration, salt_pos) == -1) return -1;
+    num_elements = CEILDIV (num_elements, kernel_threads);
-  /**
-   * benchmark was aborted because too long kernel runtime (slow hashes only)
-   */
+    HIPfunction function = device_param->hip_function_memset;
-  if ((user_options->speed_only == true) && (device_param->speed_only_finish == true))
-  {
-    // nothing to do in that case
-  }
-  else
-  {
-    /**
-     * speed
-     */
+    //HIP_rc = hc_clSetKernelArg (hashcat_ctx, kernel, 0, sizeof (cl_mem), (void *) &buf); if (HIP_rc == -1) return -1;
+    //HIP_rc = hc_clSetKernelArg (hashcat_ctx, kernel, 1, sizeof (cl_uint), device_param->kernel_params_memset[1]); if (HIP_rc == -1) return -1;
+    //HIP_rc = hc_clSetKernelArg (hashcat_ctx, kernel, 2, sizeof (cl_ulong), device_param->kernel_params_memset[2]); if (HIP_rc == -1) return -1;
-    if (status_ctx->run_thread_level2 == true)
-    {
-      const u64 perf_sum_all = pws_cnt * innerloop_left;
+    //const size_t global_work_size[3] = { num_elements, 1, 1 };
+    //const size_t local_work_size[3] = { kernel_threads, 1, 1 };
-      const double speed_msec = hc_timer_get (device_param->timer_speed);
+    if (hc_hipLaunchKernel (hashcat_ctx, function, num_elements, 1, 1, kernel_threads, 1, 1, 0, device_param->hip_stream, device_param->kernel_params_memset, NULL) == -1) return -1;
-      hc_timer_set (&device_param->timer_speed);
+    if (hc_hipStreamSynchronize (hashcat_ctx, device_param->hip_stream) == -1) return -1;
+  }
-      u32 speed_pos = device_param->speed_pos;
+  if (num16m)
+  {
+    u32 tmp[4];
-      device_param->speed_cnt[speed_pos] = perf_sum_all;
+    tmp[0] = value;
+    tmp[1] = value;
+    tmp[2] = value;
+    tmp[3] = value;
-      device_param->speed_msec[speed_pos] = speed_msec;
+    // Apparently we are allowed to do this: https://devtalk.nvidia.com/default/topic/761515/how-to-copy-to-device-memory-with-offset-/
-      speed_pos++;
+    if (hc_hipMemcpyHtoD (hashcat_ctx, buf + (num16d * 16), tmp, num16m) == -1) return -1;
+  }
-      if (speed_pos == SPEED_CACHE)
-      {
-        speed_pos = 0;
-      }
+  return 0;
+}
-      device_param->speed_pos = speed_pos;
+int run_cuda_kernel_bzero (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, CUdeviceptr buf, const u64 size)
+{
+  return run_cuda_kernel_memset (hashcat_ctx, device_param, buf, 0, size);
+}
-      /**
-       * progress
-       */
+int run_hip_kernel_bzero (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, HIPdeviceptr buf, const u64 size)
+{
+  return run_hip_kernel_memset (hashcat_ctx, device_param, buf, 0, size);
+}
-      hc_thread_mutex_lock (status_ctx->mux_counter);
+int run_opencl_kernel_atinit (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, cl_mem buf, const u64 num)
+{
+  u64 num_elements = num;
-      status_ctx->words_progress_done[salt_pos] += perf_sum_all;
+  device_param->kernel_params_atinit_buf64[1] = num_elements;
-      hc_thread_mutex_unlock (status_ctx->mux_counter);
-    }
-  }
+  const u64 kernel_threads =
device_param->kernel_wgs_atinit; - /** - * benchmark, part2 - */ + num_elements = round_up_multiple_64 (num_elements, kernel_threads); - if (user_options->speed_only == true) - { - // let's abort this so that the user doesn't have to wait too long on the result - // for slow hashes it's fine anyway as boost mode should be turned on + cl_kernel kernel = device_param->opencl_kernel_atinit; - if (hashconfig->attack_exec == ATTACK_EXEC_OUTSIDE_KERNEL) - { - device_param->speed_only_finish = true; + const size_t global_work_size[3] = { num_elements, 1, 1 }; + const size_t local_work_size[3] = { kernel_threads, 1, 1 }; - break; - } + if (hc_clSetKernelArg (hashcat_ctx, kernel, 0, sizeof (cl_mem), (void *) &buf) == -1) return -1; - double total_msec = device_param->speed_msec[0]; + if (hc_clSetKernelArg (hashcat_ctx, kernel, 1, sizeof (cl_ulong), device_param->kernel_params_atinit[1]) == -1) return -1; - for (u32 speed_pos = 1; speed_pos < device_param->speed_pos; speed_pos++) - { - total_msec += device_param->speed_msec[speed_pos]; - } + if (hc_clEnqueueNDRangeKernel (hashcat_ctx, device_param->opencl_command_queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL) == -1) return -1; - if (user_options->slow_candidates == true) - { - if ((total_msec > 4000) || (device_param->speed_pos == SPEED_CACHE - 1)) - { - const u32 speed_pos = device_param->speed_pos; + if (hc_clFlush (hashcat_ctx, device_param->opencl_command_queue) == -1) return -1; - if (speed_pos) - { - device_param->speed_cnt[0] = device_param->speed_cnt[speed_pos - 1]; - device_param->speed_msec[0] = device_param->speed_msec[speed_pos - 1]; - } + if (hc_clFinish (hashcat_ctx, device_param->opencl_command_queue) == -1) return -1; - device_param->speed_pos = 0; + return 0; +} - device_param->speed_only_finish = true; +int run_opencl_kernel_memset (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, cl_mem buf, const u32 value, const u64 size) +{ + const u64 num16d = size / 16; + const u64 num16m = size % 16; - break; - } - } - else - { - // it's unclear if 4s is enough to turn on boost mode for all backend device + if (num16d) + { + device_param->kernel_params_memset_buf32[1] = value; + device_param->kernel_params_memset_buf64[2] = num16d; - if ((total_msec > 4000) || (device_param->speed_pos == SPEED_CACHE - 1)) - { - device_param->speed_only_finish = true; + const u64 kernel_threads = device_param->kernel_wgs_memset; - break; - } - } - } + u64 num_elements = num16d; - if (device_param->speed_only_finish == true) break; + num_elements = round_up_multiple_64 (num_elements, kernel_threads); - /** - * result - */ + cl_kernel kernel = device_param->opencl_kernel_memset; - check_cracked (hashcat_ctx, device_param, salt_pos); + if (hc_clSetKernelArg (hashcat_ctx, kernel, 0, sizeof (cl_mem), (void *) &buf) == -1) return -1; + if (hc_clSetKernelArg (hashcat_ctx, kernel, 1, sizeof (cl_uint), device_param->kernel_params_memset[1]) == -1) return -1; + if (hc_clSetKernelArg (hashcat_ctx, kernel, 2, sizeof (cl_ulong), device_param->kernel_params_memset[2]) == -1) return -1; - if (status_ctx->run_thread_level2 == false) break; - } + const size_t global_work_size[3] = { num_elements, 1, 1 }; + const size_t local_work_size[3] = { kernel_threads, 1, 1 }; - if (user_options->speed_only == true) break; + if (hc_clEnqueueNDRangeKernel (hashcat_ctx, device_param->opencl_command_queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL) == -1) return -1; - //status screen makes use of this, can't reset here - 
//device_param->innerloop_msec = 0; - //device_param->innerloop_pos = 0; - //device_param->innerloop_left = 0; + if (hc_clFlush (hashcat_ctx, device_param->opencl_command_queue) == -1) return -1; - if (status_ctx->run_thread_level2 == false) break; + if (hc_clFinish (hashcat_ctx, device_param->opencl_command_queue) == -1) return -1; } - //status screen makes use of this, can't reset here - //device_param->outerloop_msec = 0; - //device_param->outerloop_pos = 0; - //device_param->outerloop_left = 0; - - if (user_options->speed_only == true) + if (num16m) { - double total_msec = device_param->speed_msec[0]; - - for (u32 speed_pos = 1; speed_pos < device_param->speed_pos; speed_pos++) - { - total_msec += device_param->speed_msec[speed_pos]; - } + u32 tmp[4]; - device_param->outerloop_msec = total_msec * hashes->salts_cnt * device_param->outerloop_multi; + tmp[0] = value; + tmp[1] = value; + tmp[2] = value; + tmp[3] = value; - device_param->speed_only_finish = true; + if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, buf, CL_TRUE, num16d * 16, num16m, tmp, 0, NULL, NULL) == -1) return -1; } return 0; } -int backend_ctx_init (hashcat_ctx_t *hashcat_ctx) +int run_opencl_kernel_bzero (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, cl_mem buf, const u64 size) { - backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - user_options_t *user_options = hashcat_ctx->user_options; - - backend_ctx->enabled = false; - - if (user_options->example_hashes == true) return 0; - if (user_options->keyspace == true) return 0; - if (user_options->left == true) return 0; - if (user_options->show == true) return 0; - if (user_options->usage == true) return 0; - if (user_options->version == true) return 0; - - hc_device_param_t *devices_param = (hc_device_param_t *) hccalloc (DEVICES_MAX, sizeof (hc_device_param_t)); - - backend_ctx->devices_param = devices_param; + return run_opencl_kernel_memset (hashcat_ctx, device_param, buf, 0, size); +} - /** - * Load and map CUDA library calls, then init CUDA - */ +int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u32 kern_run, const u64 num, const u32 event_update, const u32 iteration) +{ + const hashconfig_t *hashconfig = hashcat_ctx->hashconfig; + const status_ctx_t *status_ctx = hashcat_ctx->status_ctx; - int rc_cuda_init = -1; + u64 kernel_threads = 0; + u64 dynamic_shared_mem = 0; - if (user_options->backend_ignore_cuda == false) + switch (kern_run) { - CUDA_PTR *cuda = (CUDA_PTR *) hcmalloc (sizeof (CUDA_PTR)); - - backend_ctx->cuda = cuda; - - rc_cuda_init = cuda_init (hashcat_ctx); + case KERN_RUN_1: + kernel_threads = device_param->kernel_wgs1; + dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size1; + break; + case KERN_RUN_12: + kernel_threads = device_param->kernel_wgs12; + dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size12; + break; + case KERN_RUN_2: + kernel_threads = device_param->kernel_wgs2; + dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size2; + break; + case KERN_RUN_2E: + kernel_threads = device_param->kernel_wgs2e; + dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size2e; + break; + case KERN_RUN_23: + kernel_threads = device_param->kernel_wgs23; + dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size23; + break; + case KERN_RUN_3: + kernel_threads = device_param->kernel_wgs3; + dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size3; + break; + case KERN_RUN_4: + kernel_threads = 
device_param->kernel_wgs4;
+      dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size4;
+      break;
+    case KERN_RUN_INIT2:
+      kernel_threads = device_param->kernel_wgs_init2;
+      dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size_init2;
+      break;
+    case KERN_RUN_LOOP2:
+      kernel_threads = device_param->kernel_wgs_loop2;
+      dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size_loop2;
+      break;
+    case KERN_RUN_AUX1:
+      kernel_threads = device_param->kernel_wgs_aux1;
+      dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size_aux1;
+      break;
+    case KERN_RUN_AUX2:
+      kernel_threads = device_param->kernel_wgs_aux2;
+      dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size_aux2;
+      break;
+    case KERN_RUN_AUX3:
+      kernel_threads = device_param->kernel_wgs_aux3;
+      dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size_aux3;
+      break;
+    case KERN_RUN_AUX4:
+      kernel_threads = device_param->kernel_wgs_aux4;
+      dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size_aux4;
+      break;
+  }
-    if (rc_cuda_init == -1)
+  if ((hashconfig->opts_type & OPTS_TYPE_DYNAMIC_SHARED) == 0)
+  {
+    dynamic_shared_mem = 0;
+  }
+
+  if (device_param->is_cuda == true)
+  {
+    if ((device_param->kernel_dynamic_local_mem_size_memset % device_param->device_local_mem_size) == 0)
    {
-      cuda_close (hashcat_ctx);
+      // this is the case for Compute Capability 7.5
+      // there is also Compute Capability 7.0 which offers a larger dynamic local size access
+      // however, if it's an exact multiple the driver can optimize this for us more efficiently
+
+      dynamic_shared_mem = 0;
    }
+  }
-    /**
-     * Load and map NVRTC library calls
-     */
+  if (device_param->is_hip == true)
+  {
+    if ((device_param->kernel_dynamic_local_mem_size_memset % device_param->device_local_mem_size) == 0)
+    {
+      dynamic_shared_mem = 0;
+    }
+  }
-    NVRTC_PTR *nvrtc = (NVRTC_PTR *) hcmalloc (sizeof (NVRTC_PTR));
+  kernel_threads = MIN (kernel_threads, device_param->kernel_threads);
-    backend_ctx->nvrtc = nvrtc;
+  device_param->kernel_params_buf64[34] = num;
-    int rc_nvrtc_init = nvrtc_init (hashcat_ctx);
+  u64 num_elements = num;
-    if (rc_nvrtc_init == -1)
+  if (device_param->is_cuda == true)
+  {
+    CUfunction cuda_function = NULL;
+
+    if (device_param->is_cuda == true)
    {
-      nvrtc_close (hashcat_ctx);
+      switch (kern_run)
+      {
+        case KERN_RUN_1:     cuda_function = device_param->cuda_function1;      break;
+        case KERN_RUN_12:    cuda_function = device_param->cuda_function12;     break;
+        case KERN_RUN_2:     cuda_function = device_param->cuda_function2;      break;
+        case KERN_RUN_2E:    cuda_function = device_param->cuda_function2e;     break;
+        case KERN_RUN_23:    cuda_function = device_param->cuda_function23;     break;
+        case KERN_RUN_3:     cuda_function = device_param->cuda_function3;      break;
+        case KERN_RUN_4:     cuda_function = device_param->cuda_function4;      break;
+        case KERN_RUN_INIT2: cuda_function = device_param->cuda_function_init2; break;
+        case KERN_RUN_LOOP2: cuda_function = device_param->cuda_function_loop2; break;
+        case KERN_RUN_AUX1:  cuda_function = device_param->cuda_function_aux1;  break;
+        case KERN_RUN_AUX2:  cuda_function = device_param->cuda_function_aux2;  break;
+        case KERN_RUN_AUX3:  cuda_function = device_param->cuda_function_aux3;  break;
+        case KERN_RUN_AUX4:  cuda_function = device_param->cuda_function_aux4;  break;
+      }
+
+      if (hc_cuFuncSetAttribute (hashcat_ctx, cuda_function, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, dynamic_shared_mem) == -1) return -1;
    }
-    /**
-     * Check if both CUDA and NVRTC were load successful
-     */
+    if (kernel_threads == 0) kernel_threads = 1;
-    if ((rc_cuda_init == 0) && (rc_nvrtc_init == 0))
-    {
-      // nvrtc version
+    num_elements = CEILDIV (num_elements, kernel_threads);
-      int nvrtc_major = 0;
-      int nvrtc_minor = 0;
+    if (kern_run == KERN_RUN_1)
+    {
+      if (hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_SIMD_INIT)
+      {
+        num_elements = CEILDIV (num_elements, device_param->vector_width);
+      }
+    }
+    else if (kern_run == KERN_RUN_2)
+    {
+      if (hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_SIMD_LOOP)
+      {
+        num_elements = CEILDIV (num_elements, device_param->vector_width);
+      }
+    }
+    else if (kern_run == KERN_RUN_3)
+    {
+      if (hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_SIMD_COMP)
+      {
+        num_elements = CEILDIV (num_elements, device_param->vector_width);
+      }
+    }
-      if (hc_nvrtcVersion (hashcat_ctx, &nvrtc_major, &nvrtc_minor) == -1) return -1;
+    if (hc_cuEventRecord (hashcat_ctx, device_param->cuda_event1, device_param->cuda_stream) == -1) return -1;
-      int nvrtc_driver_version = (nvrtc_major * 1000) + (nvrtc_minor * 10);
+    if (hc_cuLaunchKernel (hashcat_ctx, cuda_function, num_elements, 1, 1, kernel_threads, 1, 1, dynamic_shared_mem, device_param->cuda_stream, device_param->kernel_params, NULL) == -1) return -1;
-      backend_ctx->nvrtc_driver_version = nvrtc_driver_version;
+    if (hc_cuEventRecord (hashcat_ctx, device_param->cuda_event2, device_param->cuda_stream) == -1) return -1;
-      if (nvrtc_driver_version < 9000)
-      {
-        event_log_error (hashcat_ctx, "Outdated NVIDIA NVRTC driver version '%d' detected!", nvrtc_driver_version);
+    if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1;
-        event_log_warning (hashcat_ctx, "See hashcat.net for officially supported NVIDIA CUDA Toolkit versions.");
-        event_log_warning (hashcat_ctx, NULL);
+    if (hc_cuEventSynchronize (hashcat_ctx, device_param->cuda_event2) == -1) return -1;
-        return -1;
-      }
+    float exec_ms;
-      // cuda version
+    if (hc_cuEventElapsedTime (hashcat_ctx, &exec_ms, device_param->cuda_event1, device_param->cuda_event2) == -1) return -1;
-      int cuda_driver_version = 0;
+    if (event_update)
+    {
+      u32 exec_pos = device_param->exec_pos;
-      if (hc_cuDriverGetVersion (hashcat_ctx, &cuda_driver_version) == -1) return -1;
+      device_param->exec_msec[exec_pos] = exec_ms;
-      backend_ctx->cuda_driver_version = cuda_driver_version;
+      exec_pos++;
-      if (cuda_driver_version < 9000)
+      if (exec_pos == EXEC_CACHE)
      {
-        event_log_error (hashcat_ctx, "Outdated NVIDIA CUDA driver version '%d' detected!", cuda_driver_version);
-
-        event_log_warning (hashcat_ctx, "See hashcat.net for officially supported NVIDIA CUDA Toolkit versions.");
-        event_log_warning (hashcat_ctx, NULL);
-
-        return -1;
+        exec_pos = 0;
      }
-    }
-    else
-    {
-      rc_cuda_init = -1;
-      rc_nvrtc_init = -1;
+      device_param->exec_pos = exec_pos;
    }
-    cuda_close (hashcat_ctx);
-    nvrtc_close (hashcat_ctx);
+  }
-  /**
-   * Load and map OpenCL library calls
-   */
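The HIP branch that follows mirrors this CUDA flow one-to-one: record an event on the stream before and after the launch, synchronize, and push the elapsed milliseconds into the fixed-size exec_msec ring buffer. A minimal standalone sketch of that ring-buffer update; the struct and the EXEC_CACHE value are assumed stand-ins, not the real definitions:

#define EXEC_CACHE 128 // assumed depth; the logic only needs it to be a constant

typedef struct
{
  double   exec_msec[EXEC_CACHE]; // most recent kernel runtimes in milliseconds
  unsigned exec_pos;              // next slot to overwrite

} exec_ring_t;

static void exec_ring_push (exec_ring_t *r, const double ms)
{
  r->exec_msec[r->exec_pos] = ms;

  r->exec_pos++;

  if (r->exec_pos == EXEC_CACHE) r->exec_pos = 0; // wrap around; old samples age out
}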
+  /*
+   * HIP
+   */
+
+  if (device_param->is_hip == true)
+  {
+    HIPfunction hip_function = NULL;
-  int rc_ocl_init = -1;
+    if (device_param->is_hip == true)
+    {
+      switch (kern_run)
+      {
+        case KERN_RUN_1:  hip_function = device_param->hip_function1;  break;
+        case KERN_RUN_12: hip_function = device_param->hip_function12; break;
+        case KERN_RUN_2:  hip_function = device_param->hip_function2;  break;
+        case KERN_RUN_2E: hip_function = device_param->hip_function2e; break;
+        case KERN_RUN_23: hip_function = device_param->hip_function23; break;
+        case KERN_RUN_3:  hip_function = device_param->hip_function3;  break;
+        case KERN_RUN_4:  hip_function = device_param->hip_function4;  break;
case KERN_RUN_INIT2: hip_function = device_param->hip_function_init2; break; + case KERN_RUN_LOOP2: hip_function = device_param->hip_function_loop2; break; + case KERN_RUN_AUX1: hip_function = device_param->hip_function_aux1; break; + case KERN_RUN_AUX2: hip_function = device_param->hip_function_aux2; break; + case KERN_RUN_AUX3: hip_function = device_param->hip_function_aux3; break; + case KERN_RUN_AUX4: hip_function = device_param->hip_function_aux4; break; + } - if (user_options->backend_ignore_opencl == false) - { - OCL_PTR *ocl = (OCL_PTR *) hcmalloc (sizeof (OCL_PTR)); + if (hc_hipFuncSetAttribute (hashcat_ctx, hip_function, HIP_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, dynamic_shared_mem) == -1) return -1; + } - backend_ctx->ocl = ocl; + if (kernel_threads == 0) kernel_threads = 1; - rc_ocl_init = ocl_init (hashcat_ctx); + num_elements = CEILDIV (num_elements, kernel_threads); - if (rc_ocl_init == -1) + if (kern_run == KERN_RUN_1) { - ocl_close (hashcat_ctx); + if (hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_SIMD_INIT) + { + num_elements = CEILDIV (num_elements, device_param->vector_width); + } } - - /** - * return if both CUDA and OpenCL initialization failed - */ - - if ((rc_cuda_init == -1) && (rc_ocl_init == -1)) + else if (kern_run == KERN_RUN_2) + { + if (hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_SIMD_LOOP) + { + num_elements = CEILDIV (num_elements, device_param->vector_width); + } + } + else if (kern_run == KERN_RUN_3) { - event_log_error (hashcat_ctx, "ATTENTION! No OpenCL or CUDA installation found."); + if (hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_SIMD_COMP) + { + num_elements = CEILDIV (num_elements, device_param->vector_width); + } + } - event_log_warning (hashcat_ctx, "You are probably missing the CUDA or OpenCL runtime installation."); - event_log_warning (hashcat_ctx, NULL); + if (hc_hipEventRecord (hashcat_ctx, device_param->hip_event1, device_param->hip_stream) == -1) return -1; - #if defined (__linux__) - event_log_warning (hashcat_ctx, "* AMD GPUs on Linux require this driver:"); - event_log_warning (hashcat_ctx, " \"RadeonOpenCompute (ROCm)\" Software Platform (3.1 or later)"); - #elif defined (_WIN) - event_log_warning (hashcat_ctx, "* AMD GPUs on Windows require this driver:"); - event_log_warning (hashcat_ctx, " \"AMD Radeon Adrenalin 2020 Edition\" (20.2.2 or later)"); - #endif + if (hc_hipLaunchKernel (hashcat_ctx, hip_function, num_elements, 1, 1, kernel_threads, 1, 1, dynamic_shared_mem, device_param->hip_stream, device_param->kernel_params, NULL) == -1) return -1; - event_log_warning (hashcat_ctx, "* Intel CPUs require this runtime:"); - event_log_warning (hashcat_ctx, " \"OpenCL Runtime for Intel Core and Intel Xeon Processors\" (16.1.1 or later)"); + if (hc_hipEventRecord (hashcat_ctx, device_param->hip_event2, device_param->hip_stream) == -1) return -1; - event_log_warning (hashcat_ctx, "* NVIDIA GPUs require this runtime and/or driver (both):"); - event_log_warning (hashcat_ctx, " \"NVIDIA Driver\" (440.64 or later)"); - event_log_warning (hashcat_ctx, " \"CUDA Toolkit\" (9.0 or later)"); - event_log_warning (hashcat_ctx, NULL); + if (hc_hipStreamSynchronize (hashcat_ctx, device_param->hip_stream) == -1) return -1; - return -1; - } + if (hc_hipEventSynchronize (hashcat_ctx, device_param->hip_event2) == -1) return -1; - /** - * Some permission pre-check, because AMDGPU-PRO Driver crashes if the user has no permission to do this - */ + float exec_ms; - if (ocl_check_dri (hashcat_ctx) == -1) return -1; - } + if (hc_hipEventElapsedTime 
(hashcat_ctx, &exec_ms, device_param->hip_event1, device_param->hip_event2) == -1) return -1; - /** - * Backend device selection - */ + if (event_update) + { + u32 exec_pos = device_param->exec_pos; - u64 backend_devices_filter; + device_param->exec_msec[exec_pos] = exec_ms; - if (setup_backend_devices_filter (hashcat_ctx, user_options->backend_devices, &backend_devices_filter) == false) return -1; - - backend_ctx->backend_devices_filter = backend_devices_filter; + exec_pos++; - /** - * OpenCL device type selection - */ + if (exec_pos == EXEC_CACHE) + { + exec_pos = 0; + } - cl_device_type opencl_device_types_filter; + device_param->exec_pos = exec_pos; + } + } - if (setup_opencl_device_types_filter (hashcat_ctx, user_options->opencl_device_types, &opencl_device_types_filter) == false) return -1; + /* + * OCL + */ + if (device_param->is_opencl == true) + { + cl_kernel opencl_kernel = NULL; - backend_ctx->opencl_device_types_filter = opencl_device_types_filter; + if (device_param->is_opencl == true) + { + switch (kern_run) + { + case KERN_RUN_1: opencl_kernel = device_param->opencl_kernel1; break; + case KERN_RUN_12: opencl_kernel = device_param->opencl_kernel12; break; + case KERN_RUN_2: opencl_kernel = device_param->opencl_kernel2; break; + case KERN_RUN_2E: opencl_kernel = device_param->opencl_kernel2e; break; + case KERN_RUN_23: opencl_kernel = device_param->opencl_kernel23; break; + case KERN_RUN_3: opencl_kernel = device_param->opencl_kernel3; break; + case KERN_RUN_4: opencl_kernel = device_param->opencl_kernel4; break; + case KERN_RUN_INIT2: opencl_kernel = device_param->opencl_kernel_init2; break; + case KERN_RUN_LOOP2: opencl_kernel = device_param->opencl_kernel_loop2; break; + case KERN_RUN_AUX1: opencl_kernel = device_param->opencl_kernel_aux1; break; + case KERN_RUN_AUX2: opencl_kernel = device_param->opencl_kernel_aux2; break; + case KERN_RUN_AUX3: opencl_kernel = device_param->opencl_kernel_aux3; break; + case KERN_RUN_AUX4: opencl_kernel = device_param->opencl_kernel_aux4; break; + } + } - /** - * CUDA API: init - */ + for (u32 i = 0; i <= 23; i++) + { + if (hc_clSetKernelArg (hashcat_ctx, opencl_kernel, i, sizeof (cl_mem), device_param->kernel_params[i]) == -1) return -1; + } - if (backend_ctx->cuda) - { - if (hc_cuInit (hashcat_ctx, 0) == -1) + for (u32 i = 24; i <= 33; i++) { - cuda_close (hashcat_ctx); + if (hc_clSetKernelArg (hashcat_ctx, opencl_kernel, i, sizeof (cl_uint), device_param->kernel_params[i]) == -1) return -1; } - } - /** - * OpenCL API: init - */ + for (u32 i = 34; i <= 34; i++) + { + if (hc_clSetKernelArg (hashcat_ctx, opencl_kernel, i, sizeof (cl_ulong), device_param->kernel_params[i]) == -1) return -1; + } - if (backend_ctx->ocl) - { - #define FREE_OPENCL_CTX_ON_ERROR \ - do { \ - hcfree (opencl_platforms); \ - hcfree (opencl_platforms_devices); \ - hcfree (opencl_platforms_devices_cnt); \ - hcfree (opencl_platforms_name); \ - hcfree (opencl_platforms_vendor); \ - hcfree (opencl_platforms_vendor_id); \ - hcfree (opencl_platforms_version); \ - } while (0) + num_elements = round_up_multiple_64 (num_elements, kernel_threads); - cl_platform_id *opencl_platforms = (cl_platform_id *) hccalloc (CL_PLATFORMS_MAX, sizeof (cl_platform_id)); - cl_uint opencl_platforms_cnt = 0; - cl_device_id **opencl_platforms_devices = (cl_device_id **) hccalloc (CL_PLATFORMS_MAX, sizeof (cl_device_id *)); - cl_uint *opencl_platforms_devices_cnt = (cl_uint *) hccalloc (CL_PLATFORMS_MAX, sizeof (cl_uint)); - char **opencl_platforms_name = (char **) hccalloc (CL_PLATFORMS_MAX, 
sizeof (char *)); - char **opencl_platforms_vendor = (char **) hccalloc (CL_PLATFORMS_MAX, sizeof (char *)); - cl_uint *opencl_platforms_vendor_id = (cl_uint *) hccalloc (CL_PLATFORMS_MAX, sizeof (cl_uint)); - char **opencl_platforms_version = (char **) hccalloc (CL_PLATFORMS_MAX, sizeof (char *)); + cl_event opencl_event; - if (hc_clGetPlatformIDs (hashcat_ctx, CL_PLATFORMS_MAX, opencl_platforms, &opencl_platforms_cnt) == -1) + if (kern_run == KERN_RUN_1) { - opencl_platforms_cnt = 0; - - FREE_OPENCL_CTX_ON_ERROR; - - ocl_close (hashcat_ctx); + if (hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_SIMD_INIT) + { + num_elements = CEILDIV (num_elements, device_param->vector_width); + } } - - if (opencl_platforms_cnt) + else if (kern_run == KERN_RUN_2) { - for (u32 opencl_platforms_idx = 0; opencl_platforms_idx < opencl_platforms_cnt; opencl_platforms_idx++) + if (hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_SIMD_LOOP) { - cl_platform_id opencl_platform = opencl_platforms[opencl_platforms_idx]; + num_elements = CEILDIV (num_elements, device_param->vector_width); + } + } + else if (kern_run == KERN_RUN_3) + { + if (hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_SIMD_COMP) + { + num_elements = CEILDIV (num_elements, device_param->vector_width); + } + } - size_t param_value_size = 0; + num_elements = round_up_multiple_64 (num_elements, kernel_threads); - // platform vendor + const size_t global_work_size[3] = { num_elements, 1, 1 }; + const size_t local_work_size[3] = { kernel_threads, 1, 1 }; - if (hc_clGetPlatformInfo (hashcat_ctx, opencl_platform, CL_PLATFORM_VENDOR, 0, NULL, ¶m_value_size) == -1) return -1; + if (hc_clEnqueueNDRangeKernel (hashcat_ctx, device_param->opencl_command_queue, opencl_kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, &opencl_event) == -1) return -1; - char *opencl_platform_vendor = (char *) hcmalloc (param_value_size); + if (hc_clFlush (hashcat_ctx, device_param->opencl_command_queue) == -1) return -1; - if (hc_clGetPlatformInfo (hashcat_ctx, opencl_platform, CL_PLATFORM_VENDOR, param_value_size, opencl_platform_vendor, NULL) == -1) return -1; + // spin damper section - opencl_platforms_vendor[opencl_platforms_idx] = opencl_platform_vendor; + const u32 iterationm = iteration % EXPECTED_ITERATIONS; - // platform name + if (device_param->spin_damp > 0) + { + cl_int opencl_event_status; - if (hc_clGetPlatformInfo (hashcat_ctx, opencl_platform, CL_PLATFORM_NAME, 0, NULL, ¶m_value_size) == -1) return -1; + size_t param_value_size_ret; - char *opencl_platform_name = (char *) hcmalloc (param_value_size); + if (hc_clGetEventInfo (hashcat_ctx, opencl_event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof (opencl_event_status), &opencl_event_status, ¶m_value_size_ret) == -1) return -1; - if (hc_clGetPlatformInfo (hashcat_ctx, opencl_platform, CL_PLATFORM_NAME, param_value_size, opencl_platform_name, NULL) == -1) return -1; + double spin_total = device_param->spin_damp; - opencl_platforms_name[opencl_platforms_idx] = opencl_platform_name; + while (opencl_event_status != CL_COMPLETE) + { + if (status_ctx->devices_status == STATUS_RUNNING) + { + switch (kern_run) + { + case KERN_RUN_1: if (device_param->exec_us_prev1[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev1[iterationm] * device_param->spin_damp)); break; + case KERN_RUN_2: if (device_param->exec_us_prev2[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev2[iterationm] * device_param->spin_damp)); break; + case KERN_RUN_2E: if (device_param->exec_us_prev2e[iterationm] > 0) usleep ((useconds_t) 
(device_param->exec_us_prev2e[iterationm] * device_param->spin_damp)); break; + case KERN_RUN_3: if (device_param->exec_us_prev3[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev3[iterationm] * device_param->spin_damp)); break; + case KERN_RUN_4: if (device_param->exec_us_prev4[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev4[iterationm] * device_param->spin_damp)); break; + case KERN_RUN_INIT2: if (device_param->exec_us_prev_init2[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_init2[iterationm] * device_param->spin_damp)); break; + case KERN_RUN_LOOP2: if (device_param->exec_us_prev_loop2[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_loop2[iterationm] * device_param->spin_damp)); break; + case KERN_RUN_AUX1: if (device_param->exec_us_prev_aux1[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_aux1[iterationm] * device_param->spin_damp)); break; + case KERN_RUN_AUX2: if (device_param->exec_us_prev_aux2[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_aux2[iterationm] * device_param->spin_damp)); break; + case KERN_RUN_AUX3: if (device_param->exec_us_prev_aux3[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_aux3[iterationm] * device_param->spin_damp)); break; + case KERN_RUN_AUX4: if (device_param->exec_us_prev_aux4[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_aux4[iterationm] * device_param->spin_damp)); break; + } + } + else + { + // we were told to be nice - // platform version + sleep (0); + } - if (hc_clGetPlatformInfo (hashcat_ctx, opencl_platform, CL_PLATFORM_VERSION, 0, NULL, ¶m_value_size) == -1) return -1; + if (hc_clGetEventInfo (hashcat_ctx, opencl_event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof (opencl_event_status), &opencl_event_status, ¶m_value_size_ret) == -1) return -1; - char *opencl_platform_version = (char *) hcmalloc (param_value_size); + spin_total += device_param->spin_damp; - if (hc_clGetPlatformInfo (hashcat_ctx, opencl_platform, CL_PLATFORM_VERSION, param_value_size, opencl_platform_version, NULL) == -1) return -1; + if (spin_total > 1) break; + } + } - opencl_platforms_version[opencl_platforms_idx] = opencl_platform_version; + if (hc_clWaitForEvents (hashcat_ctx, 1, &opencl_event) == -1) return -1; - // find our own platform vendor because pocl and mesa are pushing original vendor_id through opencl - // this causes trouble with vendor id based macros - // we'll assign generic to those without special optimization available + cl_ulong time_start; + cl_ulong time_end; - cl_uint opencl_platform_vendor_id = 0; + if (hc_clGetEventProfilingInfo (hashcat_ctx, opencl_event, CL_PROFILING_COMMAND_START, sizeof (time_start), &time_start, NULL) == -1) return -1; + if (hc_clGetEventProfilingInfo (hashcat_ctx, opencl_event, CL_PROFILING_COMMAND_END, sizeof (time_end), &time_end, NULL) == -1) return -1; - if (strcmp (opencl_platform_vendor, CL_VENDOR_AMD1) == 0) - { - opencl_platform_vendor_id = VENDOR_ID_AMD; - } - else if (strcmp (opencl_platform_vendor, CL_VENDOR_AMD2) == 0) - { - opencl_platform_vendor_id = VENDOR_ID_AMD; - } - else if (strcmp (opencl_platform_vendor, CL_VENDOR_AMD_USE_INTEL) == 0) - { - opencl_platform_vendor_id = VENDOR_ID_AMD_USE_INTEL; - } - else if (strcmp (opencl_platform_vendor, CL_VENDOR_APPLE) == 0) - { - opencl_platform_vendor_id = VENDOR_ID_APPLE; - } - else if (strcmp (opencl_platform_vendor, CL_VENDOR_INTEL_BEIGNET) == 0) - { - opencl_platform_vendor_id = VENDOR_ID_INTEL_BEIGNET; - } - else if (strcmp 
(opencl_platform_vendor, CL_VENDOR_INTEL_SDK) == 0) - { - opencl_platform_vendor_id = VENDOR_ID_INTEL_SDK; - } - else if (strcmp (opencl_platform_vendor, CL_VENDOR_MESA) == 0) - { - opencl_platform_vendor_id = VENDOR_ID_MESA; - } - else if (strcmp (opencl_platform_vendor, CL_VENDOR_NV) == 0) - { - opencl_platform_vendor_id = VENDOR_ID_NV; - } - else if (strcmp (opencl_platform_vendor, CL_VENDOR_POCL) == 0) - { - opencl_platform_vendor_id = VENDOR_ID_POCL; - } - else + const double exec_us = (double) (time_end - time_start) / 1000; + + if (device_param->spin_damp > 0) + { + if (status_ctx->devices_status == STATUS_RUNNING) + { + switch (kern_run) { - opencl_platform_vendor_id = VENDOR_ID_GENERIC; + case KERN_RUN_1: device_param->exec_us_prev1[iterationm] = exec_us; break; + case KERN_RUN_2: device_param->exec_us_prev2[iterationm] = exec_us; break; + case KERN_RUN_2E: device_param->exec_us_prev2e[iterationm] = exec_us; break; + case KERN_RUN_3: device_param->exec_us_prev3[iterationm] = exec_us; break; + case KERN_RUN_4: device_param->exec_us_prev4[iterationm] = exec_us; break; + case KERN_RUN_INIT2: device_param->exec_us_prev_init2[iterationm] = exec_us; break; + case KERN_RUN_LOOP2: device_param->exec_us_prev_loop2[iterationm] = exec_us; break; + case KERN_RUN_AUX1: device_param->exec_us_prev_aux1[iterationm] = exec_us; break; + case KERN_RUN_AUX2: device_param->exec_us_prev_aux2[iterationm] = exec_us; break; + case KERN_RUN_AUX3: device_param->exec_us_prev_aux3[iterationm] = exec_us; break; + case KERN_RUN_AUX4: device_param->exec_us_prev_aux4[iterationm] = exec_us; break; } + } + } - opencl_platforms_vendor_id[opencl_platforms_idx] = opencl_platform_vendor_id; + if (event_update) + { + u32 exec_pos = device_param->exec_pos; - cl_device_id *opencl_platform_devices = (cl_device_id *) hccalloc (DEVICES_MAX, sizeof (cl_device_id)); + device_param->exec_msec[exec_pos] = exec_us / 1000; - cl_uint opencl_platform_devices_cnt = 0; + exec_pos++; - const int CL_rc = hc_clGetDeviceIDs (hashcat_ctx, opencl_platform, CL_DEVICE_TYPE_ALL, DEVICES_MAX, opencl_platform_devices, &opencl_platform_devices_cnt); + if (exec_pos == EXEC_CACHE) + { + exec_pos = 0; + } - if (CL_rc == -1) - { - event_log_error (hashcat_ctx, "clGetDeviceIDs(): %s", val2cstr_cl (CL_rc)); + device_param->exec_pos = exec_pos; + } - // Special handling for CL_DEVICE_NOT_FOUND, see: https://github.com/hashcat/hashcat/issues/2455 + if (hc_clReleaseEvent (hashcat_ctx, opencl_event) == -1) return -1; - #define IGNORE_DEVICE_NOT_FOUND 1 + if (hc_clFinish (hashcat_ctx, device_param->opencl_command_queue) == -1) return -1; + } - if (IGNORE_DEVICE_NOT_FOUND) - { - backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; + return 0; +} - OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl; +int run_kernel_mp (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u32 kern_run, const u64 num) +{ + u64 kernel_threads = 0; - const cl_int CL_err = ocl->clGetDeviceIDs (opencl_platform, CL_DEVICE_TYPE_ALL, DEVICES_MAX, opencl_platform_devices, &opencl_platform_devices_cnt); + switch (kern_run) + { + case KERN_RUN_MP: kernel_threads = device_param->kernel_wgs_mp; break; + case KERN_RUN_MP_R: kernel_threads = device_param->kernel_wgs_mp_r; break; + case KERN_RUN_MP_L: kernel_threads = device_param->kernel_wgs_mp_l; break; + } - if (CL_err == CL_DEVICE_NOT_FOUND) - { - // we ignore this error - } - else - { - return -1; - } - } - else - { - return -1; - } - } + u64 num_elements = num; - opencl_platforms_devices[opencl_platforms_idx] = 
opencl_platform_devices; + switch (kern_run) + { + case KERN_RUN_MP: device_param->kernel_params_mp_buf64[8] = num; break; + case KERN_RUN_MP_R: device_param->kernel_params_mp_r_buf64[8] = num; break; + case KERN_RUN_MP_L: device_param->kernel_params_mp_l_buf64[9] = num; break; + } - opencl_platforms_devices_cnt[opencl_platforms_idx] = opencl_platform_devices_cnt; - } + if (device_param->is_cuda == true) + { + CUfunction cuda_function = NULL; - if (user_options->opencl_device_types == NULL) - { - /** - * OpenCL device types: - * In case the user did not specify --opencl-device-types and the user runs hashcat in a system with only a CPU only he probably want to use that CPU. - */ + void **cuda_args = NULL; - cl_device_type opencl_device_types_all = 0; + switch (kern_run) + { + case KERN_RUN_MP: cuda_function = device_param->cuda_function_mp; + cuda_args = device_param->kernel_params_mp; + break; + case KERN_RUN_MP_R: cuda_function = device_param->cuda_function_mp_r; + cuda_args = device_param->kernel_params_mp_r; + break; + case KERN_RUN_MP_L: cuda_function = device_param->cuda_function_mp_l; + cuda_args = device_param->kernel_params_mp_l; + break; + } - for (u32 opencl_platforms_idx = 0; opencl_platforms_idx < opencl_platforms_cnt; opencl_platforms_idx++) - { - cl_device_id *opencl_platform_devices = opencl_platforms_devices[opencl_platforms_idx]; - cl_uint opencl_platform_devices_cnt = opencl_platforms_devices_cnt[opencl_platforms_idx]; + num_elements = CEILDIV (num_elements, kernel_threads); - for (u32 opencl_platform_devices_idx = 0; opencl_platform_devices_idx < opencl_platform_devices_cnt; opencl_platform_devices_idx++) - { - cl_device_id opencl_device = opencl_platform_devices[opencl_platform_devices_idx]; + if (hc_cuLaunchKernel (hashcat_ctx, cuda_function, num_elements, 1, 1, kernel_threads, 1, 1, 0, device_param->cuda_stream, cuda_args, NULL) == -1) return -1; - cl_device_type opencl_device_type; + if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1; + } - if (hc_clGetDeviceInfo (hashcat_ctx, opencl_device, CL_DEVICE_TYPE, sizeof (opencl_device_type), &opencl_device_type, NULL) == -1) - { - FREE_OPENCL_CTX_ON_ERROR; + /* + * HIP + */ + if (device_param->is_hip == true) + { + HIPfunction hip_function = NULL; - return -1; - } + void **hip_args = NULL; - opencl_device_types_all |= opencl_device_type; - } - } + switch (kern_run) + { + case KERN_RUN_MP: hip_function = device_param->hip_function_mp; + hip_args = device_param->kernel_params_mp; + break; + case KERN_RUN_MP_R: hip_function = device_param->hip_function_mp_r; + hip_args = device_param->kernel_params_mp_r; + break; + case KERN_RUN_MP_L: hip_function = device_param->hip_function_mp_l; + hip_args = device_param->kernel_params_mp_l; + break; + } - // In such a case, automatically enable CPU device type support, since it's disabled by default. 
+ num_elements = CEILDIV (num_elements, kernel_threads); - if ((opencl_device_types_all & (CL_DEVICE_TYPE_GPU | CL_DEVICE_TYPE_ACCELERATOR)) == 0) - { - opencl_device_types_filter |= CL_DEVICE_TYPE_CPU; - } + if (hc_hipLaunchKernel (hashcat_ctx, hip_function, num_elements, 1, 1, kernel_threads, 1, 1, 0, device_param->hip_stream, hip_args, NULL) == -1) return -1; - // In another case, when the user uses --stdout, using CPU devices is much faster to setup - // If we have a CPU device, force it to be used + if (hc_hipStreamSynchronize (hashcat_ctx, device_param->hip_stream) == -1) return -1; + } - if (user_options->stdout_flag == true) - { - if (opencl_device_types_all & CL_DEVICE_TYPE_CPU) - { - opencl_device_types_filter = CL_DEVICE_TYPE_CPU; - } - } + /* + * OCL + */ + if (device_param->is_opencl == true) + { + cl_kernel opencl_kernel = NULL; - backend_ctx->opencl_device_types_filter = opencl_device_types_filter; - } + switch (kern_run) + { + case KERN_RUN_MP: opencl_kernel = device_param->opencl_kernel_mp; break; + case KERN_RUN_MP_R: opencl_kernel = device_param->opencl_kernel_mp_r; break; + case KERN_RUN_MP_L: opencl_kernel = device_param->opencl_kernel_mp_l; break; } - backend_ctx->opencl_platforms = opencl_platforms; - backend_ctx->opencl_platforms_cnt = opencl_platforms_cnt; - backend_ctx->opencl_platforms_devices = opencl_platforms_devices; - backend_ctx->opencl_platforms_devices_cnt = opencl_platforms_devices_cnt; - backend_ctx->opencl_platforms_name = opencl_platforms_name; - backend_ctx->opencl_platforms_vendor = opencl_platforms_vendor; - backend_ctx->opencl_platforms_vendor_id = opencl_platforms_vendor_id; - backend_ctx->opencl_platforms_version = opencl_platforms_version; - - #undef FREE_OPENCL_CTX_ON_ERROR - } - - /** - * Final checks - */ - - if ((backend_ctx->cuda == NULL) && (backend_ctx->ocl == NULL)) - { - event_log_error (hashcat_ctx, "ATTENTION! 
No OpenCL-compatible or CUDA-compatible platform found."); + switch (kern_run) + { + case KERN_RUN_MP: if (hc_clSetKernelArg (hashcat_ctx, opencl_kernel, 3, sizeof (cl_ulong), device_param->kernel_params_mp[3]) == -1) return -1; + if (hc_clSetKernelArg (hashcat_ctx, opencl_kernel, 4, sizeof (cl_uint), device_param->kernel_params_mp[4]) == -1) return -1; + if (hc_clSetKernelArg (hashcat_ctx, opencl_kernel, 5, sizeof (cl_uint), device_param->kernel_params_mp[5]) == -1) return -1; + if (hc_clSetKernelArg (hashcat_ctx, opencl_kernel, 6, sizeof (cl_uint), device_param->kernel_params_mp[6]) == -1) return -1; + if (hc_clSetKernelArg (hashcat_ctx, opencl_kernel, 7, sizeof (cl_uint), device_param->kernel_params_mp[7]) == -1) return -1; + if (hc_clSetKernelArg (hashcat_ctx, opencl_kernel, 8, sizeof (cl_ulong), device_param->kernel_params_mp[8]) == -1) return -1; + break; + case KERN_RUN_MP_R: if (hc_clSetKernelArg (hashcat_ctx, opencl_kernel, 3, sizeof (cl_ulong), device_param->kernel_params_mp_r[3]) == -1) return -1; + if (hc_clSetKernelArg (hashcat_ctx, opencl_kernel, 4, sizeof (cl_uint), device_param->kernel_params_mp_r[4]) == -1) return -1; + if (hc_clSetKernelArg (hashcat_ctx, opencl_kernel, 5, sizeof (cl_uint), device_param->kernel_params_mp_r[5]) == -1) return -1; + if (hc_clSetKernelArg (hashcat_ctx, opencl_kernel, 6, sizeof (cl_uint), device_param->kernel_params_mp_r[6]) == -1) return -1; + if (hc_clSetKernelArg (hashcat_ctx, opencl_kernel, 7, sizeof (cl_uint), device_param->kernel_params_mp_r[7]) == -1) return -1; + if (hc_clSetKernelArg (hashcat_ctx, opencl_kernel, 8, sizeof (cl_ulong), device_param->kernel_params_mp_r[8]) == -1) return -1; + break; + case KERN_RUN_MP_L: if (hc_clSetKernelArg (hashcat_ctx, opencl_kernel, 3, sizeof (cl_ulong), device_param->kernel_params_mp_l[3]) == -1) return -1; + if (hc_clSetKernelArg (hashcat_ctx, opencl_kernel, 4, sizeof (cl_uint), device_param->kernel_params_mp_l[4]) == -1) return -1; + if (hc_clSetKernelArg (hashcat_ctx, opencl_kernel, 5, sizeof (cl_uint), device_param->kernel_params_mp_l[5]) == -1) return -1; + if (hc_clSetKernelArg (hashcat_ctx, opencl_kernel, 6, sizeof (cl_uint), device_param->kernel_params_mp_l[6]) == -1) return -1; + if (hc_clSetKernelArg (hashcat_ctx, opencl_kernel, 7, sizeof (cl_uint), device_param->kernel_params_mp_l[7]) == -1) return -1; + if (hc_clSetKernelArg (hashcat_ctx, opencl_kernel, 8, sizeof (cl_uint), device_param->kernel_params_mp_l[8]) == -1) return -1; + if (hc_clSetKernelArg (hashcat_ctx, opencl_kernel, 9, sizeof (cl_ulong), device_param->kernel_params_mp_l[9]) == -1) return -1; + break; + } - event_log_warning (hashcat_ctx, "You are probably missing the OpenCL or CUDA runtime installation."); - event_log_warning (hashcat_ctx, NULL); + num_elements = round_up_multiple_64 (num_elements, kernel_threads); - #if defined (__linux__) - event_log_warning (hashcat_ctx, "* AMD GPUs on Linux require this driver:"); - event_log_warning (hashcat_ctx, " \"RadeonOpenCompute (ROCm)\" Software Platform (3.1 or later)"); - #elif defined (_WIN) - event_log_warning (hashcat_ctx, "* AMD GPUs on Windows require this driver:"); - event_log_warning (hashcat_ctx, " \"AMD Radeon Adrenalin 2020 Edition\" (20.2.2 or later)"); - #endif + const size_t global_work_size[3] = { num_elements, 1, 1 }; + const size_t local_work_size[3] = { kernel_threads, 1, 1 }; - event_log_warning (hashcat_ctx, "* Intel CPUs require this runtime:"); - event_log_warning (hashcat_ctx, " \"OpenCL Runtime for Intel Core and Intel Xeon Processors\" (16.1.1 or 
later)"); + if (hc_clEnqueueNDRangeKernel (hashcat_ctx, device_param->opencl_command_queue, opencl_kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL) == -1) return -1; - event_log_warning (hashcat_ctx, "* NVIDIA GPUs require this runtime and/or driver (both):"); - event_log_warning (hashcat_ctx, " \"NVIDIA Driver\" (440.64 or later)"); - event_log_warning (hashcat_ctx, " \"CUDA Toolkit\" (9.0 or later)"); - event_log_warning (hashcat_ctx, NULL); + if (hc_clFlush (hashcat_ctx, device_param->opencl_command_queue) == -1) return -1; - return -1; + if (hc_clFinish (hashcat_ctx, device_param->opencl_command_queue) == -1) return -1; } - backend_ctx->enabled = true; - return 0; } -void backend_ctx_destroy (hashcat_ctx_t *hashcat_ctx) +int run_kernel_tm (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param) { - backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - - if (backend_ctx->enabled == false) return; + const u64 num_elements = 1024; // fixed - hcfree (backend_ctx->devices_param); + const u64 kernel_threads = MIN (num_elements, device_param->kernel_wgs_tm); - if (backend_ctx->ocl) + if (device_param->is_cuda == true) { - hcfree (backend_ctx->opencl_platforms); - hcfree (backend_ctx->opencl_platforms_devices); - hcfree (backend_ctx->opencl_platforms_devices_cnt); - hcfree (backend_ctx->opencl_platforms_name); - hcfree (backend_ctx->opencl_platforms_vendor); - hcfree (backend_ctx->opencl_platforms_vendor_id); - hcfree (backend_ctx->opencl_platforms_version); - } + CUfunction cuda_function = device_param->cuda_function_tm; - nvrtc_close (hashcat_ctx); - cuda_close (hashcat_ctx); - ocl_close (hashcat_ctx); + if (hc_cuLaunchKernel (hashcat_ctx, cuda_function, num_elements / kernel_threads, 1, 1, kernel_threads, 1, 1, 0, device_param->cuda_stream, device_param->kernel_params_tm, NULL) == -1) return -1; - memset (backend_ctx, 0, sizeof (backend_ctx_t)); -} + if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1; + } -int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) -{ - backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - user_options_t *user_options = hashcat_ctx->user_options; + if (device_param->is_hip == true) + { + HIPfunction hip_function = device_param->hip_function_tm; - if (backend_ctx->enabled == false) return 0; + if (hc_hipLaunchKernel (hashcat_ctx, hip_function, num_elements / kernel_threads, 1, 1, kernel_threads, 1, 1, 0, device_param->hip_stream, device_param->kernel_params_tm, NULL) == -1) return -1; - hc_device_param_t *devices_param = backend_ctx->devices_param; + if (hc_hipStreamSynchronize (hashcat_ctx, device_param->hip_stream) == -1) return -1; + } - bool need_adl = false; - bool need_nvml = false; - bool need_nvapi = false; - bool need_sysfs = false; + if (device_param->is_opencl == true) + { + cl_kernel cuda_kernel = device_param->opencl_kernel_tm; - int backend_devices_idx = 0; + const size_t global_work_size[3] = { num_elements, 1, 1 }; + const size_t local_work_size[3] = { kernel_threads, 1, 1 }; - int cuda_devices_cnt = 0; - int cuda_devices_active = 0; + if (hc_clEnqueueNDRangeKernel (hashcat_ctx, device_param->opencl_command_queue, cuda_kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL) == -1) return -1; - if (backend_ctx->cuda) - { - // device count + if (hc_clFlush (hashcat_ctx, device_param->opencl_command_queue) == -1) return -1; - if (hc_cuDeviceGetCount (hashcat_ctx, &cuda_devices_cnt) == -1) - { - cuda_close (hashcat_ctx); - } + if (hc_clFinish (hashcat_ctx, 
+int run_kernel_amp (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u64 num)
+{
+  device_param->kernel_params_amp_buf64[6] = num;
-    for (int cuda_devices_idx = 0; cuda_devices_idx < cuda_devices_cnt; cuda_devices_idx++, backend_devices_idx++)
-    {
-      const u32 device_id = backend_devices_idx;
+  u64 num_elements = num;
-      hc_device_param_t *device_param = &devices_param[backend_devices_idx];
+  const u64 kernel_threads = device_param->kernel_wgs_amp;
-      device_param->device_id = device_id;
-
-      backend_ctx->backend_device_from_cuda[cuda_devices_idx] = backend_devices_idx;
-
-      CUdevice cuda_device;
-
-      if (hc_cuDeviceGet (hashcat_ctx, &cuda_device, cuda_devices_idx) == -1) return -1;
-
-      device_param->cuda_device = cuda_device;
-
-      device_param->is_cuda = true;
-
-      device_param->is_opencl = false;
-
-      device_param->use_opencl12 = false;
-      device_param->use_opencl20 = false;
-      device_param->use_opencl21 = false;
-
-      // device_name
-
-      char *device_name = (char *) hcmalloc (HCBUFSIZ_TINY);
+  if (device_param->is_cuda == true)
+  {
+    num_elements = CEILDIV (num_elements, kernel_threads);
-      if (hc_cuDeviceGetName (hashcat_ctx, device_name, HCBUFSIZ_TINY, cuda_device) == -1) return -1;
+    CUfunction cuda_function = device_param->cuda_function_amp;
-      device_param->device_name = device_name;
+    if (hc_cuLaunchKernel (hashcat_ctx, cuda_function, num_elements, 1, 1, kernel_threads, 1, 1, 0, device_param->cuda_stream, device_param->kernel_params_amp, NULL) == -1) return -1;
-      hc_string_trim_leading (device_name);
+    if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1;
+  }
-      hc_string_trim_trailing (device_name);
+  if (device_param->is_hip == true)
+  {
+    num_elements = CEILDIV (num_elements, kernel_threads);
-      // device_processors
+    HIPfunction hip_function = device_param->hip_function_amp;
-      int device_processors = 0;
+    if (hc_hipLaunchKernel (hashcat_ctx, hip_function, num_elements, 1, 1, kernel_threads, 1, 1, 0, device_param->hip_stream, device_param->kernel_params_amp, NULL) == -1) return -1;
-      if (hc_cuDeviceGetAttribute (hashcat_ctx, &device_processors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, cuda_device) == -1) return -1;
+    if (hc_hipStreamSynchronize (hashcat_ctx, device_param->hip_stream) == -1) return -1;
+  }
-      device_param->device_processors = device_processors;
+  if (device_param->is_opencl == true)
+  {
+    num_elements = round_up_multiple_64 (num_elements, kernel_threads);
-      // device_global_mem, device_maxmem_alloc, device_available_mem
+    cl_kernel opencl_kernel = device_param->opencl_kernel_amp;
-      size_t bytes = 0;
+    if (hc_clSetKernelArg (hashcat_ctx, opencl_kernel, 6, sizeof (cl_ulong), device_param->kernel_params_amp[6]) == -1) return -1;
-      if (hc_cuDeviceTotalMem (hashcat_ctx, &bytes, cuda_device) == -1) return -1;
+    const size_t global_work_size[3] = { num_elements, 1, 1 };
+    const size_t local_work_size[3] = { kernel_threads, 1, 1 };
-      device_param->device_global_mem = (u64) bytes;
+    if (hc_clEnqueueNDRangeKernel (hashcat_ctx, device_param->opencl_command_queue, opencl_kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL) == -1) return -1;
-      device_param->device_maxmem_alloc = (u64) bytes;
+    if (hc_clFlush (hashcat_ctx, device_param->opencl_command_queue) == -1) return -1;
-      device_param->device_available_mem = 0;
+    if (hc_clFinish (hashcat_ctx, device_param->opencl_command_queue) ==
-1) return -1; + } - // warp size + return 0; +} - int cuda_warp_size = 0; +int run_kernel_decompress (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u64 num) +{ + device_param->kernel_params_decompress_buf64[3] = num; - if (hc_cuDeviceGetAttribute (hashcat_ctx, &cuda_warp_size, CU_DEVICE_ATTRIBUTE_WARP_SIZE, cuda_device) == -1) return -1; + u64 num_elements = num; - device_param->cuda_warp_size = cuda_warp_size; + const u64 kernel_threads = device_param->kernel_wgs_decompress; - // sm_minor, sm_major + if (device_param->is_cuda == true) + { + num_elements = CEILDIV (num_elements, kernel_threads); - int sm_major = 0; - int sm_minor = 0; + CUfunction cuda_function = device_param->cuda_function_decompress; - if (hc_cuDeviceGetAttribute (hashcat_ctx, &sm_major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuda_device) == -1) return -1; + if (hc_cuLaunchKernel (hashcat_ctx, cuda_function, num_elements, 1, 1, kernel_threads, 1, 1, 0, device_param->cuda_stream, device_param->kernel_params_decompress, NULL) == -1) return -1; - if (hc_cuDeviceGetAttribute (hashcat_ctx, &sm_minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuda_device) == -1) return -1; + if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1; + } - device_param->sm_major = sm_major; - device_param->sm_minor = sm_minor; + if (device_param->is_hip == true) + { + num_elements = CEILDIV (num_elements, kernel_threads); - // device_maxworkgroup_size + HIPfunction hip_function = device_param->hip_function_decompress; - int device_maxworkgroup_size = 0; + if (hc_hipLaunchKernel (hashcat_ctx, hip_function, num_elements, 1, 1, kernel_threads, 1, 1, 0, device_param->hip_stream, device_param->kernel_params_decompress, NULL) == -1) return -1; - if (hc_cuDeviceGetAttribute (hashcat_ctx, &device_maxworkgroup_size, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuda_device) == -1) return -1; + if (hc_hipStreamSynchronize (hashcat_ctx, device_param->hip_stream) == -1) return -1; + } - device_param->device_maxworkgroup_size = device_maxworkgroup_size; + if (device_param->is_opencl == true) + { + num_elements = round_up_multiple_64 (num_elements, kernel_threads); - // max_clock_frequency + cl_kernel opencl_kernel = device_param->opencl_kernel_decompress; - int device_maxclock_frequency = 0; + const size_t global_work_size[3] = { num_elements, 1, 1 }; + const size_t local_work_size[3] = { kernel_threads, 1, 1 }; - if (hc_cuDeviceGetAttribute (hashcat_ctx, &device_maxclock_frequency, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, cuda_device) == -1) return -1; + if (hc_clSetKernelArg (hashcat_ctx, opencl_kernel, 3, sizeof (cl_ulong), device_param->kernel_params_decompress[3]) == -1) return -1; - device_param->device_maxclock_frequency = device_maxclock_frequency / 1000; + if (hc_clEnqueueNDRangeKernel (hashcat_ctx, device_param->opencl_command_queue, opencl_kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL) == -1) return -1; - // pcie_bus, pcie_device, pcie_function + if (hc_clFlush (hashcat_ctx, device_param->opencl_command_queue) == -1) return -1; - int pci_domain_id_nv = 0; - int pci_bus_id_nv = 0; - int pci_slot_id_nv = 0; + if (hc_clFinish (hashcat_ctx, device_param->opencl_command_queue) == -1) return -1; + } - if (hc_cuDeviceGetAttribute (hashcat_ctx, &pci_domain_id_nv, CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, cuda_device) == -1) return -1; + return 0; +} - if (hc_cuDeviceGetAttribute (hashcat_ctx, &pci_bus_id_nv, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, cuda_device) == -1) return -1; +int run_copy 
(hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u64 pws_cnt) +{ + combinator_ctx_t *combinator_ctx = hashcat_ctx->combinator_ctx; + hashconfig_t *hashconfig = hashcat_ctx->hashconfig; + user_options_t *user_options = hashcat_ctx->user_options; + user_options_extra_t *user_options_extra = hashcat_ctx->user_options_extra; - if (hc_cuDeviceGetAttribute (hashcat_ctx, &pci_slot_id_nv, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, cuda_device) == -1) return -1; + // init speed timer - device_param->pcie_domain = (u8) (pci_domain_id_nv); - device_param->pcie_bus = (u8) (pci_bus_id_nv); - device_param->pcie_device = (u8) (pci_slot_id_nv >> 3); - device_param->pcie_function = (u8) (pci_slot_id_nv & 7); + #if defined (_WIN) + if (device_param->timer_speed.QuadPart == 0) + { + hc_timer_set (&device_param->timer_speed); + } + #else + if (device_param->timer_speed.tv_sec == 0) + { + hc_timer_set (&device_param->timer_speed); + } + #endif - // kernel_exec_timeout + if (user_options->slow_candidates == true) + { + if (device_param->is_cuda == true) + { + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_pws_idx, device_param->pws_idx, pws_cnt * sizeof (pw_idx_t)) == -1) return -1; - int kernel_exec_timeout = 0; + const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt; - if (hc_cuDeviceGetAttribute (hashcat_ctx, &kernel_exec_timeout, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, cuda_device) == -1) return -1; + const u32 off = pw_idx->off; - device_param->kernel_exec_timeout = kernel_exec_timeout; + if (off) + { + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_pws_comp_buf, device_param->pws_comp, off * sizeof (u32)) == -1) return -1; + } + } - // max_shared_memory_per_block + if (device_param->is_hip == true) + { + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_pws_idx, device_param->pws_idx, pws_cnt * sizeof (pw_idx_t)) == -1) return -1; - int max_shared_memory_per_block = 0; + const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt; - if (hc_cuDeviceGetAttribute (hashcat_ctx, &max_shared_memory_per_block, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, cuda_device) == -1) return -1; + const u32 off = pw_idx->off; - if (max_shared_memory_per_block < 32768) + if (off) { - event_log_error (hashcat_ctx, "* Device #%u: This device's shared buffer size is too small.", device_id + 1); - - device_param->skipped = true; + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_pws_comp_buf, device_param->pws_comp, off * sizeof (u32)) == -1) return -1; } + } - device_param->device_local_mem_size = max_shared_memory_per_block; - - // device_max_constant_buffer_size + if (device_param->is_opencl == true) + { + if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_idx, CL_TRUE, 0, pws_cnt * sizeof (pw_idx_t), device_param->pws_idx, 0, NULL, NULL) == -1) return -1; - int device_max_constant_buffer_size = 0; + const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt; - if (hc_cuDeviceGetAttribute (hashcat_ctx, &device_max_constant_buffer_size, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, cuda_device) == -1) return -1; + const u32 off = pw_idx->off; - if (device_max_constant_buffer_size < 65536) + if (off) { - event_log_error (hashcat_ctx, "* Device #%u: This device's local mem size is too small.", device_id + 1); - - device_param->skipped = true; + if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_comp_buf, CL_TRUE, 0, off * sizeof (u32), device_param->pws_comp, 0, NULL, NULL) == -1) 
return -1; } + } - // some attributes have to be hardcoded because they are used for instance in the build options - - device_param->device_local_mem_type = CL_LOCAL; - device_param->opencl_device_type = CL_DEVICE_TYPE_GPU; - device_param->opencl_device_vendor_id = VENDOR_ID_NV; - device_param->opencl_platform_vendor_id = VENDOR_ID_NV; + if (run_kernel_decompress (hashcat_ctx, device_param, pws_cnt) == -1) return -1; + } + else + { + if (user_options_extra->attack_kern == ATTACK_KERN_STRAIGHT) + { + if (device_param->is_cuda == true) + { + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_pws_idx, device_param->pws_idx, pws_cnt * sizeof (pw_idx_t)) == -1) return -1; - // or in the cached kernel checksum + const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt; - device_param->opencl_device_version = ""; - device_param->opencl_driver_version = ""; + const u32 off = pw_idx->off; - // or just to make sure they are not NULL + if (off) + { + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_pws_comp_buf, device_param->pws_comp, off * sizeof (u32)) == -1) return -1; + } + } - device_param->opencl_device_vendor = ""; - device_param->opencl_device_c_version = ""; + if (device_param->is_hip == true) + { + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_pws_idx, device_param->pws_idx, pws_cnt * sizeof (pw_idx_t)) == -1) return -1; - // skipped + const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt; - if ((backend_ctx->backend_devices_filter & (1ULL << device_id)) == 0) - { - device_param->skipped = true; - } + const u32 off = pw_idx->off; - if ((backend_ctx->opencl_device_types_filter & CL_DEVICE_TYPE_GPU) == 0) - { - device_param->skipped = true; + if (off) + { + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_pws_comp_buf, device_param->pws_comp, off * sizeof (u32)) == -1) return -1; + } } - if ((device_param->opencl_platform_vendor_id == VENDOR_ID_NV) && (device_param->opencl_device_vendor_id == VENDOR_ID_NV)) + if (device_param->is_opencl == true) { - need_nvml = true; - - #if defined (_WIN) || defined (__CYGWIN__) - need_nvapi = true; - #endif - } + if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_idx, CL_TRUE, 0, pws_cnt * sizeof (pw_idx_t), device_param->pws_idx, 0, NULL, NULL) == -1) return -1; - // CPU burning loop damper - // Value is given as number between 0-100 - // By default 8% - // in theory not needed with CUDA + const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt; - device_param->spin_damp = (double) user_options->spin_damp / 100; + const u32 off = pw_idx->off; - // common driver check + if (off) + { + if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_comp_buf, CL_TRUE, 0, off * sizeof (u32), device_param->pws_comp, 0, NULL, NULL) == -1) return -1; + } + } - if (device_param->skipped == false) + if (run_kernel_decompress (hashcat_ctx, device_param, pws_cnt) == -1) return -1; + } + else if (user_options_extra->attack_kern == ATTACK_KERN_COMBI) + { + if (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL) { - if ((user_options->force == false) && (user_options->backend_info == false)) + if (user_options->attack_mode == ATTACK_MODE_COMBI) { - // CUDA does not support query nvidia driver version, therefore no driver checks here - // IF needed, could be retrieved using nvmlSystemGetDriverVersion() - - if (device_param->sm_major < 5) + if (combinator_ctx->combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - if (user_options->quiet == false) 
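
The copy-then-decompress sequence recurs below for every attack path; only the buffer handles change per backend. A hypothetical consolidation, not part of this patch (the helper name is invented; the wrappers and fields are the ones visible in this hunk):

    static int upload_pws_sketch (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *d, const u64 pws_cnt)
    {
      const u32 off = d->pws_idx[pws_cnt].off; // total compressed u32 words

      if (d->is_cuda == true)
      {
        if (hc_cuMemcpyHtoD (hashcat_ctx, d->cuda_d_pws_idx, d->pws_idx, pws_cnt * sizeof (pw_idx_t)) == -1) return -1;

        if (off) if (hc_cuMemcpyHtoD (hashcat_ctx, d->cuda_d_pws_comp_buf, d->pws_comp, off * sizeof (u32)) == -1) return -1;
      }

      if (d->is_hip == true)
      {
        if (hc_hipMemcpyHtoD (hashcat_ctx, d->hip_d_pws_idx, d->pws_idx, pws_cnt * sizeof (pw_idx_t)) == -1) return -1;

        if (off) if (hc_hipMemcpyHtoD (hashcat_ctx, d->hip_d_pws_comp_buf, d->pws_comp, off * sizeof (u32)) == -1) return -1;
      }

      if (d->is_opencl == true)
      {
        if (hc_clEnqueueWriteBuffer (hashcat_ctx, d->opencl_command_queue, d->opencl_d_pws_idx, CL_TRUE, 0, pws_cnt * sizeof (pw_idx_t), d->pws_idx, 0, NULL, NULL) == -1) return -1;

        if (off) if (hc_clEnqueueWriteBuffer (hashcat_ctx, d->opencl_command_queue, d->opencl_d_pws_comp_buf, CL_TRUE, 0, off * sizeof (u32), d->pws_comp, 0, NULL, NULL) == -1) return -1;
      }

      return run_kernel_decompress (hashcat_ctx, d, pws_cnt);
    }
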
event_log_warning (hashcat_ctx, "* Device #%u: This hardware has outdated CUDA compute capability (%u.%u).", device_id + 1, device_param->sm_major, device_param->sm_minor); - if (user_options->quiet == false) event_log_warning (hashcat_ctx, " For modern OpenCL performance, upgrade to hardware that supports"); - if (user_options->quiet == false) event_log_warning (hashcat_ctx, " CUDA compute capability version 5.0 (Maxwell) or higher."); + if (hashconfig->opts_type & OPTS_TYPE_PT_ADD01) + { + rebuild_pws_compressed_append (device_param, pws_cnt, 0x01); + } + else if (hashconfig->opts_type & OPTS_TYPE_PT_ADD06) + { + rebuild_pws_compressed_append (device_param, pws_cnt, 0x06); + } + else if (hashconfig->opts_type & OPTS_TYPE_PT_ADD80) + { + rebuild_pws_compressed_append (device_param, pws_cnt, 0x80); + } } - - if (device_param->kernel_exec_timeout != 0) + } + else if (user_options->attack_mode == ATTACK_MODE_HYBRID2) + { + if (hashconfig->opts_type & OPTS_TYPE_PT_ADD01) { - if (user_options->quiet == false) event_log_warning (hashcat_ctx, "* Device #%u: WARNING! Kernel exec timeout is not disabled.", device_id + 1); - if (user_options->quiet == false) event_log_warning (hashcat_ctx, " This may cause \"CL_OUT_OF_RESOURCES\" or related errors."); - if (user_options->quiet == false) event_log_warning (hashcat_ctx, " To disable the timeout, see: https://hashcat.net/q/timeoutpatch"); + rebuild_pws_compressed_append (device_param, pws_cnt, 0x01); + } + else if (hashconfig->opts_type & OPTS_TYPE_PT_ADD06) + { + rebuild_pws_compressed_append (device_param, pws_cnt, 0x06); + } + else if (hashconfig->opts_type & OPTS_TYPE_PT_ADD80) + { + rebuild_pws_compressed_append (device_param, pws_cnt, 0x80); } } - /** - * activate device - */ + if (device_param->is_cuda == true) + { + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_pws_idx, device_param->pws_idx, pws_cnt * sizeof (pw_idx_t)) == -1) return -1; - cuda_devices_active++; - } + const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt; - CUcontext cuda_context; + const u32 off = pw_idx->off; - if (hc_cuCtxCreate (hashcat_ctx, &cuda_context, CU_CTX_SCHED_BLOCKING_SYNC, device_param->cuda_device) == -1) return -1; + if (off) + { + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_pws_comp_buf, device_param->pws_comp, off * sizeof (u32)) == -1) return -1; + } + } - if (hc_cuCtxSetCurrent (hashcat_ctx, cuda_context) == -1) return -1; + if (device_param->is_hip == true) + { + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_pws_idx, device_param->pws_idx, pws_cnt * sizeof (pw_idx_t)) == -1) return -1; - // bcrypt optimization? - //const int rc_cuCtxSetCacheConfig = hc_cuCtxSetCacheConfig (hashcat_ctx, CU_FUNC_CACHE_PREFER_SHARED); - // - //if (rc_cuCtxSetCacheConfig == -1) return -1; + const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt; - const int sm = (device_param->sm_major * 10) + device_param->sm_minor; + const u32 off = pw_idx->off; - device_param->has_add = (sm >= 12) ? true : false; - device_param->has_addc = (sm >= 12) ? true : false; - device_param->has_sub = (sm >= 12) ? true : false; - device_param->has_subc = (sm >= 12) ? true : false; - device_param->has_bfe = (sm >= 20) ? true : false; - device_param->has_lop3 = (sm >= 50) ? true : false; - device_param->has_mov64 = (sm >= 10) ? true : false; - device_param->has_prmt = (sm >= 20) ? 
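
rebuild_pws_compressed_append exists because optimized kernels expect the trailing marker to already be part of the candidate: 0x80 is the first byte of MD5/SHA-family message padding, while 0x01 and 0x06 are algorithm-specific markers selected by opts_type. The append itself, isolated into a standalone sketch (the real function rewrites the compressed pws_comp/pws_idx pair in place):

    #include <stdint.h>
    #include <stddef.h>

    static size_t append_marker (uint8_t *pw, const size_t len, const size_t cap, const uint8_t marker)
    {
      if (len + 1 > cap) return len; // no room: leave the candidate untouched

      pw[len] = marker;              // e.g. 0x80 = leading padding bit set

      return len + 1;
    }
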
true : false; + if (off) + { + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_pws_comp_buf, device_param->pws_comp, off * sizeof (u32)) == -1) return -1; + } + } - /* - #define RUN_INSTRUCTION_CHECKS() \ - device_param->has_add = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"add.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }"); \ - device_param->has_addc = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"addc.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }"); \ - device_param->has_sub = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"sub.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }"); \ - device_param->has_subc = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"subc.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }"); \ - device_param->has_bfe = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"bfe.u32 %0, 0, 0, 0;\" : \"=r\"(r)); }"); \ - device_param->has_lop3 = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"lop3.b32 %0, 0, 0, 0, 0;\" : \"=r\"(r)); }"); \ - device_param->has_mov64 = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned long long r; unsigned int a; unsigned int b; asm volatile (\"mov.b64 %0, {%1, %2};\" : \"=l\"(r) : \"r\"(a), \"r\"(b)); }"); \ - device_param->has_prmt = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"prmt.b32 %0, 0, 0, 0;\" : \"=r\"(r)); }"); \ + if (device_param->is_opencl == true) + { + if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_idx, CL_TRUE, 0, pws_cnt * sizeof (pw_idx_t), device_param->pws_idx, 0, NULL, NULL) == -1) return -1; - if (backend_devices_idx > 0) - { - hc_device_param_t *device_param_prev = &devices_param[backend_devices_idx - 1]; + const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt; - if (is_same_device_type (device_param, device_param_prev) == true) - { - device_param->has_add = device_param_prev->has_add; - device_param->has_addc = device_param_prev->has_addc; - device_param->has_sub = device_param_prev->has_sub; - device_param->has_subc = device_param_prev->has_subc; - device_param->has_bfe = device_param_prev->has_bfe; - device_param->has_lop3 = device_param_prev->has_lop3; - device_param->has_mov64 = device_param_prev->has_mov64; - device_param->has_prmt = device_param_prev->has_prmt; - } - else - { - RUN_INSTRUCTION_CHECKS(); + const u32 off = pw_idx->off; + + if (off) + { + if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_comp_buf, CL_TRUE, 0, off * sizeof (u32), device_param->pws_comp, 0, NULL, NULL) == -1) return -1; + } } + + if (run_kernel_decompress (hashcat_ctx, device_param, pws_cnt) == -1) return -1; } else { - RUN_INSTRUCTION_CHECKS(); - } - - #undef RUN_INSTRUCTION_CHECKS - */ + if (user_options->attack_mode == ATTACK_MODE_COMBI) + { + if (device_param->is_cuda == true) + { + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_pws_idx, device_param->pws_idx, pws_cnt * sizeof (pw_idx_t)) == -1) return -1; - // device_available_mem + const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt; - size_t free = 0; - size_t total = 0; + const u32 off 
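
The probe above survives only as a comment in the pre-patch code: instead of compiling PTX test kernels per instruction, the backend derives the has_* flags from the compute capability alone, with sm = sm_major * 10 + sm_minor. A sketch of that gating, using the thresholds visible in this hunk:

    static void derive_nv_features_sketch (hc_device_param_t *d)
    {
      const int sm = (d->sm_major * 10) + d->sm_minor;

      d->has_add   = (sm >= 12); // add with carry-out
      d->has_addc  = (sm >= 12); // add with carry-in
      d->has_sub   = (sm >= 12);
      d->has_subc  = (sm >= 12);
      d->has_bfe   = (sm >= 20); // bit-field extract
      d->has_lop3  = (sm >= 50); // 3-input logic op, Maxwell and later
      d->has_mov64 = (sm >= 10); // two 32-bit regs -> one 64-bit mov.b64
      d->has_prmt  = (sm >= 20); // byte permute
    }
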
= pw_idx->off; - if (hc_cuMemGetInfo (hashcat_ctx, &free, &total) == -1) return -1; + if (off) + { + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_pws_comp_buf, device_param->pws_comp, off * sizeof (u32)) == -1) return -1; + } + } - device_param->device_available_mem = (u64) free; + if (device_param->is_hip == true) + { + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_pws_idx, device_param->pws_idx, pws_cnt * sizeof (pw_idx_t)) == -1) return -1; - if (hc_cuCtxDestroy (hashcat_ctx, cuda_context) == -1) return -1; - } - } + const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt; - backend_ctx->cuda_devices_cnt = cuda_devices_cnt; - backend_ctx->cuda_devices_active = cuda_devices_active; + const u32 off = pw_idx->off; - int opencl_devices_cnt = 0; - int opencl_devices_active = 0; + if (off) + { + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_pws_comp_buf, device_param->pws_comp, off * sizeof (u32)) == -1) return -1; + } + } - if (backend_ctx->ocl) - { - /** - * OpenCL devices: simply push all devices from all platforms into the same device array - */ + if (device_param->is_opencl == true) + { + if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_idx, CL_TRUE, 0, pws_cnt * sizeof (pw_idx_t), device_param->pws_idx, 0, NULL, NULL) == -1) return -1; - cl_uint opencl_platforms_cnt = backend_ctx->opencl_platforms_cnt; - cl_device_id **opencl_platforms_devices = backend_ctx->opencl_platforms_devices; - cl_uint *opencl_platforms_devices_cnt = backend_ctx->opencl_platforms_devices_cnt; - cl_uint *opencl_platforms_vendor_id = backend_ctx->opencl_platforms_vendor_id; - char **opencl_platforms_version = backend_ctx->opencl_platforms_version; + const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt; - for (u32 opencl_platforms_idx = 0; opencl_platforms_idx < opencl_platforms_cnt; opencl_platforms_idx++) - { - cl_device_id *opencl_platform_devices = opencl_platforms_devices[opencl_platforms_idx]; - cl_uint opencl_platform_devices_cnt = opencl_platforms_devices_cnt[opencl_platforms_idx]; - cl_uint opencl_platform_vendor_id = opencl_platforms_vendor_id[opencl_platforms_idx]; - char *opencl_platform_version = opencl_platforms_version[opencl_platforms_idx]; + const u32 off = pw_idx->off; - for (u32 opencl_platform_devices_idx = 0; opencl_platform_devices_idx < opencl_platform_devices_cnt; opencl_platform_devices_idx++, backend_devices_idx++, opencl_devices_cnt++) - { - const u32 device_id = backend_devices_idx; + if (off) + { + if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_comp_buf, CL_TRUE, 0, off * sizeof (u32), device_param->pws_comp, 0, NULL, NULL) == -1) return -1; + } + } - hc_device_param_t *device_param = &devices_param[device_id]; + if (run_kernel_decompress (hashcat_ctx, device_param, pws_cnt) == -1) return -1; + } + else if (user_options->attack_mode == ATTACK_MODE_HYBRID1) + { + if (device_param->is_cuda == true) + { + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_pws_idx, device_param->pws_idx, pws_cnt * sizeof (pw_idx_t)) == -1) return -1; - device_param->device_id = device_id; + const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt; - backend_ctx->backend_device_from_opencl[opencl_devices_cnt] = backend_devices_idx; + const u32 off = pw_idx->off; - backend_ctx->backend_device_from_opencl_platform[opencl_platforms_idx][opencl_platform_devices_idx] = backend_devices_idx; + if (off) + { + if (hc_cuMemcpyHtoD (hashcat_ctx, 
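
When the base side comes from a mask (ATTACK_MODE_HYBRID2 here, ATTACK_KERN_BF below), nothing is uploaded at all: the host only positions the on-device mask processor at the chunk's start inside the keyspace. The pattern, folded into a hypothetical wrapper around the patch's own symbols:

    static int position_mp_sketch (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *d, const u64 pws_cnt)
    {
      const u64 off = d->words_off;       // chunk start within the keyspace

      d->kernel_params_mp_buf64[3] = off; // read by the mp kernel as its base

      return run_kernel_mp (hashcat_ctx, d, KERN_RUN_MP, pws_cnt);
    }
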
device_param->cuda_d_pws_comp_buf, device_param->pws_comp, off * sizeof (u32)) == -1) return -1; + } + } - device_param->opencl_platform_vendor_id = opencl_platform_vendor_id; + if (device_param->is_hip == true) + { + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_pws_idx, device_param->pws_idx, pws_cnt * sizeof (pw_idx_t)) == -1) return -1; - device_param->opencl_device = opencl_platform_devices[opencl_platform_devices_idx]; + const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt; - //device_param->opencl_platform = opencl_platform; + const u32 off = pw_idx->off; - device_param->is_cuda = false; + if (off) + { + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_pws_comp_buf, device_param->pws_comp, off * sizeof (u32)) == -1) return -1; + } + } - device_param->is_opencl = true; + if (device_param->is_opencl == true) + { + if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_idx, CL_TRUE, 0, pws_cnt * sizeof (pw_idx_t), device_param->pws_idx, 0, NULL, NULL) == -1) return -1; - // store opencl platform i + const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt; - device_param->opencl_platform_id = opencl_platforms_idx; + const u32 off = pw_idx->off; - // check OpenCL version + if (off) + { + if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_comp_buf, CL_TRUE, 0, off * sizeof (u32), device_param->pws_comp, 0, NULL, NULL) == -1) return -1; + } + } - device_param->use_opencl12 = false; - device_param->use_opencl20 = false; - device_param->use_opencl21 = false; + if (run_kernel_decompress (hashcat_ctx, device_param, pws_cnt) == -1) return -1; + } + else if (user_options->attack_mode == ATTACK_MODE_HYBRID2) + { + const u64 off = device_param->words_off; - int opencl_version_min = 0; - int opencl_version_maj = 0; + device_param->kernel_params_mp_buf64[3] = off; - if (sscanf (opencl_platform_version, "OpenCL %d.%d", &opencl_version_min, &opencl_version_maj) == 2) - { - if ((opencl_version_min == 1) && (opencl_version_maj == 2)) - { - device_param->use_opencl12 = true; - } - else if ((opencl_version_min == 2) && (opencl_version_maj == 0)) - { - device_param->use_opencl20 = true; - } - else if ((opencl_version_min == 2) && (opencl_version_maj == 1)) - { - device_param->use_opencl21 = true; - } + if (run_kernel_mp (hashcat_ctx, device_param, KERN_RUN_MP, pws_cnt) == -1) return -1; } + } + } + else if (user_options_extra->attack_kern == ATTACK_KERN_BF) + { + const u64 off = device_param->words_off; - size_t param_value_size = 0; + device_param->kernel_params_mp_l_buf64[3] = off; - // opencl_device_type + if (run_kernel_mp (hashcat_ctx, device_param, KERN_RUN_MP_L, pws_cnt) == -1) return -1; + } + } - cl_device_type opencl_device_type; + return 0; +} - if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_TYPE, sizeof (opencl_device_type), &opencl_device_type, NULL) == -1) return -1; +int run_cracker (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u64 pws_cnt) +{ + combinator_ctx_t *combinator_ctx = hashcat_ctx->combinator_ctx; + hashconfig_t *hashconfig = hashcat_ctx->hashconfig; + hashes_t *hashes = hashcat_ctx->hashes; + mask_ctx_t *mask_ctx = hashcat_ctx->mask_ctx; + status_ctx_t *status_ctx = hashcat_ctx->status_ctx; + straight_ctx_t *straight_ctx = hashcat_ctx->straight_ctx; + user_options_t *user_options = hashcat_ctx->user_options; + user_options_extra_t *user_options_extra = hashcat_ctx->user_options_extra; - opencl_device_type &= 
~CL_DEVICE_TYPE_DEFAULT; + // do the on-the-fly combinator mode encoding - device_param->opencl_device_type = opencl_device_type; + bool iconv_enabled = false; - // device_name + iconv_t iconv_ctx = NULL; - if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_NAME, 0, NULL, ¶m_value_size) == -1) return -1; + char *iconv_tmp = NULL; - char *device_name = (char *) hcmalloc (param_value_size); + if (strcmp (user_options->encoding_from, user_options->encoding_to) != 0) + { + iconv_enabled = true; - if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_NAME, param_value_size, device_name, NULL) == -1) return -1; + iconv_ctx = iconv_open (user_options->encoding_to, user_options->encoding_from); - device_param->device_name = device_name; + if (iconv_ctx == (iconv_t) -1) return -1; - hc_string_trim_leading (device_param->device_name); + iconv_tmp = (char *) hcmalloc (HCBUFSIZ_TINY); + } - hc_string_trim_trailing (device_param->device_name); + // find higest password length, this is for optimization stuff - // device_vendor + u32 highest_pw_len = 0; - if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_VENDOR, 0, NULL, ¶m_value_size) == -1) return -1; + if (user_options->slow_candidates == true) + { + /* + for (u64 pws_idx = 0; pws_idx < pws_cnt; pws_idx++) + { + pw_idx_t *pw_idx = device_param->pws_idx + pws_idx; - char *opencl_device_vendor = (char *) hcmalloc (param_value_size); + highest_pw_len = MAX (highest_pw_len, pw_idx->len); + } + */ + } + else + { + if (user_options_extra->attack_kern == ATTACK_KERN_STRAIGHT) + { + } + else if (user_options_extra->attack_kern == ATTACK_KERN_COMBI) + { + } + else if (user_options_extra->attack_kern == ATTACK_KERN_BF) + { + highest_pw_len = device_param->kernel_params_mp_l_buf32[4] + + device_param->kernel_params_mp_l_buf32[5]; + } + } - if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_VENDOR, param_value_size, opencl_device_vendor, NULL) == -1) return -1; + // we make use of this in status view - device_param->opencl_device_vendor = opencl_device_vendor; + device_param->outerloop_multi = 1; + device_param->outerloop_msec = 0; + device_param->outerloop_pos = 0; + device_param->outerloop_left = pws_cnt; - cl_uint opencl_device_vendor_id = 0; + // we ignore the time to copy data over pci bus in this case - if (strcmp (opencl_device_vendor, CL_VENDOR_AMD1) == 0) - { - opencl_device_vendor_id = VENDOR_ID_AMD; - } - else if (strcmp (opencl_device_vendor, CL_VENDOR_AMD2) == 0) - { - opencl_device_vendor_id = VENDOR_ID_AMD; - } - else if (strcmp (opencl_device_vendor, CL_VENDOR_AMD_USE_INTEL) == 0) - { - opencl_device_vendor_id = VENDOR_ID_AMD_USE_INTEL; - } - else if (strcmp (opencl_device_vendor, CL_VENDOR_APPLE) == 0) - { - opencl_device_vendor_id = VENDOR_ID_APPLE; - } - else if (strcmp (opencl_device_vendor, CL_VENDOR_APPLE_USE_AMD) == 0) - { - opencl_device_vendor_id = VENDOR_ID_AMD; - } - else if (strcmp (opencl_device_vendor, CL_VENDOR_APPLE_USE_NV) == 0) - { - opencl_device_vendor_id = VENDOR_ID_NV; - } - else if (strcmp (opencl_device_vendor, CL_VENDOR_APPLE_USE_INTEL) == 0) - { - opencl_device_vendor_id = VENDOR_ID_INTEL_SDK; - } - else if (strcmp (opencl_device_vendor, CL_VENDOR_INTEL_BEIGNET) == 0) - { - opencl_device_vendor_id = VENDOR_ID_INTEL_BEIGNET; - } - else if (strcmp (opencl_device_vendor, CL_VENDOR_INTEL_SDK) == 0) - { - opencl_device_vendor_id = VENDOR_ID_INTEL_SDK; - } - else if (strcmp (opencl_device_vendor, CL_VENDOR_MESA) == 0) - { - 
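
The iconv block re-encodes each candidate from --encoding-from to --encoding-to before upload; the converted length falls out as the output capacity minus what iconv left unused. A standalone, runnable illustration of the same call pattern:

    #include <iconv.h>
    #include <stdio.h>
    #include <string.h>

    int main (void)
    {
      iconv_t ctx = iconv_open ("UTF-8", "ISO-8859-1");

      if (ctx == (iconv_t) -1) return 1;

      char src[] = "p\xe4" "ssword"; // 0xe4 = 'a umlaut' in latin-1

      char dst[64];

      char   *inp   = src;
      size_t  insz  = strlen (src);
      char   *outp  = dst;
      size_t  outsz = sizeof (dst);

      if (iconv (ctx, &inp, &insz, &outp, &outsz) == (size_t) -1) return 1;

      const size_t outlen = sizeof (dst) - outsz; // capacity minus remaining

      printf ("%zu bytes: %.*s\n", outlen, (int) outlen, dst);

      iconv_close (ctx);

      return 0;
    }
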
opencl_device_vendor_id = VENDOR_ID_MESA; - } - else if (strcmp (opencl_device_vendor, CL_VENDOR_NV) == 0) - { - opencl_device_vendor_id = VENDOR_ID_NV; - } - else if (strcmp (opencl_device_vendor, CL_VENDOR_POCL) == 0) - { - opencl_device_vendor_id = VENDOR_ID_POCL; - } - else - { - opencl_device_vendor_id = VENDOR_ID_GENERIC; - } + if (user_options->speed_only == true) + { + hc_timer_set (&device_param->timer_speed); + } - device_param->opencl_device_vendor_id = opencl_device_vendor_id; + // loop start: most outer loop = salt iteration, then innerloops (if multi) - // device_version + for (u32 salt_pos = 0; salt_pos < hashes->salts_cnt; salt_pos++) + { + while (status_ctx->devices_status == STATUS_PAUSED) sleep (1); - if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_VERSION, 0, NULL, ¶m_value_size) == -1) return -1; + salt_t *salt_buf = &hashes->salts_buf[salt_pos]; - char *opencl_device_version = (char *) hcmalloc (param_value_size); + device_param->kernel_params_buf32[27] = salt_pos; + device_param->kernel_params_buf32[31] = salt_buf->digests_cnt; + device_param->kernel_params_buf32[32] = salt_buf->digests_offset; - if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_VERSION, param_value_size, opencl_device_version, NULL) == -1) return -1; + HCFILE *combs_fp = &device_param->combs_fp; - device_param->opencl_device_version = opencl_device_version; + if (user_options->slow_candidates == true) + { + } + else + { + if ((user_options->attack_mode == ATTACK_MODE_COMBI) || (((hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL) == 0) && (user_options->attack_mode == ATTACK_MODE_HYBRID2))) + { + hc_rewind (combs_fp); + } + } - // opencl_device_c_version + // iteration type - if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_OPENCL_C_VERSION, 0, NULL, ¶m_value_size) == -1) return -1; + u32 innerloop_step = 0; + u32 innerloop_cnt = 0; - char *opencl_device_c_version = (char *) hcmalloc (param_value_size); + if (user_options->slow_candidates == true) + { + innerloop_step = 1; + innerloop_cnt = 1; + } + else + { + if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL) innerloop_step = device_param->kernel_loops; + else innerloop_step = 1; - if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_OPENCL_C_VERSION, param_value_size, opencl_device_c_version, NULL) == -1) return -1; + if (user_options_extra->attack_kern == ATTACK_KERN_STRAIGHT) innerloop_cnt = straight_ctx->kernel_rules_cnt; + else if (user_options_extra->attack_kern == ATTACK_KERN_COMBI) innerloop_cnt = (u32) combinator_ctx->combs_cnt; + else if (user_options_extra->attack_kern == ATTACK_KERN_BF) innerloop_cnt = (u32) mask_ctx->bfs_cnt; + } - device_param->opencl_device_c_version = opencl_device_c_version; + // innerloops - // max_compute_units + for (u32 innerloop_pos = 0; innerloop_pos < innerloop_cnt; innerloop_pos += innerloop_step) + { + while (status_ctx->devices_status == STATUS_PAUSED) sleep (1); - cl_uint device_processors = 0; + u32 fast_iteration = 0; - if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof (device_processors), &device_processors, NULL) == -1) return -1; + u32 innerloop_left = innerloop_cnt - innerloop_pos; - device_param->device_processors = device_processors; + if (innerloop_left > innerloop_step) + { + innerloop_left = innerloop_step; - // device_global_mem + fast_iteration = 1; + } - cl_ulong device_global_mem = 0; + hc_thread_mutex_lock 
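
innerloop_cnt is the amplifier dimension: each of the pws_cnt base words already on the device is paired with every rule, right-hand word, or mask candidate, so a single chunk covers pws_cnt * innerloop_cnt candidates (perf_sum_all further below is exactly this product for one inner iteration):

    #include <stdint.h>

    // amplifier source per attack kernel, as selected above:
    //   ATTACK_KERN_STRAIGHT : straight_ctx->kernel_rules_cnt
    //   ATTACK_KERN_COMBI    : combinator_ctx->combs_cnt
    //   ATTACK_KERN_BF       : mask_ctx->bfs_cnt

    static uint64_t chunk_candidates (const uint64_t pws_cnt, const uint64_t innerloop_cnt)
    {
      return pws_cnt * innerloop_cnt;
    }
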
(status_ctx->mux_display); - if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof (device_global_mem), &device_global_mem, NULL) == -1) return -1; + device_param->innerloop_pos = innerloop_pos; + device_param->innerloop_left = innerloop_left; - device_param->device_global_mem = device_global_mem; + device_param->kernel_params_buf32[30] = innerloop_left; - device_param->device_available_mem = 0; + device_param->outerloop_multi = (double) innerloop_cnt / (double) (innerloop_pos + innerloop_left); - // device_maxmem_alloc + hc_thread_mutex_unlock (status_ctx->mux_display); - cl_ulong device_maxmem_alloc = 0; + if (hashes->salts_shown[salt_pos] == 1) + { + status_ctx->words_progress_done[salt_pos] += pws_cnt * innerloop_left; - if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof (device_maxmem_alloc), &device_maxmem_alloc, NULL) == -1) return -1; + continue; + } - device_param->device_maxmem_alloc = device_maxmem_alloc; + // initialize and copy amplifiers - // note we'll limit to 2gb, otherwise this causes all kinds of weird errors because of possible integer overflows in opencl runtimes - // testwise disabling that - //device_param->device_maxmem_alloc = MIN (device_maxmem_alloc, 0x7fffffff); + if (user_options->slow_candidates == true) + { + } + else + { + if (user_options_extra->attack_kern == ATTACK_KERN_STRAIGHT) + { + if (device_param->is_cuda == true) + { + if (hc_cuMemcpyDtoD (hashcat_ctx, device_param->cuda_d_rules_c, device_param->cuda_d_rules + (innerloop_pos * sizeof (kernel_rule_t)), innerloop_left * sizeof (kernel_rule_t)) == -1) return -1; + } - // max_work_group_size + if (device_param->is_hip == true) + { + if (hc_hipMemcpyDtoD (hashcat_ctx, device_param->hip_d_rules_c, device_param->hip_d_rules + (innerloop_pos * sizeof (kernel_rule_t)), innerloop_left * sizeof (kernel_rule_t)) == -1) return -1; + } - size_t device_maxworkgroup_size = 0; + if (device_param->is_opencl == true) + { + if (hc_clEnqueueCopyBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_rules, device_param->opencl_d_rules_c, innerloop_pos * sizeof (kernel_rule_t), 0, innerloop_left * sizeof (kernel_rule_t), 0, NULL, NULL) == -1) return -1; + } + } + else if (user_options_extra->attack_kern == ATTACK_KERN_COMBI) + { + if (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL) + { + if (user_options->attack_mode == ATTACK_MODE_COMBI) + { + char *line_buf = device_param->scratch_buf; - if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof (device_maxworkgroup_size), &device_maxworkgroup_size, NULL) == -1) return -1; + u32 i = 0; - device_param->device_maxworkgroup_size = device_maxworkgroup_size; + while (i < innerloop_left) + { + if (hc_feof (combs_fp)) break; - // max_clock_frequency + size_t line_len = fgetl (combs_fp, line_buf, HCBUFSIZ_LARGE); - cl_uint device_maxclock_frequency = 0; + line_len = convert_from_hex (hashcat_ctx, line_buf, line_len); - if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof (device_maxclock_frequency), &device_maxclock_frequency, NULL) == -1) return -1; + if (line_len > PW_MAX) continue; - device_param->device_maxclock_frequency = device_maxclock_frequency; + char *line_buf_new = line_buf; - // device_endian_little + char rule_buf_out[RP_PASSWORD_SIZE]; - cl_bool device_endian_little = CL_FALSE; - - if (hc_clGetDeviceInfo (hashcat_ctx, 
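
convert_from_hex implements the $HEX[...] wordlist convention, which lets binary candidates survive a text-file round trip. A minimal re-implementation under that assumption (the real helper lives in hashcat's shared code):

    #include <stdio.h>
    #include <string.h>

    static size_t from_hex_line (char *buf, const size_t len)
    {
      if (len < 7)                        return len; // "$HEX[" + "]" + data
      if (strncmp (buf, "$HEX[", 5) != 0) return len;
      if (buf[len - 1] != ']')            return len;
      if ((len - 6) % 2)                  return len; // need an even digit count

      size_t out = 0;

      for (size_t i = 5; i < len - 1; i += 2)
      {
        unsigned int byte = 0;

        if (sscanf (buf + i, "%2x", &byte) != 1) return len; // not hex: keep as-is

        buf[out++] = (char) byte;
      }

      return out; // decoded length; buf now holds raw bytes
    }
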
device_param->opencl_device, CL_DEVICE_ENDIAN_LITTLE, sizeof (device_endian_little), &device_endian_little, NULL) == -1) return -1; + if (run_rule_engine (user_options_extra->rule_len_r, user_options->rule_buf_r)) + { + if (line_len >= RP_PASSWORD_SIZE) continue; - if (device_endian_little == CL_FALSE) - { - event_log_error (hashcat_ctx, "* Device #%u: This device is not little-endian.", device_id + 1); + memset (rule_buf_out, 0, sizeof (rule_buf_out)); - device_param->skipped = true; - } + const int rule_len_out = _old_apply_rule (user_options->rule_buf_r, user_options_extra->rule_len_r, line_buf, (u32) line_len, rule_buf_out); - // device_available + if (rule_len_out < 0) + { + status_ctx->words_progress_rejected[salt_pos] += pws_cnt; - cl_bool device_available = CL_FALSE; + continue; + } - if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_AVAILABLE, sizeof (device_available), &device_available, NULL) == -1) return -1; + line_len = rule_len_out; - if (device_available == CL_FALSE) - { - event_log_error (hashcat_ctx, "* Device #%u: This device is not available.", device_id + 1); + line_buf_new = rule_buf_out; + } - device_param->skipped = true; - } + // do the on-the-fly encoding - // device_compiler_available + if (iconv_enabled == true) + { + char *iconv_ptr = iconv_tmp; + size_t iconv_sz = HCBUFSIZ_TINY; - cl_bool device_compiler_available = CL_FALSE; + if (iconv (iconv_ctx, &line_buf_new, &line_len, &iconv_ptr, &iconv_sz) == (size_t) -1) continue; - if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_COMPILER_AVAILABLE, sizeof (device_compiler_available), &device_compiler_available, NULL) == -1) return -1; + line_buf_new = iconv_tmp; + line_len = HCBUFSIZ_TINY - iconv_sz; + } - if (device_compiler_available == CL_FALSE) - { - event_log_error (hashcat_ctx, "* Device #%u: No compiler is available for this device.", device_id + 1); + line_len = MIN (line_len, PW_MAX); - device_param->skipped = true; - } + u8 *ptr = (u8 *) device_param->combs_buf[i].i; - // device_execution_capabilities + memcpy (ptr, line_buf_new, line_len); - cl_device_exec_capabilities device_execution_capabilities; + memset (ptr + line_len, 0, PW_MAX - line_len); - if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_EXECUTION_CAPABILITIES, sizeof (device_execution_capabilities), &device_execution_capabilities, NULL) == -1) return -1; + if (hashconfig->opts_type & OPTS_TYPE_PT_UPPER) + { + uppercase (ptr, line_len); + } - if ((device_execution_capabilities & CL_EXEC_KERNEL) == 0) - { - event_log_error (hashcat_ctx, "* Device #%u: This device does not support executing kernels.", device_id + 1); + if (combinator_ctx->combs_mode == COMBINATOR_MODE_BASE_LEFT) + { + if (hashconfig->opts_type & OPTS_TYPE_PT_ADD80) + { + ptr[line_len] = 0x80; + } - device_param->skipped = true; - } + if (hashconfig->opts_type & OPTS_TYPE_PT_ADD06) + { + ptr[line_len] = 0x06; + } - // device_extensions + if (hashconfig->opts_type & OPTS_TYPE_PT_ADD01) + { + ptr[line_len] = 0x01; + } + } - size_t device_extensions_size; + device_param->combs_buf[i].pw_len = (u32) line_len; - if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_EXTENSIONS, 0, NULL, &device_extensions_size) == -1) return -1; + i++; + } - char *device_extensions = (char *) hcmalloc (device_extensions_size + 1); + for (u32 j = i; j < innerloop_left; j++) + { + memset (&device_param->combs_buf[j], 0, sizeof (pw_t)); + } - if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, 
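
_old_apply_rule applies the single -j/-k rule on the CPU before the candidate is uploaded; a negative return rejects the word, which the loop above books under words_progress_rejected. A toy interpreter for three classic rule operations to show that contract (illustrative only, not hashcat's engine):

    #include <ctype.h>

    static int toy_apply_rule (const char *rule, char *pw, const int len)
    {
      for (const char *r = rule; *r; r++)
      {
        switch (*r)
        {
          case 'l': for (int i = 0; i < len; i++) pw[i] = (char) tolower ((unsigned char) pw[i]); break;

          case 'u': for (int i = 0; i < len; i++) pw[i] = (char) toupper ((unsigned char) pw[i]); break;

          case 'r': for (int i = 0; i < len / 2; i++) { const char t = pw[i]; pw[i] = pw[len - 1 - i]; pw[len - 1 - i] = t; } break;

          default : return -1; // unknown op: reject, like a negative rule_len_out
        }
      }

      return len; // none of these ops change the length
    }
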
CL_DEVICE_EXTENSIONS, device_extensions_size, device_extensions, NULL) == -1) return -1; + innerloop_left = i; - if (strstr (device_extensions, "base_atomics") == 0) - { - event_log_error (hashcat_ctx, "* Device #%u: This device does not support base atomics.", device_id + 1); + if (device_param->is_cuda == true) + { + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_combs_c, device_param->combs_buf, innerloop_left * sizeof (pw_t)) == -1) return -1; + } - device_param->skipped = true; - } + if (device_param->is_hip == true) + { + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_combs_c, device_param->combs_buf, innerloop_left * sizeof (pw_t)) == -1) return -1; + } - if (strstr (device_extensions, "byte_addressable_store") == 0) - { - event_log_error (hashcat_ctx, "* Device #%u: This device does not support byte-addressable store.", device_id + 1); + if (device_param->is_opencl == true) + { + if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_combs_c, CL_TRUE, 0, innerloop_left * sizeof (pw_t), device_param->combs_buf, 0, NULL, NULL) == -1) return -1; + } + } + else if (user_options->attack_mode == ATTACK_MODE_HYBRID1) + { + u64 off = innerloop_pos; - device_param->skipped = true; - } + device_param->kernel_params_mp_buf64[3] = off; - hcfree (device_extensions); + if (run_kernel_mp (hashcat_ctx, device_param, KERN_RUN_MP, innerloop_left) == -1) return -1; - // device_local_mem_type + if (device_param->is_cuda == true) + { + if (hc_cuMemcpyDtoD (hashcat_ctx, device_param->cuda_d_combs_c, device_param->cuda_d_combs, innerloop_left * sizeof (pw_t)) == -1) return -1; + } - cl_device_local_mem_type device_local_mem_type; + if (device_param->is_hip == true) + { + if (hc_hipMemcpyDtoD (hashcat_ctx, device_param->hip_d_combs_c, device_param->hip_d_combs, innerloop_left * sizeof (pw_t)) == -1) return -1; + } - if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_LOCAL_MEM_TYPE, sizeof (device_local_mem_type), &device_local_mem_type, NULL) == -1) return -1; + if (device_param->is_opencl == true) + { + if (hc_clEnqueueCopyBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_combs, device_param->opencl_d_combs_c, 0, 0, innerloop_left * sizeof (pw_t), 0, NULL, NULL) == -1) return -1; + } + } + else if (user_options->attack_mode == ATTACK_MODE_HYBRID2) + { + u64 off = innerloop_pos; - device_param->device_local_mem_type = device_local_mem_type; + device_param->kernel_params_mp_buf64[3] = off; - // device_max_constant_buffer_size + if (run_kernel_mp (hashcat_ctx, device_param, KERN_RUN_MP, innerloop_left) == -1) return -1; - cl_ulong device_max_constant_buffer_size; + if (device_param->is_cuda == true) + { + if (hc_cuMemcpyDtoD (hashcat_ctx, device_param->cuda_d_combs_c, device_param->cuda_d_combs, innerloop_left * sizeof (pw_t)) == -1) return -1; + } - if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof (device_max_constant_buffer_size), &device_max_constant_buffer_size, NULL) == -1) return -1; + if (device_param->is_hip == true) + { + if (hc_hipMemcpyDtoD (hashcat_ctx, device_param->hip_d_combs_c, device_param->hip_d_combs, innerloop_left * sizeof (pw_t)) == -1) return -1; + } - if (device_local_mem_type == CL_LOCAL) - { - if (device_max_constant_buffer_size < 65536) + if (device_param->is_opencl == true) + { + if (hc_clEnqueueCopyBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_combs, 
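
Hybrid amplifiers never cross the PCIe bus: run_kernel_mp materializes them device-side and a device-to-device copy stages them into the *_combs_c working buffer. The same three-way dispatch as everywhere else, folded into one hypothetical helper:

    static int stage_combs_sketch (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *d, const u64 innerloop_left)
    {
      const u64 size = innerloop_left * sizeof (pw_t);

      if (d->is_cuda   == true) return hc_cuMemcpyDtoD  (hashcat_ctx, d->cuda_d_combs_c, d->cuda_d_combs, size);
      if (d->is_hip    == true) return hc_hipMemcpyDtoD (hashcat_ctx, d->hip_d_combs_c,  d->hip_d_combs,  size);
      if (d->is_opencl == true) return hc_clEnqueueCopyBuffer (hashcat_ctx, d->opencl_command_queue, d->opencl_d_combs, d->opencl_d_combs_c, 0, 0, size, 0, NULL, NULL);

      return -1;
    }
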
device_param->opencl_d_combs_c, 0, 0, innerloop_left * sizeof (pw_t), 0, NULL, NULL) == -1) return -1; + } + } + } + else { - event_log_error (hashcat_ctx, "* Device #%u: This device's constant buffer size is too small.", device_id + 1); + if ((user_options->attack_mode == ATTACK_MODE_COMBI) || (user_options->attack_mode == ATTACK_MODE_HYBRID2)) + { + char *line_buf = device_param->scratch_buf; - device_param->skipped = true; - } - } + u32 i = 0; - // device_local_mem_size + while (i < innerloop_left) + { + if (hc_feof (combs_fp)) break; - cl_ulong device_local_mem_size = 0; + size_t line_len = fgetl (combs_fp, line_buf, HCBUFSIZ_LARGE); - if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof (device_local_mem_size), &device_local_mem_size, NULL) == -1) return -1; + line_len = convert_from_hex (hashcat_ctx, line_buf, line_len); - if (device_local_mem_type == CL_LOCAL) - { - if (device_local_mem_size < 32768) - { - event_log_error (hashcat_ctx, "* Device #%u: This device's local mem size is too small.", device_id + 1); + if (line_len > PW_MAX) continue; - device_param->skipped = true; - } - } + char *line_buf_new = line_buf; - device_param->device_local_mem_size = device_local_mem_size; + char rule_buf_out[RP_PASSWORD_SIZE]; - // older POCL version and older LLVM versions are known to fail compiling kernels - // we need to inform the user to update - // https://github.com/hashcat/hashcat/issues/2344 + if (run_rule_engine (user_options_extra->rule_len_r, user_options->rule_buf_r)) + { + if (line_len >= RP_PASSWORD_SIZE) continue; - if (opencl_platform_vendor_id == VENDOR_ID_POCL) - { - char *pocl_version_ptr = strstr (opencl_platform_version, "pocl "); - char *llvm_version_ptr = strstr (opencl_platform_version, "LLVM "); + memset (rule_buf_out, 0, sizeof (rule_buf_out)); - if ((pocl_version_ptr != NULL) && (llvm_version_ptr != NULL)) - { - bool pocl_skip = false; + const int rule_len_out = _old_apply_rule (user_options->rule_buf_r, user_options_extra->rule_len_r, line_buf, (u32) line_len, rule_buf_out); - int pocl_maj = 0; - int pocl_min = 0; + if (rule_len_out < 0) + { + status_ctx->words_progress_rejected[salt_pos] += pws_cnt; - const int res1 = sscanf (pocl_version_ptr, "pocl %d.%d", &pocl_maj, &pocl_min); + continue; + } - if (res1 == 2) - { - const int pocl_version = (pocl_maj * 100) + pocl_min; + line_len = rule_len_out; - if (pocl_version < 105) - { - pocl_skip = true; - } - } + line_buf_new = rule_buf_out; + } - int llvm_maj = 0; - int llvm_min = 0; + // do the on-the-fly encoding - const int res2 = sscanf (llvm_version_ptr, "LLVM %d.%d", &llvm_maj, &llvm_min); + if (iconv_enabled == true) + { + char *iconv_ptr = iconv_tmp; + size_t iconv_sz = HCBUFSIZ_TINY; - if (res2 == 2) - { - const int llvm_version = (llvm_maj * 100) + llvm_min; + if (iconv (iconv_ctx, &line_buf_new, &line_len, &iconv_ptr, &iconv_sz) == (size_t) -1) continue; - if (llvm_version < 900) - { - pocl_skip = true; - } - } + line_buf_new = iconv_tmp; + line_len = HCBUFSIZ_TINY - iconv_sz; + } - if (pocl_skip == true) - { - if (user_options->force == false) - { - event_log_error (hashcat_ctx, "* Device #%u: Outdated POCL OpenCL driver detected!", device_id + 1); + line_len = MIN (line_len, PW_MAX); - if (user_options->quiet == false) event_log_warning (hashcat_ctx, "This OpenCL driver has been marked as likely to fail kernel compilation or to produce false negatives."); - if (user_options->quiet == false) event_log_warning (hashcat_ctx, "You can use --force to override this, 
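
The removed POCL gate parses the platform version string twice and skips the device when either component predates the known-good combination (pocl >= 1.5 with LLVM >= 9.0, per issue #2344). Isolated into a standalone sketch:

    #include <stdio.h>
    #include <string.h>
    #include <stdbool.h>

    static bool pocl_too_old (const char *platform_version)
    {
      const char *p = strstr (platform_version, "pocl ");
      const char *l = strstr (platform_version, "LLVM ");

      if ((p == NULL) || (l == NULL)) return false; // not a POCL platform

      int maj = 0;
      int min = 0;

      if (sscanf (p, "pocl %d.%d", &maj, &min) == 2)
      {
        if (((maj * 100) + min) < 105) return true; // pocl < 1.5
      }

      if (sscanf (l, "LLVM %d.%d", &maj, &min) == 2)
      {
        if (((maj * 100) + min) < 900) return true; // LLVM < 9.0
      }

      return false;
    }
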
but do not report related errors."); - if (user_options->quiet == false) event_log_warning (hashcat_ctx, NULL); + u8 *ptr = (u8 *) device_param->combs_buf[i].i; - device_param->skipped = true; - } - } - } - } + memcpy (ptr, line_buf_new, line_len); - char *opencl_device_version_lower = hcstrdup (opencl_device_version); + memset (ptr + line_len, 0, PW_MAX - line_len); - lowercase ((u8 *) opencl_device_version_lower, strlen (opencl_device_version_lower)); + if (hashconfig->opts_type & OPTS_TYPE_PT_UPPER) + { + uppercase (ptr, line_len); + } - if ((strstr (opencl_device_version_lower, "neo ")) - || (strstr (opencl_device_version_lower, " neo")) - || (strstr (opencl_device_version_lower, "beignet ")) - || (strstr (opencl_device_version_lower, " beignet")) - || (strstr (opencl_device_version_lower, "mesa ")) - || (strstr (opencl_device_version_lower, " mesa"))) - { - // NEO: https://github.com/hashcat/hashcat/issues/2342 - // BEIGNET: https://github.com/hashcat/hashcat/issues/2243 - // MESA: https://github.com/hashcat/hashcat/issues/2269 + /* + if (combinator_ctx->combs_mode == COMBINATOR_MODE_BASE_LEFT) + { + if (hashconfig->opts_type & OPTS_TYPE_PT_ADD80) + { + ptr[line_len] = 0x80; + } - if (user_options->force == false) - { - event_log_error (hashcat_ctx, "* Device #%u: Unstable OpenCL driver detected!", device_id + 1); + if (hashconfig->opts_type & OPTS_TYPE_PT_ADD06) + { + ptr[line_len] = 0x06; + } - if (user_options->quiet == false) event_log_warning (hashcat_ctx, "This OpenCL driver has been marked as likely to fail kernel compilation or to produce false negatives."); - if (user_options->quiet == false) event_log_warning (hashcat_ctx, "You can use --force to override this, but do not report related errors."); - if (user_options->quiet == false) event_log_warning (hashcat_ctx, NULL); + if (hashconfig->opts_type & OPTS_TYPE_PT_ADD01) + { + ptr[line_len] = 0x01; + } + } + */ - device_param->skipped = true; - } - } + device_param->combs_buf[i].pw_len = (u32) line_len; - hcfree (opencl_device_version_lower); + i++; + } - // Since some times we get reports from users about not working hashcat, dropping error messages like: - // CL_INVALID_COMMAND_QUEUE and CL_OUT_OF_RESOURCES - // Turns out that this is caused by Intel OpenCL runtime handling their GPU devices - // Disable such devices unless the user forces to use it - // This is successfully workaround with new threading model and new memory management - // Tested on Windows 10 - // OpenCL.Version.: OpenCL C 2.1 - // Driver.Version.: 23.20.16.4973 + for (u32 j = i; j < innerloop_left; j++) + { + memset (&device_param->combs_buf[j], 0, sizeof (pw_t)); + } - /* - #if !defined (__APPLE__) - if (opencl_device_type & CL_DEVICE_TYPE_GPU) - { - if ((device_param->opencl_device_vendor_id == VENDOR_ID_INTEL_SDK) || (device_param->opencl_device_vendor_id == VENDOR_ID_INTEL_BEIGNET)) - { - if (user_options->force == false) - { - if (user_options->quiet == false) event_log_warning (hashcat_ctx, "* Device #%u: Intel's OpenCL runtime (GPU only) is currently broken.", device_id + 1); - if (user_options->quiet == false) event_log_warning (hashcat_ctx, " We are waiting for updated OpenCL drivers from Intel."); - if (user_options->quiet == false) event_log_warning (hashcat_ctx, " You can use --force to override, but do not report related errors."); + innerloop_left = i; - device_param->skipped = true; - } - } - } - #endif // __APPLE__ - */ + if (device_param->is_cuda == true) + { + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_combs_c, 
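
The removed NEO/Beignet/Mesa check matches space-delimited tokens in the lowercased version string, which is why both "neo " and " neo" variants appear: a bare strstr ("neo") could hit inside an unrelated word. A standalone sketch of that test:

    #include <ctype.h>
    #include <stdbool.h>
    #include <string.h>

    static bool is_flagged_runtime (const char *device_version)
    {
      char buf[256];

      size_t n = strlen (device_version);

      if (n >= sizeof (buf)) n = sizeof (buf) - 1;

      for (size_t i = 0; i < n; i++) buf[i] = (char) tolower ((unsigned char) device_version[i]);

      buf[n] = 0;

      return strstr (buf, "neo ")     || strstr (buf, " neo")
          || strstr (buf, "beignet ") || strstr (buf, " beignet")
          || strstr (buf, "mesa ")    || strstr (buf, " mesa");
    }
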
device_param->combs_buf, innerloop_left * sizeof (pw_t)) == -1) return -1; + } - // skipped + if (device_param->is_hip == true) + { + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_combs_c, device_param->combs_buf, innerloop_left * sizeof (pw_t)) == -1) return -1; + } - if ((backend_ctx->backend_devices_filter & (1ULL << device_id)) == 0) - { - device_param->skipped = true; - } + if (device_param->is_opencl == true) + { + if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_combs_c, CL_TRUE, 0, innerloop_left * sizeof (pw_t), device_param->combs_buf, 0, NULL, NULL) == -1) return -1; + } + } + else if (user_options->attack_mode == ATTACK_MODE_HYBRID1) + { + u64 off = innerloop_pos; - if ((backend_ctx->opencl_device_types_filter & (opencl_device_type)) == 0) - { - device_param->skipped = true; - } + device_param->kernel_params_mp_buf64[3] = off; - // driver_version + if (run_kernel_mp (hashcat_ctx, device_param, KERN_RUN_MP, innerloop_left) == -1) return -1; - if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DRIVER_VERSION, 0, NULL, ¶m_value_size) == -1) return -1; + if (device_param->is_cuda == true) + { + if (hc_cuMemcpyDtoD (hashcat_ctx, device_param->cuda_d_combs_c, device_param->cuda_d_combs, innerloop_left * sizeof (pw_t)) == -1) return -1; + } - char *opencl_driver_version = (char *) hcmalloc (param_value_size); + if (device_param->is_hip == true) + { + if (hc_hipMemcpyDtoD (hashcat_ctx, device_param->hip_d_combs_c, device_param->hip_d_combs, innerloop_left * sizeof (pw_t)) == -1) return -1; + } - if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DRIVER_VERSION, param_value_size, opencl_driver_version, NULL) == -1) return -1; + if (device_param->is_opencl == true) + { + if (hc_clEnqueueCopyBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_combs, device_param->opencl_d_combs_c, 0, 0, innerloop_left * sizeof (pw_t), 0, NULL, NULL) == -1) return -1; + } + } + } + } + else if (user_options_extra->attack_kern == ATTACK_KERN_BF) + { + u64 off = innerloop_pos; - device_param->opencl_driver_version = opencl_driver_version; + device_param->kernel_params_mp_r_buf64[3] = off; - // vendor specific + if (run_kernel_mp (hashcat_ctx, device_param, KERN_RUN_MP_R, innerloop_left) == -1) return -1; - if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) - { - if ((device_param->opencl_platform_vendor_id == VENDOR_ID_AMD) && (device_param->opencl_device_vendor_id == VENDOR_ID_AMD)) + if (device_param->is_cuda == true) { - need_adl = true; - - #if defined (__linux__) - need_sysfs = true; - #endif + if (hc_cuMemcpyDtoD (hashcat_ctx, device_param->cuda_d_bfs_c, device_param->cuda_d_bfs, innerloop_left * sizeof (bf_t)) == -1) return -1; } - if ((device_param->opencl_platform_vendor_id == VENDOR_ID_NV) && (device_param->opencl_device_vendor_id == VENDOR_ID_NV)) + if (device_param->is_hip == true) { - need_nvml = true; + if (hc_hipMemcpyDtoD (hashcat_ctx, device_param->hip_d_bfs_c, device_param->hip_d_bfs, innerloop_left * sizeof (bf_t)) == -1) return -1; + } - #if defined (_WIN) || defined (__CYGWIN__) - need_nvapi = true; - #endif + if (device_param->is_opencl == true) + { + if (hc_clEnqueueCopyBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_bfs, device_param->opencl_d_bfs_c, 0, 0, innerloop_left * sizeof (bf_t), 0, NULL, NULL) == -1) return -1; } } + } - if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) - { - if 
((device_param->opencl_platform_vendor_id == VENDOR_ID_AMD) && (device_param->opencl_device_vendor_id == VENDOR_ID_AMD)) - { - cl_device_topology_amd amdtopo; + if (choose_kernel (hashcat_ctx, device_param, highest_pw_len, pws_cnt, fast_iteration, salt_pos) == -1) return -1; - if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_TOPOLOGY_AMD, sizeof (amdtopo), &amdtopo, NULL) == -1) return -1; + /** + * benchmark was aborted because too long kernel runtime (slow hashes only) + */ - device_param->pcie_domain = 0; // no attribute to query - device_param->pcie_bus = amdtopo.pcie.bus; - device_param->pcie_device = amdtopo.pcie.device; - device_param->pcie_function = amdtopo.pcie.function; - } + if ((user_options->speed_only == true) && (device_param->speed_only_finish == true)) + { + // nothing to do in that case + } + else + { + /** + * speed + */ - if ((device_param->opencl_platform_vendor_id == VENDOR_ID_NV) && (device_param->opencl_device_vendor_id == VENDOR_ID_NV)) - { - cl_uint pci_bus_id_nv; // is cl_uint the right type for them?? - cl_uint pci_slot_id_nv; + if (status_ctx->run_thread_level2 == true) + { + const u64 perf_sum_all = pws_cnt * innerloop_left; - if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_PCI_BUS_ID_NV, sizeof (pci_bus_id_nv), &pci_bus_id_nv, NULL) == -1) return -1; + const double speed_msec = hc_timer_get (device_param->timer_speed); - if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_PCI_SLOT_ID_NV, sizeof (pci_slot_id_nv), &pci_slot_id_nv, NULL) == -1) return -1; + hc_timer_set (&device_param->timer_speed); - device_param->pcie_domain = 0; // no attribute to query - device_param->pcie_bus = (u8) (pci_bus_id_nv); - device_param->pcie_device = (u8) (pci_slot_id_nv >> 3); - device_param->pcie_function = (u8) (pci_slot_id_nv & 7); + u32 speed_pos = device_param->speed_pos; - int sm_minor = 0; - int sm_major = 0; + device_param->speed_cnt[speed_pos] = perf_sum_all; - if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, sizeof (sm_minor), &sm_minor, NULL) == -1) return -1; + device_param->speed_msec[speed_pos] = speed_msec; - if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, sizeof (sm_major), &sm_major, NULL) == -1) return -1; + speed_pos++; - device_param->sm_minor = sm_minor; - device_param->sm_major = sm_major; + if (speed_pos == SPEED_CACHE) + { + speed_pos = 0; + } - cl_uint kernel_exec_timeout = 0; + device_param->speed_pos = speed_pos; - if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV, sizeof (kernel_exec_timeout), &kernel_exec_timeout, NULL) == -1) return -1; + /** + * progress + */ - device_param->kernel_exec_timeout = kernel_exec_timeout; + hc_thread_mutex_lock (status_ctx->mux_counter); - // CPU burning loop damper - // Value is given as number between 0-100 - // By default 8% + status_ctx->words_progress_done[salt_pos] += perf_sum_all; - device_param->spin_damp = (double) user_options->spin_damp / 100; + hc_thread_mutex_unlock (status_ctx->mux_counter); + } + } - // recommend CUDA + /** + * benchmark, part2 + */ - if ((backend_ctx->cuda == NULL) || (backend_ctx->nvrtc == NULL)) - { - event_log_warning (hashcat_ctx, "* Device #%u: CUDA SDK Toolkit installation NOT detected.", device_id + 1); - event_log_warning (hashcat_ctx, " CUDA SDK Toolkit installation required for proper device support and utilization"); - event_log_warning 
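
Speed sampling is a fixed-size ring: every inner iteration stores one (candidates, msec) pair at speed_pos, the index wraps at SPEED_CACHE, and the status view averages over the filled slots. A self-contained sketch (SPEED_CACHE's real value may differ):

    #include <stdint.h>

    #define SPEED_CACHE 128 // illustrative; hashcat defines its own size

    typedef struct
    {
      uint64_t cnt [SPEED_CACHE]; // candidates tested in each sample
      double   msec[SPEED_CACHE]; // wall time of each sample
      uint32_t pos;
    } speed_ring_t;

    static void speed_push (speed_ring_t *r, const uint64_t cnt, const double msec)
    {
      r->cnt [r->pos] = cnt;
      r->msec[r->pos] = msec;

      r->pos = (r->pos + 1 == SPEED_CACHE) ? 0 : r->pos + 1;
    }

    static double hashes_per_sec (const speed_ring_t *r, const uint32_t used)
    {
      uint64_t cnt  = 0;
      double   msec = 0;

      for (uint32_t i = 0; i < used; i++)
      {
        cnt  += r->cnt [i];
        msec += r->msec[i];
      }

      return (msec > 0) ? (double) cnt / (msec / 1000.0) : 0;
    }
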
(hashcat_ctx, " Falling back to OpenCL Runtime"); + if (user_options->speed_only == true) + { + // let's abort this so that the user doesn't have to wait too long on the result + // for slow hashes it's fine anyway as boost mode should be turned on - event_log_warning (hashcat_ctx, NULL); - } - } + if (hashconfig->attack_exec == ATTACK_EXEC_OUTSIDE_KERNEL) + { + device_param->speed_only_finish = true; + + break; } - // common driver check + double total_msec = device_param->speed_msec[0]; - if (device_param->skipped == false) + for (u32 speed_pos = 1; speed_pos < device_param->speed_pos; speed_pos++) { - if ((user_options->force == false) && (user_options->backend_info == false)) + total_msec += device_param->speed_msec[speed_pos]; + } + + if (user_options->slow_candidates == true) + { + if ((total_msec > 4000) || (device_param->speed_pos == SPEED_CACHE - 1)) { - if (opencl_device_type & CL_DEVICE_TYPE_CPU) + const u32 speed_pos = device_param->speed_pos; + + if (speed_pos) { - if (device_param->opencl_platform_vendor_id == VENDOR_ID_INTEL_SDK) - { - bool intel_warn = false; + device_param->speed_cnt[0] = device_param->speed_cnt[speed_pos - 1]; + device_param->speed_msec[0] = device_param->speed_msec[speed_pos - 1]; + } - // Intel OpenCL runtime 18 + device_param->speed_pos = 0; - int opencl_driver1 = 0; - int opencl_driver2 = 0; - int opencl_driver3 = 0; - int opencl_driver4 = 0; + device_param->speed_only_finish = true; - const int res18 = sscanf (device_param->opencl_driver_version, "%d.%d.%d.%d", &opencl_driver1, &opencl_driver2, &opencl_driver3, &opencl_driver4); + break; + } + } + else + { + // it's unclear if 4s is enough to turn on boost mode for all backend device - if (res18 == 4) - { - // so far all versions 18 are ok - } - else - { - // Intel OpenCL runtime 16 + if ((total_msec > 4000) || (device_param->speed_pos == SPEED_CACHE - 1)) + { + device_param->speed_only_finish = true; - float opencl_version = 0; - int opencl_build = 0; + break; + } + } + } - const int res16 = sscanf (device_param->opencl_device_version, "OpenCL %f (Build %d)", &opencl_version, &opencl_build); + if (device_param->speed_only_finish == true) break; - if (res16 == 2) - { - if (opencl_build < 25) intel_warn = true; - } - } + /** + * result + */ - if (intel_warn == true) - { - event_log_error (hashcat_ctx, "* Device #%u: Outdated or broken Intel OpenCL runtime '%s' detected!", device_id + 1, device_param->opencl_driver_version); + check_cracked (hashcat_ctx, device_param, salt_pos); - event_log_warning (hashcat_ctx, "You are STRONGLY encouraged to use the officially supported Intel OpenCL runtime."); - event_log_warning (hashcat_ctx, "See hashcat.net for officially supported Intel OpenCL runtime."); - event_log_warning (hashcat_ctx, "See also: https://hashcat.net/faq/wrongdriver"); - event_log_warning (hashcat_ctx, "You can use --force to override this, but do not report related errors."); - event_log_warning (hashcat_ctx, NULL); + if (status_ctx->run_thread_level2 == false) break; + } - return -1; - } - } - } - else if (opencl_device_type & CL_DEVICE_TYPE_GPU) - { - if (device_param->opencl_platform_vendor_id == VENDOR_ID_AMD) - { - bool amd_warn = true; + if (user_options->speed_only == true) break; - #if defined (__linux__) - // AMDGPU-PRO Driver 16.40 and higher - if (strtoul (device_param->opencl_driver_version, NULL, 10) >= 2117) amd_warn = false; - // AMDGPU-PRO Driver 16.50 is known to be broken - if (strtoul (device_param->opencl_driver_version, NULL, 10) == 2236) amd_warn = true; - // AMDGPU-PRO 
Driver 16.60 is known to be broken - if (strtoul (device_param->opencl_driver_version, NULL, 10) == 2264) amd_warn = true; - // AMDGPU-PRO Driver 17.10 is known to be broken - if (strtoul (device_param->opencl_driver_version, NULL, 10) == 2348) amd_warn = true; - // AMDGPU-PRO Driver 17.20 (2416) is fine, doesn't need check will match >= 2117 - #elif defined (_WIN) - // AMD Radeon Software 14.9 and higher, should be updated to 15.12 - if (strtoul (device_param->opencl_driver_version, NULL, 10) >= 1573) amd_warn = false; - #else - // we have no information about other os - if (amd_warn == true) amd_warn = false; - #endif + //status screen makes use of this, can't reset here + //device_param->innerloop_msec = 0; + //device_param->innerloop_pos = 0; + //device_param->innerloop_left = 0; - if (amd_warn == true) - { - event_log_error (hashcat_ctx, "* Device #%u: Outdated or broken AMD driver '%s' detected!", device_id + 1, device_param->opencl_driver_version); + if (status_ctx->run_thread_level2 == false) break; + } - event_log_warning (hashcat_ctx, "You are STRONGLY encouraged to use the officially supported AMD driver."); - event_log_warning (hashcat_ctx, "See hashcat.net for officially supported AMD drivers."); - event_log_warning (hashcat_ctx, "See also: https://hashcat.net/faq/wrongdriver"); - event_log_warning (hashcat_ctx, "You can use --force to override this, but do not report related errors."); - event_log_warning (hashcat_ctx, NULL); + //status screen makes use of this, can't reset here + //device_param->outerloop_msec = 0; + //device_param->outerloop_pos = 0; + //device_param->outerloop_left = 0; - return -1; - } - } + if (user_options->speed_only == true) + { + double total_msec = device_param->speed_msec[0]; - if (device_param->opencl_platform_vendor_id == VENDOR_ID_NV) - { - int nv_warn = true; + for (u32 speed_pos = 1; speed_pos < device_param->speed_pos; speed_pos++) + { + total_msec += device_param->speed_msec[speed_pos]; + } - int version_maj = 0; - int version_min = 0; + device_param->outerloop_msec = total_msec * hashes->salts_cnt * device_param->outerloop_multi; - const int r = sscanf (device_param->opencl_driver_version, "%d.%d", &version_maj, &version_min); + device_param->speed_only_finish = true; + } - if (r == 2) - { - // nvidia 441.x looks ok + return 0; +} - if (version_maj == 440) - { - if (version_min >= 64) - { - nv_warn = false; - } - } - else - { - // unknown version scheme, probably new driver version +int backend_ctx_init (hashcat_ctx_t *hashcat_ctx) +{ + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; + user_options_t *user_options = hashcat_ctx->user_options; - nv_warn = false; - } - } - else - { - // unknown version scheme, probably new driver version + backend_ctx->enabled = false; - nv_warn = false; - } + if (user_options->example_hashes == true) return 0; + if (user_options->keyspace == true) return 0; + if (user_options->left == true) return 0; + if (user_options->show == true) return 0; + if (user_options->usage == true) return 0; + if (user_options->version == true) return 0; - if (nv_warn == true) - { - event_log_warning (hashcat_ctx, "* Device #%u: Outdated or broken NVIDIA driver '%s' detected!", device_id + 1, device_param->opencl_driver_version); - event_log_warning (hashcat_ctx, NULL); + hc_device_param_t *devices_param = (hc_device_param_t *) hccalloc (DEVICES_MAX, sizeof (hc_device_param_t)); - event_log_warning (hashcat_ctx, "You are STRONGLY encouraged to use the officially supported NVIDIA driver."); - event_log_warning 
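
The final --speed-only estimate scales one measured chunk across every salt and across the amplifier fraction that was never executed; outerloop_multi was set above to innerloop_cnt divided by the portion actually completed, so the product projects the full run:

    #include <stdint.h>

    static double projected_msec (const double   total_msec,      // summed samples
                                  const uint32_t salts_cnt,
                                  const double   outerloop_multi) // cnt / completed
    {
      return total_msec * (double) salts_cnt * outerloop_multi;
    }
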
(hashcat_ctx, "See hashcat's homepage for officially supported NVIDIA drivers."); - event_log_warning (hashcat_ctx, "See also: https://hashcat.net/faq/wrongdriver"); - event_log_warning (hashcat_ctx, "You can use --force to override this, but do not report related errors."); - event_log_warning (hashcat_ctx, NULL); + backend_ctx->devices_param = devices_param; - return -1; - } + /** + * Load and map CUDA library calls, then init CUDA + */ - if (device_param->sm_major < 5) - { - if (user_options->quiet == false) event_log_warning (hashcat_ctx, "* Device #%u: This hardware has outdated CUDA compute capability (%u.%u).", device_id + 1, device_param->sm_major, device_param->sm_minor); - if (user_options->quiet == false) event_log_warning (hashcat_ctx, " For modern OpenCL performance, upgrade to hardware that supports"); - if (user_options->quiet == false) event_log_warning (hashcat_ctx, " CUDA compute capability version 5.0 (Maxwell) or higher."); - } + int rc_cuda_init = -1; - if (device_param->kernel_exec_timeout != 0) - { - if (user_options->quiet == false) event_log_warning (hashcat_ctx, "* Device #%u: WARNING! Kernel exec timeout is not disabled.", device_id + 1); - if (user_options->quiet == false) event_log_warning (hashcat_ctx, " This may cause \"CL_OUT_OF_RESOURCES\" or related errors."); - if (user_options->quiet == false) event_log_warning (hashcat_ctx, " To disable the timeout, see: https://hashcat.net/q/timeoutpatch"); - } - } - } - } + if (user_options->backend_ignore_cuda == false) + { + CUDA_PTR *cuda = (CUDA_PTR *) hcmalloc (sizeof (CUDA_PTR)); - /** - * activate device - */ + backend_ctx->cuda = cuda; - opencl_devices_active++; - } + rc_cuda_init = cuda_init (hashcat_ctx); - /** - * create context for each device - */ + if (rc_cuda_init == -1) + { + cuda_close (hashcat_ctx); + } - cl_context context; + /** + * Load and map NVRTC library calls + */ - /* - cl_context_properties properties[3]; + NVRTC_PTR *nvrtc = (NVRTC_PTR *) hcmalloc (sizeof (NVRTC_PTR)); - properties[0] = CL_CONTEXT_PLATFORM; - properties[1] = (cl_context_properties) device_param->opencl_platform; - properties[2] = 0; + backend_ctx->nvrtc = nvrtc; - CL_rc = hc_clCreateContext (hashcat_ctx, properties, 1, &device_param->opencl_device, NULL, NULL, &context); - */ + int rc_nvrtc_init = nvrtc_init (hashcat_ctx); - if (hc_clCreateContext (hashcat_ctx, NULL, 1, &device_param->opencl_device, NULL, NULL, &context) == -1) return -1; + if (rc_nvrtc_init == -1) + { + nvrtc_close (hashcat_ctx); + } - /** - * create command-queue - */ + /** + * Check if both CUDA and NVRTC were load successful + */ - cl_command_queue command_queue; + if ((rc_cuda_init == 0) && (rc_nvrtc_init == 0)) + { + // nvrtc version - if (hc_clCreateCommandQueue (hashcat_ctx, context, device_param->opencl_device, 0, &command_queue) == -1) return -1; + int nvrtc_major = 0; + int nvrtc_minor = 0; - if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) && (device_param->opencl_platform_vendor_id == VENDOR_ID_AMD)) - { - #define RUN_INSTRUCTION_CHECKS() - device_param->has_vadd = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADD_U32 %0, vcc, 0, 0;\" : \"=v\"(r1)); }"); \ - device_param->has_vaddc = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADDC_U32 %0, vcc, 0, 0, vcc;\" : \"=v\"(r1)); }"); \ - device_param->has_vadd_co = opencl_test_instruction (hashcat_ctx, context, 
device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADD_CO_U32 %0, vcc, 0, 0;\" : \"=v\"(r1)); }"); \ - device_param->has_vaddc_co = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADDC_CO_U32 %0, vcc, 0, 0, vcc;\" : \"=v\"(r1)); }"); \ - device_param->has_vsub = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_SUB_U32 %0, vcc, 0, 0;\" : \"=v\"(r1)); }"); \ - device_param->has_vsubb = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_SUBB_U32 %0, vcc, 0, 0, vcc;\" : \"=v\"(r1)); }"); \ - device_param->has_vsub_co = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_SUB_CO_U32 %0, vcc, 0, 0;\" : \"=v\"(r1)); }"); \ - device_param->has_vsubb_co = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_SUBB_CO_U32 %0, vcc, 0, 0, vcc;\" : \"=v\"(r1)); }"); \ - device_param->has_vadd3 = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADD3_U32 %0, 0, 0, 0;\" : \"=v\"(r1)); }"); \ - device_param->has_vbfe = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_BFE_U32 %0, 0, 0, 0;\" : \"=v\"(r1)); }"); \ - device_param->has_vperm = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_PERM_B32 %0, 0, 0, 0;\" : \"=v\"(r1)); }"); \ + if (hc_nvrtcVersion (hashcat_ctx, &nvrtc_major, &nvrtc_minor) == -1) return -1; - if (backend_devices_idx > 0) - { - hc_device_param_t *device_param_prev = &devices_param[backend_devices_idx - 1]; + int nvrtc_driver_version = (nvrtc_major * 1000) + (nvrtc_minor * 10); - if (is_same_device_type (device_param, device_param_prev) == true) - { - device_param->has_vadd = device_param_prev->has_vadd; - device_param->has_vaddc = device_param_prev->has_vaddc; - device_param->has_vadd_co = device_param_prev->has_vadd_co; - device_param->has_vaddc_co = device_param_prev->has_vaddc_co; - device_param->has_vsub = device_param_prev->has_vsub; - device_param->has_vsubb = device_param_prev->has_vsubb; - device_param->has_vsub_co = device_param_prev->has_vsub_co; - device_param->has_vsubb_co = device_param_prev->has_vsubb_co; - device_param->has_vadd3 = device_param_prev->has_vadd3; - device_param->has_vbfe = device_param_prev->has_vbfe; - device_param->has_vperm = device_param_prev->has_vperm; - } - else - { - RUN_INSTRUCTION_CHECKS(); - } - } - else - { - RUN_INSTRUCTION_CHECKS(); - } - - #undef RUN_INSTRUCTION_CHECKS - } - - if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) && (device_param->opencl_platform_vendor_id == VENDOR_ID_NV)) - { - const int sm = (device_param->sm_major * 10) + device_param->sm_minor; + backend_ctx->nvrtc_driver_version = nvrtc_driver_version; - device_param->has_add = (sm >= 12) ? true : false; - device_param->has_addc = (sm >= 12) ? true : false; - device_param->has_sub = (sm >= 12) ? true : false; - device_param->has_subc = (sm >= 12) ? true : false; - device_param->has_bfe = (sm >= 20) ? true : false; - device_param->has_lop3 = (sm >= 50) ? 
true : false; - device_param->has_mov64 = (sm >= 10) ? true : false; - device_param->has_prmt = (sm >= 20) ? true : false; + if (nvrtc_driver_version < 9000) + { + event_log_error (hashcat_ctx, "Outdated NVIDIA NVRTC driver version '%d' detected!", nvrtc_driver_version); - /* - #define RUN_INSTRUCTION_CHECKS() \ - device_param->has_add = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"add.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }"); \ - device_param->has_addc = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"addc.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }"); \ - device_param->has_sub = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"sub.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }"); \ - device_param->has_subc = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"subc.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }"); \ - device_param->has_bfe = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"bfe.u32 %0, 0, 0, 0;\" : \"=r\"(r)); }"); \ - device_param->has_lop3 = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"lop3.b32 %0, 0, 0, 0, 0;\" : \"=r\"(r)); }"); \ - device_param->has_mov64 = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { ulong r; uint a; uint b; asm volatile (\"mov.b64 %0, {%1, %2};\" : \"=l\"(r) : \"r\"(a), \"r\"(b)); }"); \ - device_param->has_prmt = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"prmt.b32 %0, 0, 0, 0;\" : \"=r\"(r)); }"); \ + event_log_warning (hashcat_ctx, "See hashcat.net for officially supported NVIDIA CUDA Toolkit versions."); + event_log_warning (hashcat_ctx, NULL); - if (backend_devices_idx > 0) - { - hc_device_param_t *device_param_prev = &devices_param[backend_devices_idx - 1]; + return -1; + } - if (is_same_device_type (device_param, device_param_prev) == true) - { - device_param->has_add = device_param_prev->has_add; - device_param->has_addc = device_param_prev->has_addc; - device_param->has_sub = device_param_prev->has_sub; - device_param->has_subc = device_param_prev->has_subc; - device_param->has_bfe = device_param_prev->has_bfe; - device_param->has_lop3 = device_param_prev->has_lop3; - device_param->has_mov64 = device_param_prev->has_mov64; - device_param->has_prmt = device_param_prev->has_prmt; - } - else - { - RUN_INSTRUCTION_CHECKS(); - } - } - else - { - RUN_INSTRUCTION_CHECKS(); - } + // cuda version - #undef RUN_INSTRUCTION_CHECKS - */ - } + int cuda_driver_version = 0; - // device_available_mem + if (hc_cuDriverGetVersion (hashcat_ctx, &cuda_driver_version) == -1) return -1; - #define MAX_ALLOC_CHECKS_CNT 8192 - #define MAX_ALLOC_CHECKS_SIZE (64 * 1024 * 1024) + backend_ctx->cuda_driver_version = cuda_driver_version; - device_param->device_available_mem = device_param->device_global_mem - MAX_ALLOC_CHECKS_SIZE; + if (cuda_driver_version < 9000) + { + event_log_error (hashcat_ctx, "Outdated NVIDIA CUDA driver version '%d' detected!", cuda_driver_version); - #if defined (_WIN) - if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) && (device_param->opencl_platform_vendor_id == VENDOR_ID_NV)) - 
#else - if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) && ((device_param->opencl_platform_vendor_id == VENDOR_ID_NV) || (device_param->opencl_platform_vendor_id == VENDOR_ID_AMD))) - #endif - { - // OK, so the problem here is the following: - // There's just CL_DEVICE_GLOBAL_MEM_SIZE to ask OpenCL about the total memory on the device, - // but there's no way to ask for available memory on the device. - // In combination, most OpenCL runtimes implementation of clCreateBuffer() - // are doing so called lazy memory allocation on the device. - // Now, if the user has X11 (or a game or anything that takes a lot of GPU memory) - // running on the host we end up with an error type of this: - // clEnqueueNDRangeKernel(): CL_MEM_OBJECT_ALLOCATION_FAILURE - // The clEnqueueNDRangeKernel() is because of the lazy allocation - // The best way to workaround this problem is if we would be able to ask for available memory, - // The idea here is to try to evaluate available memory by allocating it till it errors + event_log_warning (hashcat_ctx, "See hashcat.net for officially supported NVIDIA CUDA Toolkit versions."); + event_log_warning (hashcat_ctx, NULL); - cl_mem *tmp_device = (cl_mem *) hccalloc (MAX_ALLOC_CHECKS_CNT, sizeof (cl_mem)); + return -1; + } + } + else + { + rc_cuda_init = -1; + rc_nvrtc_init = -1; - u64 c; + cuda_close (hashcat_ctx); + nvrtc_close (hashcat_ctx); + } + } - for (c = 0; c < MAX_ALLOC_CHECKS_CNT; c++) - { - if (((c + 1 + 1) * MAX_ALLOC_CHECKS_SIZE) >= device_param->device_global_mem) break; + /** + * Load and map HIP library calls, then init HIP + */ - cl_int CL_err; + int rc_hip_init = -1; - OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl; + if (user_options->backend_ignore_hip == false) + { + HIP_PTR *hip = (HIP_PTR *) hcmalloc (sizeof (HIP_PTR)); - tmp_device[c] = ocl->clCreateBuffer (context, CL_MEM_READ_WRITE, MAX_ALLOC_CHECKS_SIZE, NULL, &CL_err); + backend_ctx->hip = hip; - if (CL_err != CL_SUCCESS) - { - c--; + rc_hip_init = hip_init (hashcat_ctx); + if (rc_hip_init == -1) + { + hip_close (hashcat_ctx); + } - break; - } + /** + * Load and map HIPRTC library calls + */ - // transfer only a few byte should be enough to force the runtime to actually allocate the memory + HIPRTC_PTR *hiprtc = (HIPRTC_PTR *) hcmalloc (sizeof (HIPRTC_PTR)); - u8 tmp_host[8]; + backend_ctx->hiprtc = hiprtc; - if (ocl->clEnqueueReadBuffer (command_queue, tmp_device[c], CL_TRUE, 0, sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break; + int rc_hiprtc_init = hiprtc_init (hashcat_ctx); - if (ocl->clEnqueueWriteBuffer (command_queue, tmp_device[c], CL_TRUE, 0, sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break; + if (rc_hiprtc_init == -1) + { + hiprtc_close (hashcat_ctx); + } - if (ocl->clEnqueueReadBuffer (command_queue, tmp_device[c], CL_TRUE, MAX_ALLOC_CHECKS_SIZE - sizeof (tmp_host), sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break; + /** + * Check if both HIP and HIPRTC were loaded successfully + */ - if (ocl->clEnqueueWriteBuffer (command_queue, tmp_device[c], CL_TRUE, MAX_ALLOC_CHECKS_SIZE - sizeof (tmp_host), sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break; - } + if ((rc_hip_init == 0) && (rc_hiprtc_init == 0)) + { + // hiprtc version - device_param->device_available_mem = MAX_ALLOC_CHECKS_SIZE; - if (c > 0) - { - device_param->device_available_mem *= c; - } + int hiprtc_major = 0; + int hiprtc_minor = 0; - // clean up + if (hc_hiprtcVersion (hashcat_ctx, &hiprtc_major, &hiprtc_minor) == -1) return -1; - for (c = 0; c < 
MAX_ALLOC_CHECKS_CNT; c++) - { - if (((c + 1 + 1) * MAX_ALLOC_CHECKS_SIZE) >= device_param->device_global_mem) break; + int hiprtc_driver_version = (hiprtc_major * 1000) + (hiprtc_minor * 10); - if (tmp_device[c] != NULL) - { - if (hc_clReleaseMemObject (hashcat_ctx, tmp_device[c]) == -1) return -1; - } - } + backend_ctx->hiprtc_driver_version = hiprtc_driver_version; - hcfree (tmp_device); - } + if (hiprtc_driver_version < 9000) + { + event_log_error (hashcat_ctx, "Outdated AMD HIPRTC driver version '%d' detected!", hiprtc_driver_version); - hc_clReleaseCommandQueue (hashcat_ctx, command_queue); + event_log_warning (hashcat_ctx, "See hashcat.net for officially supported AMD HIP versions."); + event_log_warning (hashcat_ctx, NULL); - hc_clReleaseContext (hashcat_ctx, context); + return -1; } - } - } - backend_ctx->opencl_devices_cnt = opencl_devices_cnt; - backend_ctx->opencl_devices_active = opencl_devices_active; + // hip version - // all devices combined go into backend_* variables + int hip_driver_version = 10000; - backend_ctx->backend_devices_cnt = cuda_devices_cnt + opencl_devices_cnt; - backend_ctx->backend_devices_active = cuda_devices_active + opencl_devices_active; + //if (hc_hipDriverGetVersion (hashcat_ctx, &hip_driver_version) == -1) return -1; - // find duplicate devices + backend_ctx->hip_driver_version = hip_driver_version; - //if ((cuda_devices_cnt > 0) && (opencl_devices_cnt > 0)) - //{ - // using force here enables both devices, which is the worst possible outcome - // many users force by default, so this is not a good idea + if (hip_driver_version < 9000) + { + event_log_error (hashcat_ctx, "Outdated AMD HIP driver version '%d' detected!", hip_driver_version); - //if (user_options->force == false) - //{ - backend_ctx_find_alias_devices (hashcat_ctx); - //{ - //} + event_log_warning (hashcat_ctx, "See hashcat.net for officially supported AMD HIP versions."); + event_log_warning (hashcat_ctx, NULL); - if (backend_ctx->backend_devices_active == 0) - { - event_log_error (hashcat_ctx, "No devices found/left."); + return -1; + } + } + else + { + rc_hip_init = -1; + rc_hiprtc_init = -1; - return -1; + hip_close (hashcat_ctx); + hiprtc_close (hashcat_ctx); + } } - // now we can calculate the number of parallel running hook threads based on - // the number cpu cores and the number of active compute devices - // unless overwritten by the user - - if (user_options->hook_threads == HOOK_THREADS) - { - const u32 processor_count = hc_get_processor_count (); + /** + * Load and map OpenCL library calls + */ - const u32 processor_count_cu = CEILDIV (processor_count, backend_ctx->backend_devices_active); // should never reach 0 + int rc_ocl_init = -1; - user_options->hook_threads = processor_count_cu; - } + if (user_options->backend_ignore_opencl == false) + { + OCL_PTR *ocl = (OCL_PTR *) hcmalloc (sizeof (OCL_PTR)); - // additional check to see if the user has chosen a device that is not within the range of available devices (i.e. 
larger than devices_cnt) + backend_ctx->ocl = ocl; - if (backend_ctx->backend_devices_filter != (u64) -1) - { - const u64 backend_devices_cnt_mask = ~(((u64) -1 >> backend_ctx->backend_devices_cnt) << backend_ctx->backend_devices_cnt); + rc_ocl_init = ocl_init (hashcat_ctx); - if (backend_ctx->backend_devices_filter > backend_devices_cnt_mask) + if (rc_ocl_init == -1) { - event_log_error (hashcat_ctx, "An invalid device was specified using the --backend-devices parameter."); - event_log_error (hashcat_ctx, "The specified device was higher than the number of available devices (%u).", backend_ctx->backend_devices_cnt); - - return -1; + ocl_close (hashcat_ctx); } - } - backend_ctx->target_msec = TARGET_MSEC_PROFILE[user_options->workload_profile - 1]; + /** + * return if CUDA, HIP and OpenCL initialization failed + */ - backend_ctx->need_adl = need_adl; - backend_ctx->need_nvml = need_nvml; - backend_ctx->need_nvapi = need_nvapi; - backend_ctx->need_sysfs = need_sysfs; + if ((rc_hip_init == -1) && (rc_cuda_init == -1) && (rc_ocl_init == -1)) + { + event_log_error (hashcat_ctx, "ATTENTION! No OpenCL, CUDA or HIP installation found."); - backend_ctx->comptime = comptime; + event_log_warning (hashcat_ctx, "You are probably missing the CUDA, HIP or OpenCL runtime installation."); + event_log_warning (hashcat_ctx, NULL); - return 0; -} + #if defined (__linux__) + event_log_warning (hashcat_ctx, "* AMD GPUs on Linux require this driver:"); + event_log_warning (hashcat_ctx, " \"RadeonOpenCompute (ROCm)\" Software Platform (3.1 or later)"); + #elif defined (_WIN) + event_log_warning (hashcat_ctx, "* AMD GPUs on Windows require this driver:"); + event_log_warning (hashcat_ctx, " \"AMD Radeon Adrenalin 2020 Edition\" (20.2.2 or later)"); + #endif -void backend_ctx_devices_destroy (hashcat_ctx_t *hashcat_ctx) -{ - backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; + event_log_warning (hashcat_ctx, "* Intel CPUs require this runtime:"); + event_log_warning (hashcat_ctx, " \"OpenCL Runtime for Intel Core and Intel Xeon Processors\" (16.1.1 or later)"); - if (backend_ctx->enabled == false) return; + event_log_warning (hashcat_ctx, "* NVIDIA GPUs require this runtime and/or driver (both):"); + event_log_warning (hashcat_ctx, " \"NVIDIA Driver\" (440.64 or later)"); + event_log_warning (hashcat_ctx, " \"CUDA Toolkit\" (9.0 or later)"); + event_log_warning (hashcat_ctx, NULL); - for (u32 opencl_platforms_idx = 0; opencl_platforms_idx < backend_ctx->opencl_platforms_cnt; opencl_platforms_idx++) - { - hcfree (backend_ctx->opencl_platforms_devices[opencl_platforms_idx]); - hcfree (backend_ctx->opencl_platforms_name[opencl_platforms_idx]); - hcfree (backend_ctx->opencl_platforms_vendor[opencl_platforms_idx]); - hcfree (backend_ctx->opencl_platforms_version[opencl_platforms_idx]); - } - - for (int backend_devices_idx = 0; backend_devices_idx < backend_ctx->backend_devices_cnt; backend_devices_idx++) - { - hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx]; - - if (device_param->skipped == true) continue; - - hcfree (device_param->device_name); - - if (device_param->is_opencl == true) - { - hcfree (device_param->opencl_driver_version); - hcfree (device_param->opencl_device_version); - hcfree (device_param->opencl_device_c_version); - hcfree (device_param->opencl_device_vendor); + return -1; } - } - - backend_ctx->backend_devices_cnt = 0; - backend_ctx->backend_devices_active = 0; - backend_ctx->cuda_devices_cnt = 0; - backend_ctx->cuda_devices_active = 0; - 
backend_ctx->opencl_devices_cnt = 0; - backend_ctx->opencl_devices_active = 0; - - backend_ctx->need_adl = false; - backend_ctx->need_nvml = false; - backend_ctx->need_nvapi = false; - backend_ctx->need_sysfs = false; -} - -void backend_ctx_devices_sync_tuning (hashcat_ctx_t *hashcat_ctx) -{ - backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - - if (backend_ctx->enabled == false) return; - - for (int backend_devices_cnt_src = 0; backend_devices_cnt_src < backend_ctx->backend_devices_cnt; backend_devices_cnt_src++) - { - hc_device_param_t *device_param_src = &backend_ctx->devices_param[backend_devices_cnt_src]; - - if (device_param_src->skipped == true) continue; - - if (device_param_src->skipped_warning == true) continue; - - for (int backend_devices_cnt_dst = backend_devices_cnt_src + 1; backend_devices_cnt_dst < backend_ctx->backend_devices_cnt; backend_devices_cnt_dst++) - { - hc_device_param_t *device_param_dst = &backend_ctx->devices_param[backend_devices_cnt_dst]; - if (device_param_dst->skipped == true) continue; + /** + * Some permission pre-check, because AMDGPU-PRO Driver crashes if the user has no permission to do this + */ - if (device_param_dst->skipped_warning == true) continue; + if (ocl_check_dri (hashcat_ctx) == -1) return -1; + } - if (is_same_device_type (device_param_src, device_param_dst) == false) continue; + /** + * Backend device selection + */ - device_param_dst->kernel_accel = device_param_src->kernel_accel; - device_param_dst->kernel_loops = device_param_src->kernel_loops; - device_param_dst->kernel_threads = device_param_src->kernel_threads; + u64 backend_devices_filter; - const u32 hardware_power = device_param_dst->device_processors * device_param_dst->kernel_threads; + if (setup_backend_devices_filter (hashcat_ctx, user_options->backend_devices, &backend_devices_filter) == false) return -1; - device_param_dst->hardware_power = hardware_power; + backend_ctx->backend_devices_filter = backend_devices_filter; - const u32 kernel_power = device_param_dst->hardware_power * device_param_dst->kernel_accel; + /** + * OpenCL device type selection + */ - device_param_dst->kernel_power = kernel_power; - } - } -} + cl_device_type opencl_device_types_filter; -void backend_ctx_devices_update_power (hashcat_ctx_t *hashcat_ctx) -{ - backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - status_ctx_t *status_ctx = hashcat_ctx->status_ctx; - user_options_extra_t *user_options_extra = hashcat_ctx->user_options_extra; - user_options_t *user_options = hashcat_ctx->user_options; + if (setup_opencl_device_types_filter (hashcat_ctx, user_options->opencl_device_types, &opencl_device_types_filter) == false) return -1; - if (backend_ctx->enabled == false) return; + backend_ctx->opencl_device_types_filter = opencl_device_types_filter; - u32 kernel_power_all = 0; + /** + * CUDA API: init + */ - for (int backend_devices_idx = 0; backend_devices_idx < backend_ctx->backend_devices_cnt; backend_devices_idx++) + if (backend_ctx->cuda) { - hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx]; - - if (device_param->skipped == true) continue; - - if (device_param->skipped_warning == true) continue; - - kernel_power_all += device_param->kernel_power; + if (hc_cuInit (hashcat_ctx, 0) == -1) + { + cuda_close (hashcat_ctx); + } } - backend_ctx->kernel_power_all = kernel_power_all; - - /* - * Inform user about possible slow speeds + /** + * HIP API: init */ - if ((user_options_extra->wordlist_mode == WL_MODE_FILE) || (user_options_extra->wordlist_mode == 
WL_MODE_MASK)) + if (backend_ctx->hip) { - if (status_ctx->words_base < kernel_power_all) + if (hc_hipInit (hashcat_ctx, 0) == -1) { - if (user_options->quiet == false) - { - event_log_advice (hashcat_ctx, "The wordlist or mask that you are using is too small."); - event_log_advice (hashcat_ctx, "This means that hashcat cannot use the full parallel power of your device(s)."); - event_log_advice (hashcat_ctx, "Unless you supply more work, your cracking speed will drop."); - event_log_advice (hashcat_ctx, "For tips on supplying more work, see: https://hashcat.net/faq/morework"); - event_log_advice (hashcat_ctx, NULL); - } + hip_close (hashcat_ctx); } } -} - -void backend_ctx_devices_kernel_loops (hashcat_ctx_t *hashcat_ctx) -{ - combinator_ctx_t *combinator_ctx = hashcat_ctx->combinator_ctx; - hashconfig_t *hashconfig = hashcat_ctx->hashconfig; - hashes_t *hashes = hashcat_ctx->hashes; - mask_ctx_t *mask_ctx = hashcat_ctx->mask_ctx; - backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - straight_ctx_t *straight_ctx = hashcat_ctx->straight_ctx; - user_options_t *user_options = hashcat_ctx->user_options; - user_options_extra_t *user_options_extra = hashcat_ctx->user_options_extra; - if (backend_ctx->enabled == false) return; + /** + * OpenCL API: init + */ - for (int backend_devices_idx = 0; backend_devices_idx < backend_ctx->backend_devices_cnt; backend_devices_idx++) + if (backend_ctx->ocl) { - hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx]; - - if (device_param->skipped == true) continue; - - if (device_param->skipped_warning == true) continue; + #define FREE_OPENCL_CTX_ON_ERROR \ + do { \ + hcfree (opencl_platforms); \ + hcfree (opencl_platforms_devices); \ + hcfree (opencl_platforms_devices_cnt); \ + hcfree (opencl_platforms_name); \ + hcfree (opencl_platforms_vendor); \ + hcfree (opencl_platforms_vendor_id); \ + hcfree (opencl_platforms_version); \ + } while (0) - device_param->kernel_loops_min = device_param->kernel_loops_min_sav; - device_param->kernel_loops_max = device_param->kernel_loops_max_sav; + cl_platform_id *opencl_platforms = (cl_platform_id *) hccalloc (CL_PLATFORMS_MAX, sizeof (cl_platform_id)); + cl_uint opencl_platforms_cnt = 0; + cl_device_id **opencl_platforms_devices = (cl_device_id **) hccalloc (CL_PLATFORMS_MAX, sizeof (cl_device_id *)); + cl_uint *opencl_platforms_devices_cnt = (cl_uint *) hccalloc (CL_PLATFORMS_MAX, sizeof (cl_uint)); + char **opencl_platforms_name = (char **) hccalloc (CL_PLATFORMS_MAX, sizeof (char *)); + char **opencl_platforms_vendor = (char **) hccalloc (CL_PLATFORMS_MAX, sizeof (char *)); + cl_uint *opencl_platforms_vendor_id = (cl_uint *) hccalloc (CL_PLATFORMS_MAX, sizeof (cl_uint)); + char **opencl_platforms_version = (char **) hccalloc (CL_PLATFORMS_MAX, sizeof (char *)); - if (device_param->kernel_loops_min < device_param->kernel_loops_max) + if (hc_clGetPlatformIDs (hashcat_ctx, CL_PLATFORMS_MAX, opencl_platforms, &opencl_platforms_cnt) == -1) { - u32 innerloop_cnt = 0; + opencl_platforms_cnt = 0; - if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL) - { - if (user_options->slow_candidates == true) - { - innerloop_cnt = 1; - } - else - { - if (user_options_extra->attack_kern == ATTACK_KERN_STRAIGHT) innerloop_cnt = MIN (KERNEL_RULES, (u32) straight_ctx->kernel_rules_cnt); - else if (user_options_extra->attack_kern == ATTACK_KERN_COMBI) innerloop_cnt = MIN (KERNEL_COMBS, (u32) combinator_ctx->combs_cnt); - else if (user_options_extra->attack_kern == ATTACK_KERN_BF) innerloop_cnt = MIN 
(KERNEL_BFS, (u32) mask_ctx->bfs_cnt); - } - } - else - { - innerloop_cnt = hashes->salts_buf[0].salt_iter; - } + if (opencl_platforms_cnt) + { + for (u32 opencl_platforms_idx = 0; opencl_platforms_idx < opencl_platforms_cnt; opencl_platforms_idx++) + { + cl_platform_id opencl_platform = opencl_platforms[opencl_platforms_idx]; - if ((innerloop_cnt >= device_param->kernel_loops_min) && - (innerloop_cnt <= device_param->kernel_loops_max)) - { - device_param->kernel_loops_max = innerloop_cnt; - } + size_t param_value_size = 0; - } - } + // platform vendor -static int get_cuda_kernel_wgs (hashcat_ctx_t *hashcat_ctx, CUfunction function, u32 *result) -{ - int max_threads_per_block; + if (hc_clGetPlatformInfo (hashcat_ctx, opencl_platform, CL_PLATFORM_VENDOR, 0, NULL, &param_value_size) == -1) return -1; - if (hc_cuFuncGetAttribute (hashcat_ctx, &max_threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function) == -1) return -1; + char *opencl_platform_vendor = (char *) hcmalloc (param_value_size); - *result = (u32) max_threads_per_block; + if (hc_clGetPlatformInfo (hashcat_ctx, opencl_platform, CL_PLATFORM_VENDOR, param_value_size, opencl_platform_vendor, NULL) == -1) return -1; - return 0; -} + opencl_platforms_vendor[opencl_platforms_idx] = opencl_platform_vendor; + // platform name -static int get_cuda_kernel_local_mem_size (hashcat_ctx_t *hashcat_ctx, CUfunction function, u64 *result) -{ - int shared_size_bytes; + if (hc_clGetPlatformInfo (hashcat_ctx, opencl_platform, CL_PLATFORM_NAME, 0, NULL, &param_value_size) == -1) return -1; - if (hc_cuFuncGetAttribute (hashcat_ctx, &shared_size_bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, function) == -1) return -1; + char *opencl_platform_name = (char *) hcmalloc (param_value_size); - *result = (u64) shared_size_bytes; + if (hc_clGetPlatformInfo (hashcat_ctx, opencl_platform, CL_PLATFORM_NAME, param_value_size, opencl_platform_name, NULL) == -1) return -1; - return 0; -} + opencl_platforms_name[opencl_platforms_idx] = opencl_platform_name; -static int get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx_t *hashcat_ctx, CUfunction function, u64 *result) -{ + // platform version - // AFAIK there's no way to query the maximum value for dynamic shared memory available (because it depends on kernel code). 
- // let's brute force it, therefore workaround the hashcat wrapper of cuFuncSetAttribute() + if (hc_clGetPlatformInfo (hashcat_ctx, opencl_platform, CL_PLATFORM_VERSION, 0, NULL, &param_value_size) == -1) return -1; - #define MAX_ASSUMED_SHARED (1024 * 1024) + char *opencl_platform_version = (char *) hcmalloc (param_value_size); - u64 dynamic_shared_size_bytes = 0; + if (hc_clGetPlatformInfo (hashcat_ctx, opencl_platform, CL_PLATFORM_VERSION, param_value_size, opencl_platform_version, NULL) == -1) return -1; - for (int i = 1; i <= MAX_ASSUMED_SHARED; i++) - { - backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; + opencl_platforms_version[opencl_platforms_idx] = opencl_platform_version; - CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; + // find our own platform vendor because pocl and mesa are pushing original vendor_id through opencl + // this causes trouble with vendor id based macros + // we'll assign generic to those without special optimization available - const CUresult CU_err = cuda->cuFuncSetAttribute (function, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, i); + cl_uint opencl_platform_vendor_id = 0; - if (CU_err == CUDA_SUCCESS) - { - dynamic_shared_size_bytes = i; + if (strcmp (opencl_platform_vendor, CL_VENDOR_AMD1) == 0) + { + opencl_platform_vendor_id = VENDOR_ID_AMD; + } - continue; - } + else if (strcmp (opencl_platform_vendor, CL_VENDOR_AMD2) == 0) + { + opencl_platform_vendor_id = VENDOR_ID_AMD; + } - break; - } + else if (strcmp (opencl_platform_vendor, CL_VENDOR_AMD_USE_INTEL) == 0) + { + opencl_platform_vendor_id = VENDOR_ID_AMD_USE_INTEL; + } - *result = dynamic_shared_size_bytes; + else if (strcmp (opencl_platform_vendor, CL_VENDOR_APPLE) == 0) + { + opencl_platform_vendor_id = VENDOR_ID_APPLE; + } - if (hc_cuFuncSetAttribute (hashcat_ctx, function, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, 0) == -1) return -1; + else if (strcmp (opencl_platform_vendor, CL_VENDOR_INTEL_BEIGNET) == 0) + { + opencl_platform_vendor_id = VENDOR_ID_INTEL_BEIGNET; + } - return 0; -} + else if (strcmp (opencl_platform_vendor, CL_VENDOR_INTEL_SDK) == 0) + { + opencl_platform_vendor_id = VENDOR_ID_INTEL_SDK; + } + else if (strcmp (opencl_platform_vendor, CL_VENDOR_MESA) == 0) + { + opencl_platform_vendor_id = VENDOR_ID_MESA; + } -static int get_opencl_kernel_wgs (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, cl_kernel kernel, u32 *result) -{ - size_t work_group_size = 0; + else if (strcmp (opencl_platform_vendor, CL_VENDOR_NV) == 0) + { + opencl_platform_vendor_id = VENDOR_ID_NV; + } + else if (strcmp (opencl_platform_vendor, CL_VENDOR_POCL) == 0) + { + opencl_platform_vendor_id = VENDOR_ID_POCL; + } + else + { + opencl_platform_vendor_id = VENDOR_ID_GENERIC; + } - if (hc_clGetKernelWorkGroupInfo (hashcat_ctx, kernel, device_param->opencl_device, CL_KERNEL_WORK_GROUP_SIZE, sizeof (work_group_size), &work_group_size, NULL) == -1) return -1; + opencl_platforms_vendor_id[opencl_platforms_idx] = opencl_platform_vendor_id; + cl_device_id *opencl_platform_devices = (cl_device_id *) hccalloc (DEVICES_MAX, sizeof (cl_device_id)); - u32 kernel_threads = (u32) work_group_size;
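[Editorial note: the hc_clGetPlatformInfo pairs added in this hunk use the standard two-call OpenCL idiom: the first call passes a NULL buffer and asks only for the required size, the second call fetches the value into a buffer of exactly that size. A minimal standalone sketch of the idiom, using the raw OpenCL API instead of hashcat's hc_* wrappers; error handling elided:

    size_t len = 0;

    // first call: NULL buffer, we only learn the required size
    clGetPlatformInfo (platform, CL_PLATFORM_VENDOR, 0, NULL, &len);

    char *vendor = (char *) malloc (len);

    // second call: fetch the NUL-terminated vendor string itself
    clGetPlatformInfo (platform, CL_PLATFORM_VENDOR, len, vendor, NULL);
]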
+ cl_uint opencl_platform_devices_cnt = 0; - size_t compile_work_group_size[3] = { 0, 0, 0 }; + const int CL_rc = hc_clGetDeviceIDs (hashcat_ctx, opencl_platform, CL_DEVICE_TYPE_ALL, DEVICES_MAX, opencl_platform_devices, &opencl_platform_devices_cnt); - if (hc_clGetKernelWorkGroupInfo (hashcat_ctx, kernel, device_param->opencl_device, CL_KERNEL_COMPILE_WORK_GROUP_SIZE, sizeof (compile_work_group_size), &compile_work_group_size, NULL) == -1) return -1; + if (CL_rc == -1) + { + event_log_error (hashcat_ctx, "clGetDeviceIDs(): %s", val2cstr_cl (CL_rc)); - const size_t cwgs_total = compile_work_group_size[0] * compile_work_group_size[1] * compile_work_group_size[2]; + // Special handling for CL_DEVICE_NOT_FOUND, see: https://github.com/hashcat/hashcat/issues/2455 - if (cwgs_total > 0) - { - kernel_threads = MIN (kernel_threads, (u32) cwgs_total); - } + #define IGNORE_DEVICE_NOT_FOUND 1 - *result = kernel_threads; + if (IGNORE_DEVICE_NOT_FOUND) + { + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - return 0; -} + OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl; -static int get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, cl_kernel kernel, u32 *result) -{ - size_t preferred_work_group_size_multiple = 0; + const cl_int CL_err = ocl->clGetDeviceIDs (opencl_platform, CL_DEVICE_TYPE_ALL, DEVICES_MAX, opencl_platform_devices, &opencl_platform_devices_cnt); - if (hc_clGetKernelWorkGroupInfo (hashcat_ctx, kernel, device_param->opencl_device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof (preferred_work_group_size_multiple), &preferred_work_group_size_multiple, NULL) == -1) return -1; + if (CL_err == CL_DEVICE_NOT_FOUND) + { + // we ignore this error + } + else + { + return -1; + } + } + else + { + return -1; + } + } - *result = (u32) preferred_work_group_size_multiple; + opencl_platforms_devices[opencl_platforms_idx] = opencl_platform_devices; - return 0; -} + opencl_platforms_devices_cnt[opencl_platforms_idx] = opencl_platform_devices_cnt; + } -static int get_opencl_kernel_local_mem_size (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, cl_kernel kernel, u64 *result) -{ - cl_ulong local_mem_size = 0; + if (user_options->opencl_device_types == NULL) + { + /** + * OpenCL device types: + * In case the user did not specify --opencl-device-types and runs hashcat on a system with only a CPU, they probably want to use that CPU. 
+ */ - if (hc_clGetKernelWorkGroupInfo (hashcat_ctx, kernel, device_param->opencl_device, CL_KERNEL_LOCAL_MEM_SIZE, sizeof (local_mem_size), &local_mem_size, NULL) == -1) return -1; + cl_device_type opencl_device_types_all = 0; - *result = local_mem_size; + for (u32 opencl_platforms_idx = 0; opencl_platforms_idx < opencl_platforms_cnt; opencl_platforms_idx++) + { + cl_device_id *opencl_platform_devices = opencl_platforms_devices[opencl_platforms_idx]; + cl_uint opencl_platform_devices_cnt = opencl_platforms_devices_cnt[opencl_platforms_idx]; - return 0; -} + for (u32 opencl_platform_devices_idx = 0; opencl_platform_devices_idx < opencl_platform_devices_cnt; opencl_platform_devices_idx++) + { + cl_device_id opencl_device = opencl_platform_devices[opencl_platform_devices_idx]; -static int get_opencl_kernel_dynamic_local_mem_size (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, cl_kernel kernel, u64 *result) -{ - cl_ulong dynamic_local_mem_size = 0; + cl_device_type opencl_device_type; - if (hc_clGetKernelWorkGroupInfo (hashcat_ctx, kernel, device_param->opencl_device, CL_KERNEL_LOCAL_MEM_SIZE, sizeof (dynamic_local_mem_size), &dynamic_local_mem_size, NULL) == -1) return -1; + if (hc_clGetDeviceInfo (hashcat_ctx, opencl_device, CL_DEVICE_TYPE, sizeof (opencl_device_type), &opencl_device_type, NULL) == -1) + { + FREE_OPENCL_CTX_ON_ERROR; - // unknown how to query this information in OpenCL - // we therefore reset to zero - // the above call to hc_clGetKernelWorkGroupInfo() is just to avoid compiler warnings + return -1; + } - dynamic_local_mem_size = 0; + opencl_device_types_all |= opencl_device_type; + } + } - *result = dynamic_local_mem_size; + // In such a case, automatically enable CPU device type support, since it's disabled by default. - return 0; -} + if ((opencl_device_types_all & (CL_DEVICE_TYPE_GPU | CL_DEVICE_TYPE_ACCELERATOR)) == 0) + { + opencl_device_types_filter |= CL_DEVICE_TYPE_CPU; + } -static u32 get_kernel_threads (const hc_device_param_t *device_param) -{ - // this is an upper limit, a good start, since our strategy is to reduce thread counts only. + // In another case, when the user uses --stdout, using CPU devices is much faster to set up + // If we have a CPU device, force it to be used - u32 kernel_threads_min = device_param->kernel_threads_min; - u32 kernel_threads_max = device_param->kernel_threads_max; + if (user_options->stdout_flag == true) + { + if (opencl_device_types_all & CL_DEVICE_TYPE_CPU) + { + opencl_device_types_filter = CL_DEVICE_TYPE_CPU; + } + } - // the changes we do here are just optimizations, since the module always has priority. 
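[Editorial note: the CPU fallback added above is a plain bitmask decision: the CL_DEVICE_TYPE bits of every enumerated device are OR-ed into opencl_device_types_all, and the CPU bit is only added to the filter when neither the GPU bit nor the accelerator bit ended up set. A minimal sketch of that decision, assuming the filter was initialized to GPU plus accelerator, which is hashcat's default when --opencl-device-types is not given:

    cl_device_type all = 0;

    all |= CL_DEVICE_TYPE_CPU; // example: only a CPU-only platform was enumerated

    cl_device_type filter = CL_DEVICE_TYPE_GPU | CL_DEVICE_TYPE_ACCELERATOR;

    if ((all & (CL_DEVICE_TYPE_GPU | CL_DEVICE_TYPE_ACCELERATOR)) == 0)
    {
      filter |= CL_DEVICE_TYPE_CPU; // no GPU/accelerator found: opt the CPU back in
    }
]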
+ backend_ctx->opencl_device_types_filter = opencl_device_types_filter; + } + } - const u32 device_maxworkgroup_size = (const u32) device_param->device_maxworkgroup_size; + backend_ctx->opencl_platforms = opencl_platforms; + backend_ctx->opencl_platforms_cnt = opencl_platforms_cnt; + backend_ctx->opencl_platforms_devices = opencl_platforms_devices; + backend_ctx->opencl_platforms_devices_cnt = opencl_platforms_devices_cnt; + backend_ctx->opencl_platforms_name = opencl_platforms_name; + backend_ctx->opencl_platforms_vendor = opencl_platforms_vendor; + backend_ctx->opencl_platforms_vendor_id = opencl_platforms_vendor_id; + backend_ctx->opencl_platforms_version = opencl_platforms_version; - kernel_threads_max = MIN (kernel_threads_max, device_maxworkgroup_size); + #undef FREE_OPENCL_CTX_ON_ERROR + } - if (device_param->opencl_device_type & CL_DEVICE_TYPE_CPU) + /** + * Final checks + */ + + if ((backend_ctx->hip == NULL) && (backend_ctx->cuda == NULL) && (backend_ctx->ocl == NULL)) { - // for all CPU we just do 1 ... + event_log_error (hashcat_ctx, "ATTENTION! No OpenCL-compatible, CUDA-compatible or HIP-compatible platform found."); - const u32 cpu_prefered_thread_count = 1; + event_log_warning (hashcat_ctx, "You are probably missing the OpenCL, CUDA or HIP runtime installation."); + event_log_warning (hashcat_ctx, NULL); - kernel_threads_max = MIN (kernel_threads_max, cpu_prefered_thread_count); - } - else if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) - { - // for GPU we need to distinguish by vendor + #if defined (__linux__) + event_log_warning (hashcat_ctx, "* AMD GPUs on Linux require this driver:"); + event_log_warning (hashcat_ctx, " \"RadeonOpenCompute (ROCm)\" Software Platform (3.1 or later)"); + #elif defined (_WIN) + event_log_warning (hashcat_ctx, "* AMD GPUs on Windows require this driver:"); + event_log_warning (hashcat_ctx, " \"AMD Radeon Adrenalin 2020 Edition\" (20.2.2 or later)"); + #endif - if (device_param->opencl_device_vendor_id == VENDOR_ID_INTEL_SDK) - { - const u32 gpu_prefered_thread_count = 8; + event_log_warning (hashcat_ctx, "* Intel CPUs require this runtime:"); + event_log_warning (hashcat_ctx, " \"OpenCL Runtime for Intel Core and Intel Xeon Processors\" (16.1.1 or later)"); - kernel_threads_max = MIN (kernel_threads_max, gpu_prefered_thread_count); - } - else if (device_param->opencl_device_vendor_id == VENDOR_ID_AMD) - { - const u32 gpu_prefered_thread_count = 64; + event_log_warning (hashcat_ctx, "* NVIDIA GPUs require this runtime and/or driver (both):"); + event_log_warning (hashcat_ctx, " \"NVIDIA Driver\" (440.64 or later)"); + event_log_warning (hashcat_ctx, " \"CUDA Toolkit\" (9.0 or later)"); + event_log_warning (hashcat_ctx, NULL); - kernel_threads_max = MIN (kernel_threads_max, gpu_prefered_thread_count); - } + return -1; } - // this is intenionally! at this point, kernel_threads_min can be higher than kernel_threads_max. - // in this case we actually want kernel_threads_min selected. 
- - const u32 kernel_threads = MAX (kernel_threads_min, kernel_threads_max); + backend_ctx->enabled = true; - return kernel_threads; + return 0; } -static bool load_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const char *kernel_name, char *source_file, char *cached_file, const char *build_options_buf, const bool cache_disable, cl_program *opencl_program, CUmodule *cuda_module) +void backend_ctx_destroy (hashcat_ctx_t *hashcat_ctx) { - const hashconfig_t *hashconfig = hashcat_ctx->hashconfig; - - bool cached = true; + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - if (cache_disable == true) - { - cached = false; - } + if (backend_ctx->enabled == false) return; - if (hc_path_read (cached_file) == false) - { - cached = false; - } + hcfree (backend_ctx->devices_param); - if (hc_path_is_empty (cached_file) == true) + if (backend_ctx->ocl) { - cached = false; + hcfree (backend_ctx->opencl_platforms); + hcfree (backend_ctx->opencl_platforms_devices); + hcfree (backend_ctx->opencl_platforms_devices_cnt); + hcfree (backend_ctx->opencl_platforms_name); + hcfree (backend_ctx->opencl_platforms_vendor); + hcfree (backend_ctx->opencl_platforms_vendor_id); + hcfree (backend_ctx->opencl_platforms_version); } - /** - * kernel compile or load - */ + nvrtc_close (hashcat_ctx); + cuda_close (hashcat_ctx); + hiprtc_close (hashcat_ctx); + hip_close (hashcat_ctx); + ocl_close (hashcat_ctx); - size_t kernel_lengths_buf = 0; + memset (backend_ctx, 0, sizeof (backend_ctx_t)); +} - size_t *kernel_lengths = &kernel_lengths_buf; +int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) +{ + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; + user_options_t *user_options = hashcat_ctx->user_options; - char *kernel_sources_buf = NULL; + if (backend_ctx->enabled == false) return 0; - char **kernel_sources = &kernel_sources_buf; + hc_device_param_t *devices_param = backend_ctx->devices_param; - if (cached == false) - { - #if defined (DEBUG) - const user_options_t *user_options = hashcat_ctx->user_options; + bool need_adl = false; + bool need_nvml = false; + bool need_nvapi = false; + bool need_sysfs = false; - if (user_options->quiet == false) event_log_warning (hashcat_ctx, "* Device #%u: Kernel %s not found in cache! Building may take a while...", device_param->device_id + 1, filename_from_filepath (cached_file)); - #endif + int backend_devices_idx = 0; - if (read_kernel_binary (hashcat_ctx, source_file, kernel_lengths, kernel_sources) == false) return false; + int cuda_devices_cnt = 0; + int cuda_devices_active = 0; - if (device_param->is_cuda == true) + if (backend_ctx->cuda) + { + // device count + + if (hc_cuDeviceGetCount (hashcat_ctx, &cuda_devices_cnt) == -1) { - nvrtcProgram program; + cuda_close (hashcat_ctx); + } - if (hc_nvrtcCreateProgram (hashcat_ctx, &program, kernel_sources[0], kernel_name, 0, NULL, NULL) == -1) return false; + backend_ctx->cuda_devices_cnt = cuda_devices_cnt; - char **nvrtc_options = (char **) hccalloc (4 + strlen (build_options_buf) + 1, sizeof (char *)); // ... 
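[Editorial note: backend_ctx_devices_init flattens every backend into one device list. The per-backend loops that follow share a single backend_devices_idx counter, so CUDA devices occupy the first flat indices, HIP devices the next block, and the backend_device_from_cuda / backend_device_from_hip tables record the mapping. A schematic sketch of the indexing those loops maintain, reduced to the mapping only:

    int backend_devices_idx = 0;

    // CUDA devices land at flat indices [0 .. cuda_devices_cnt)
    for (int i = 0; i < cuda_devices_cnt; i++, backend_devices_idx++)
      backend_ctx->backend_device_from_cuda[i] = backend_devices_idx;

    // HIP devices continue at [cuda_devices_cnt .. cuda_devices_cnt + hip_devices_cnt)
    for (int i = 0; i < hip_devices_cnt; i++, backend_devices_idx++)
      backend_ctx->backend_device_from_hip[i] = backend_devices_idx;
]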
+ // device specific - nvrtc_options[0] = "--restrict"; - nvrtc_options[1] = "--device-as-default-execution-space"; - nvrtc_options[2] = "--gpu-architecture"; + for (int cuda_devices_idx = 0; cuda_devices_idx < cuda_devices_cnt; cuda_devices_idx++, backend_devices_idx++) + { + const u32 device_id = backend_devices_idx; - hc_asprintf (&nvrtc_options[3], "compute_%d%d", device_param->sm_major, device_param->sm_minor); + hc_device_param_t *device_param = &devices_param[backend_devices_idx]; - char *nvrtc_options_string = hcstrdup (build_options_buf); + device_param->device_id = device_id; - const int num_options = 4 + nvrtc_make_options_array_from_string (nvrtc_options_string, nvrtc_options + 4); + backend_ctx->backend_device_from_cuda[cuda_devices_idx] = backend_devices_idx; - const int rc_nvrtcCompileProgram = hc_nvrtcCompileProgram (hashcat_ctx, program, num_options, (const char * const *) nvrtc_options); + CUdevice cuda_device; - size_t build_log_size = 0; + if (hc_cuDeviceGet (hashcat_ctx, &cuda_device, cuda_devices_idx) == -1) return -1; - hc_nvrtcGetProgramLogSize (hashcat_ctx, program, &build_log_size); + device_param->cuda_device = cuda_device; - #if defined (DEBUG) - if ((build_log_size > 1) || (rc_nvrtcCompileProgram == -1)) - #else - if (rc_nvrtcCompileProgram == -1) - #endif - { - char *build_log = (char *) hcmalloc (build_log_size + 1); + device_param->is_cuda = true; - if (hc_nvrtcGetProgramLog (hashcat_ctx, program, build_log) == -1) return false; + device_param->is_opencl = false; - puts (build_log); + device_param->use_opencl12 = false; + device_param->use_opencl20 = false; + device_param->use_opencl21 = false; - hcfree (build_log); - } + // device_name - if (rc_nvrtcCompileProgram == -1) - { - event_log_error (hashcat_ctx, "* Device #%u: Kernel %s build failed.", device_param->device_id + 1, source_file); + char *device_name = (char *) hcmalloc (HCBUFSIZ_TINY); - return false; - } + if (hc_cuDeviceGetName (hashcat_ctx, device_name, HCBUFSIZ_TINY, cuda_device) == -1) return -1; - hcfree (nvrtc_options); - hcfree (nvrtc_options_string); + device_param->device_name = device_name; - size_t binary_size = 0; + hc_string_trim_leading (device_name); - if (hc_nvrtcGetPTXSize (hashcat_ctx, program, &binary_size) == -1) return false; + hc_string_trim_trailing (device_name); - char *binary = (char *) hcmalloc (binary_size); + // device_processors - if (hc_nvrtcGetPTX (hashcat_ctx, program, binary) == -1) return false; + int device_processors = 0; - if (hc_nvrtcDestroyProgram (hashcat_ctx, &program) == -1) return false; + if (hc_cuDeviceGetAttribute (hashcat_ctx, &device_processors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, cuda_device) == -1) return -1; - #define LOG_SIZE 8192 + device_param->device_processors = device_processors; - char *mod_info_log = (char *) hcmalloc (LOG_SIZE + 1); - char *mod_error_log = (char *) hcmalloc (LOG_SIZE + 1); + // device_global_mem, device_maxmem_alloc, device_available_mem - int mod_cnt = 6; + size_t bytes = 0; - CUjit_option mod_opts[7]; - void *mod_vals[7]; + if (hc_cuDeviceTotalMem (hashcat_ctx, &bytes, cuda_device) == -1) return -1; - mod_opts[0] = CU_JIT_TARGET_FROM_CUCONTEXT; - mod_vals[0] = (void *) 0; + device_param->device_global_mem = (u64) bytes; - mod_opts[1] = CU_JIT_LOG_VERBOSE; - mod_vals[1] = (void *) 1; + device_param->device_maxmem_alloc = (u64) bytes; - mod_opts[2] = CU_JIT_INFO_LOG_BUFFER; - mod_vals[2] = (void *) mod_info_log; + device_param->device_available_mem = 0; - mod_opts[3] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; - 
mod_vals[3] = (void *) LOG_SIZE; + // warp size - mod_opts[4] = CU_JIT_ERROR_LOG_BUFFER; - mod_vals[4] = (void *) mod_error_log; + int cuda_warp_size = 0; - mod_opts[5] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES; - mod_vals[5] = (void *) LOG_SIZE; + if (hc_cuDeviceGetAttribute (hashcat_ctx, &cuda_warp_size, CU_DEVICE_ATTRIBUTE_WARP_SIZE, cuda_device) == -1) return -1; - if (hashconfig->opti_type & OPTI_TYPE_REGISTER_LIMIT) - { - mod_opts[6] = CU_JIT_MAX_REGISTERS; - mod_vals[6] = (void *) 128; + device_param->cuda_warp_size = cuda_warp_size; - mod_cnt++; - } + // sm_minor, sm_major - #if defined (WITH_CUBIN) + int sm_major = 0; + int sm_minor = 0; - char *jit_info_log = (char *) hcmalloc (LOG_SIZE + 1); - char *jit_error_log = (char *) hcmalloc (LOG_SIZE + 1); + if (hc_cuDeviceGetAttribute (hashcat_ctx, &sm_major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuda_device) == -1) return -1; - int jit_cnt = 6; + if (hc_cuDeviceGetAttribute (hashcat_ctx, &sm_minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuda_device) == -1) return -1; - CUjit_option jit_opts[7]; - void *jit_vals[7]; + device_param->sm_major = sm_major; + device_param->sm_minor = sm_minor; - jit_opts[0] = CU_JIT_TARGET_FROM_CUCONTEXT; - jit_vals[0] = (void *) 0; + // device_maxworkgroup_size - jit_opts[1] = CU_JIT_LOG_VERBOSE; - jit_vals[1] = (void *) 1; + int device_maxworkgroup_size = 0; - jit_opts[2] = CU_JIT_INFO_LOG_BUFFER; - jit_vals[2] = (void *) jit_info_log; + if (hc_cuDeviceGetAttribute (hashcat_ctx, &device_maxworkgroup_size, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuda_device) == -1) return -1; - jit_opts[3] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; - jit_vals[3] = (void *) LOG_SIZE; + device_param->device_maxworkgroup_size = device_maxworkgroup_size; - jit_opts[4] = CU_JIT_ERROR_LOG_BUFFER; - jit_vals[4] = (void *) jit_error_log; + // max_clock_frequency - jit_opts[5] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES; - jit_vals[5] = (void *) LOG_SIZE; + int device_maxclock_frequency = 0; - if (hashconfig->opti_type & OPTI_TYPE_REGISTER_LIMIT) - { - jit_opts[6] = CU_JIT_MAX_REGISTERS; - jit_vals[6] = (void *) 128; + if (hc_cuDeviceGetAttribute (hashcat_ctx, &device_maxclock_frequency, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, cuda_device) == -1) return -1; - jit_cnt++; - } + device_param->device_maxclock_frequency = device_maxclock_frequency / 1000; - CUlinkState state; + // pcie_bus, pcie_device, pcie_function - if (hc_cuLinkCreate (hashcat_ctx, jit_cnt, jit_opts, jit_vals, &state) == -1) - { - event_log_error (hashcat_ctx, "* Device #%u: Kernel %s link failed. Error Log:", device_param->device_id + 1, source_file); - event_log_error (hashcat_ctx, "%s", jit_error_log); - event_log_error (hashcat_ctx, NULL); + int pci_domain_id_nv = 0; + int pci_bus_id_nv = 0; + int pci_slot_id_nv = 0; - return false; - } + if (hc_cuDeviceGetAttribute (hashcat_ctx, &pci_domain_id_nv, CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, cuda_device) == -1) return -1; - if (hc_cuLinkAddData (hashcat_ctx, state, CU_JIT_INPUT_PTX, binary, binary_size, kernel_name, 0, NULL, NULL) == -1) - { - event_log_error (hashcat_ctx, "* Device #%u: Kernel %s link failed. 
Error Log:", device_param->device_id + 1, source_file); - event_log_error (hashcat_ctx, "%s", jit_error_log); - event_log_error (hashcat_ctx, NULL); + if (hc_cuDeviceGetAttribute (hashcat_ctx, &pci_bus_id_nv, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, cuda_device) == -1) return -1; - return false; - } + if (hc_cuDeviceGetAttribute (hashcat_ctx, &pci_slot_id_nv, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, cuda_device) == -1) return -1; - void *cubin = NULL; + device_param->pcie_domain = (u8) (pci_domain_id_nv); + device_param->pcie_bus = (u8) (pci_bus_id_nv); + device_param->pcie_device = (u8) (pci_slot_id_nv >> 3); + device_param->pcie_function = (u8) (pci_slot_id_nv & 7); - size_t cubin_size = 0; + // kernel_exec_timeout - if (hc_cuLinkComplete (hashcat_ctx, state, &cubin, &cubin_size) == -1) - { - event_log_error (hashcat_ctx, "* Device #%u: Kernel %s link failed. Error Log:", device_param->device_id + 1, source_file); - event_log_error (hashcat_ctx, "%s", jit_error_log); - event_log_error (hashcat_ctx, NULL); + int kernel_exec_timeout = 0; - return false; - } + if (hc_cuDeviceGetAttribute (hashcat_ctx, &kernel_exec_timeout, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, cuda_device) == -1) return -1; - #if defined (DEBUG) - event_log_info (hashcat_ctx, "* Device #%u: Kernel %s link successful. Info Log:", device_param->device_id + 1, source_file); - event_log_info (hashcat_ctx, "%s", jit_info_log); - event_log_info (hashcat_ctx, NULL); - #endif + device_param->kernel_exec_timeout = kernel_exec_timeout; - if (hc_cuModuleLoadDataEx (hashcat_ctx, cuda_module, cubin, mod_cnt, mod_opts, mod_vals) == -1) - { - event_log_error (hashcat_ctx, "* Device #%u: Kernel %s load failed. Error Log:", device_param->device_id + 1, source_file); - event_log_error (hashcat_ctx, "%s", mod_error_log); - event_log_error (hashcat_ctx, NULL); + // max_shared_memory_per_block - return false; - } + int max_shared_memory_per_block = 0; - #if defined (DEBUG) - event_log_info (hashcat_ctx, "* Device #%u: Kernel %s load successful. Info Log:", device_param->device_id + 1, source_file); - event_log_info (hashcat_ctx, "%s", mod_info_log); - event_log_info (hashcat_ctx, NULL); - #endif + if (hc_cuDeviceGetAttribute (hashcat_ctx, &max_shared_memory_per_block, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, cuda_device) == -1) return -1; - if (cache_disable == false) + if (max_shared_memory_per_block < 32768) { - if (write_kernel_binary (hashcat_ctx, cached_file, cubin, cubin_size) == false) return false; + event_log_error (hashcat_ctx, "* Device #%u: This device's shared buffer size is too small.", device_id + 1); + + device_param->skipped = true; } - if (hc_cuLinkDestroy (hashcat_ctx, state) == -1) return false; + device_param->device_local_mem_size = max_shared_memory_per_block; - hcfree (jit_info_log); - hcfree (jit_error_log); + // device_max_constant_buffer_size - #else + int device_max_constant_buffer_size = 0; - if (hc_cuModuleLoadDataEx (hashcat_ctx, cuda_module, binary, mod_cnt, mod_opts, mod_vals) == -1) + if (hc_cuDeviceGetAttribute (hashcat_ctx, &device_max_constant_buffer_size, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, cuda_device) == -1) return -1; + + if (device_max_constant_buffer_size < 65536) { - event_log_error (hashcat_ctx, "* Device #%u: Kernel %s load failed. 
Error Log:", device_param->device_id + 1, source_file); - event_log_error (hashcat_ctx, "%s", mod_error_log); - event_log_error (hashcat_ctx, NULL); + event_log_error (hashcat_ctx, "* Device #%u: This device's local mem size is too small.", device_id + 1); - return false; + device_param->skipped = true; } - #if defined (DEBUG) - event_log_info (hashcat_ctx, "* Device #%u: Kernel %s load successful. Info Log:", device_param->device_id + 1, source_file); - event_log_info (hashcat_ctx, "%s", mod_info_log); - event_log_info (hashcat_ctx, NULL); - #endif + // some attributes have to be hardcoded because they are used for instance in the build options - if (cache_disable == false) - { - if (write_kernel_binary (hashcat_ctx, cached_file, binary, binary_size) == false) return false; - } + device_param->device_local_mem_type = CL_LOCAL; + device_param->opencl_device_type = CL_DEVICE_TYPE_GPU; + device_param->opencl_device_vendor_id = VENDOR_ID_NV; + device_param->opencl_platform_vendor_id = VENDOR_ID_NV; - #endif + // or in the cached kernel checksum - hcfree (mod_info_log); - hcfree (mod_error_log); - - hcfree (binary); - } - - if (device_param->is_opencl == true) - { - if (hc_clCreateProgramWithSource (hashcat_ctx, device_param->opencl_context, 1, (const char **) kernel_sources, NULL, opencl_program) == -1) return false; - - const int CL_rc = hc_clBuildProgram (hashcat_ctx, *opencl_program, 1, &device_param->opencl_device, build_options_buf, NULL, NULL); + device_param->opencl_device_version = ""; + device_param->opencl_driver_version = ""; - //if (CL_rc == -1) return -1; + // or just to make sure they are not NULL - size_t build_log_size = 0; + device_param->opencl_device_vendor = ""; + device_param->opencl_device_c_version = ""; - hc_clGetProgramBuildInfo (hashcat_ctx, *opencl_program, device_param->opencl_device, CL_PROGRAM_BUILD_LOG, 0, NULL, &build_log_size); + // skipped - //if (CL_rc == -1) return -1; + if ((backend_ctx->backend_devices_filter & (1ULL << device_id)) == 0) + { + device_param->skipped = true; + } - #if defined (DEBUG) - if ((build_log_size > 1) || (CL_rc == -1)) - #else - if (CL_rc == -1) - #endif + if ((backend_ctx->opencl_device_types_filter & CL_DEVICE_TYPE_GPU) == 0) { - char *build_log = (char *) hcmalloc (build_log_size + 1); + device_param->skipped = true; + } - const int rc_clGetProgramBuildInfo = hc_clGetProgramBuildInfo (hashcat_ctx, *opencl_program, device_param->opencl_device, CL_PROGRAM_BUILD_LOG, build_log_size, build_log, NULL); + if ((device_param->opencl_platform_vendor_id == VENDOR_ID_NV) && (device_param->opencl_device_vendor_id == VENDOR_ID_NV)) + { + need_nvml = true; - if (rc_clGetProgramBuildInfo == -1) return false; + #if defined (_WIN) || defined (__CYGWIN__) + need_nvapi = true; + #endif + } - puts (build_log); + // CPU burning loop damper + // Value is given as number between 0-100 + // By default 8% + // in theory not needed with CUDA - hcfree (build_log); - } + device_param->spin_damp = (double) user_options->spin_damp / 100; - if (CL_rc == -1) return false; + // common driver check - if (cache_disable == false) + if (device_param->skipped == false) { - size_t binary_size; - - if (hc_clGetProgramInfo (hashcat_ctx, *opencl_program, CL_PROGRAM_BINARY_SIZES, sizeof (size_t), &binary_size, NULL) == -1) return false; + if ((user_options->force == false) && (user_options->backend_info == false)) + { + // CUDA does not support query nvidia driver version, therefore no driver checks here + // IF needed, could be retrieved using 
nvmlSystemGetDriverVersion() - char *binary = (char *) hcmalloc (binary_size); + if (device_param->sm_major < 5) + { + if (user_options->quiet == false) event_log_warning (hashcat_ctx, "* Device #%u: This hardware has outdated CUDA compute capability (%u.%u).", device_id + 1, device_param->sm_major, device_param->sm_minor); + if (user_options->quiet == false) event_log_warning (hashcat_ctx, " For modern OpenCL performance, upgrade to hardware that supports"); + if (user_options->quiet == false) event_log_warning (hashcat_ctx, " CUDA compute capability version 5.0 (Maxwell) or higher."); + } - if (hc_clGetProgramInfo (hashcat_ctx, *opencl_program, CL_PROGRAM_BINARIES, sizeof (char *), &binary, NULL) == -1) return false; + if (device_param->kernel_exec_timeout != 0) + { + if (user_options->quiet == false) event_log_warning (hashcat_ctx, "* Device #%u: WARNING! Kernel exec timeout is not disabled.", device_id + 1); + if (user_options->quiet == false) event_log_warning (hashcat_ctx, " This may cause \"CL_OUT_OF_RESOURCES\" or related errors."); + if (user_options->quiet == false) event_log_warning (hashcat_ctx, " To disable the timeout, see: https://hashcat.net/q/timeoutpatch"); + } + } - if (write_kernel_binary (hashcat_ctx, cached_file, binary, binary_size) == false) return false; + /** + * activate device + */ - hcfree (binary); + cuda_devices_active++; } - } - } - else - { - if (read_kernel_binary (hashcat_ctx, cached_file, kernel_lengths, kernel_sources) == false) return false; - - if (device_param->is_cuda == true) - { - #define LOG_SIZE 8192 - - char *mod_info_log = (char *) hcmalloc (LOG_SIZE + 1); - char *mod_error_log = (char *) hcmalloc (LOG_SIZE + 1); - - int mod_cnt = 6; - CUjit_option mod_opts[7]; - void *mod_vals[7]; + CUcontext cuda_context; - mod_opts[0] = CU_JIT_TARGET_FROM_CUCONTEXT; - mod_vals[0] = (void *) 0; + if (hc_cuCtxCreate (hashcat_ctx, &cuda_context, CU_CTX_SCHED_BLOCKING_SYNC, device_param->cuda_device) == -1) return -1; - mod_opts[1] = CU_JIT_LOG_VERBOSE; - mod_vals[1] = (void *) 1; + if (hc_cuCtxSetCurrent (hashcat_ctx, cuda_context) == -1) return -1; - mod_opts[2] = CU_JIT_INFO_LOG_BUFFER; - mod_vals[2] = (void *) mod_info_log; + // bcrypt optimization? + //const int rc_cuCtxSetCacheConfig = hc_cuCtxSetCacheConfig (hashcat_ctx, CU_FUNC_CACHE_PREFER_SHARED); + // + //if (rc_cuCtxSetCacheConfig == -1) return -1; - mod_opts[3] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; - mod_vals[3] = (void *) LOG_SIZE; + const int sm = (device_param->sm_major * 10) + device_param->sm_minor; - mod_opts[4] = CU_JIT_ERROR_LOG_BUFFER; - mod_vals[4] = (void *) mod_error_log; + device_param->has_add = (sm >= 12) ? true : false; + device_param->has_addc = (sm >= 12) ? true : false; + device_param->has_sub = (sm >= 12) ? true : false; + device_param->has_subc = (sm >= 12) ? true : false; + device_param->has_bfe = (sm >= 20) ? true : false; + device_param->has_lop3 = (sm >= 50) ? true : false; + device_param->has_mov64 = (sm >= 10) ? true : false; + device_param->has_prmt = (sm >= 20) ? 
true : false; - mod_opts[5] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES; - mod_vals[5] = (void *) LOG_SIZE; + /* + #define RUN_INSTRUCTION_CHECKS() \ + device_param->has_add = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"add.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }"); \ + device_param->has_addc = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"addc.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }"); \ + device_param->has_sub = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"sub.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }"); \ + device_param->has_subc = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"subc.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }"); \ + device_param->has_bfe = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"bfe.u32 %0, 0, 0, 0;\" : \"=r\"(r)); }"); \ + device_param->has_lop3 = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"lop3.b32 %0, 0, 0, 0, 0;\" : \"=r\"(r)); }"); \ + device_param->has_mov64 = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned long long r; unsigned int a; unsigned int b; asm volatile (\"mov.b64 %0, {%1, %2};\" : \"=l\"(r) : \"r\"(a), \"r\"(b)); }"); \ + device_param->has_prmt = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"prmt.b32 %0, 0, 0, 0;\" : \"=r\"(r)); }"); \ - if (hashconfig->opti_type & OPTI_TYPE_REGISTER_LIMIT) + if (backend_devices_idx > 0) { - mod_opts[6] = CU_JIT_MAX_REGISTERS; - mod_vals[6] = (void *) 128; + hc_device_param_t *device_param_prev = &devices_param[backend_devices_idx - 1]; - mod_cnt++; + if (is_same_device_type (device_param, device_param_prev) == true) + { + device_param->has_add = device_param_prev->has_add; + device_param->has_addc = device_param_prev->has_addc; + device_param->has_sub = device_param_prev->has_sub; + device_param->has_subc = device_param_prev->has_subc; + device_param->has_bfe = device_param_prev->has_bfe; + device_param->has_lop3 = device_param_prev->has_lop3; + device_param->has_mov64 = device_param_prev->has_mov64; + device_param->has_prmt = device_param_prev->has_prmt; + } + else + { + RUN_INSTRUCTION_CHECKS(); + } } - - if (hc_cuModuleLoadDataEx (hashcat_ctx, cuda_module, kernel_sources[0], mod_cnt, mod_opts, mod_vals) == -1) + else { - event_log_error (hashcat_ctx, "* Device #%u: Kernel %s load failed. Error Log:", device_param->device_id + 1, source_file); - event_log_error (hashcat_ctx, "%s", mod_error_log); - event_log_error (hashcat_ctx, NULL); - - return false; + RUN_INSTRUCTION_CHECKS(); } - #if defined (DEBUG) - event_log_info (hashcat_ctx, "* Device #%u: Kernel %s load successful. 
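The eight has_* flags above are later baked into the kernel build options (the HAS_ADD/HAS_LOP3/... -D defines further down), so they are derived once per device from the compute capability. A minimal sketch of that derivation with the same thresholds as the patch; the struct and function names are illustrative, not hashcat's:

    #include <stdbool.h>
    #include <stdio.h>

    typedef struct
    {
      bool has_add, has_addc, has_sub, has_subc;
      bool has_bfe, has_lop3, has_mov64, has_prmt;
    } isa_flags_t;

    static isa_flags_t isa_flags_from_sm (const int sm_major, const int sm_minor)
    {
      const int sm = (sm_major * 10) + sm_minor; // e.g. cc 7.5 -> 75

      isa_flags_t f;

      f.has_add   = (sm >= 12); // add.cc.u32
      f.has_addc  = (sm >= 12); // addc.cc.u32
      f.has_sub   = (sm >= 12); // sub.cc.u32
      f.has_subc  = (sm >= 12); // subc.cc.u32
      f.has_bfe   = (sm >= 20); // bfe.u32
      f.has_lop3  = (sm >= 50); // lop3.b32
      f.has_mov64 = (sm >= 10); // mov.b64
      f.has_prmt  = (sm >= 20); // prmt.b32

      return f;
    }

    int main (void)
    {
      const isa_flags_t f = isa_flags_from_sm (7, 5);

      printf ("cc 7.5: lop3=%d prmt=%d\n", f.has_lop3, f.has_prmt);

      return 0;
    }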
Info Log:", device_param->device_id + 1, source_file); - event_log_info (hashcat_ctx, "%s", mod_info_log); - event_log_info (hashcat_ctx, NULL); - #endif + #undef RUN_INSTRUCTION_CHECKS + */ - hcfree (mod_info_log); - hcfree (mod_error_log); - } + // device_available_mem - if (device_param->is_opencl == true) - { - if (hc_clCreateProgramWithBinary (hashcat_ctx, device_param->opencl_context, 1, &device_param->opencl_device, kernel_lengths, (const unsigned char **) kernel_sources, NULL, opencl_program) == -1) return false; + size_t free = 0; + size_t total = 0; - if (hc_clBuildProgram (hashcat_ctx, *opencl_program, 1, &device_param->opencl_device, build_options_buf, NULL, NULL) == -1) return false; + if (hc_cuMemGetInfo (hashcat_ctx, &free, &total) == -1) return -1; + + device_param->device_available_mem = (u64) free; + + if (hc_cuCtxDestroy (hashcat_ctx, cuda_context) == -1) return -1; } } - hcfree (kernel_sources[0]); + backend_ctx->cuda_devices_cnt = cuda_devices_cnt; + backend_ctx->cuda_devices_active = cuda_devices_active; - return true; -} + /* + * HIP + */ -int backend_session_begin (hashcat_ctx_t *hashcat_ctx) -{ - const bitmap_ctx_t *bitmap_ctx = hashcat_ctx->bitmap_ctx; - const folder_config_t *folder_config = hashcat_ctx->folder_config; - const hashconfig_t *hashconfig = hashcat_ctx->hashconfig; - const hashes_t *hashes = hashcat_ctx->hashes; - const module_ctx_t *module_ctx = hashcat_ctx->module_ctx; - backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - const straight_ctx_t *straight_ctx = hashcat_ctx->straight_ctx; - const user_options_extra_t *user_options_extra = hashcat_ctx->user_options_extra; - const user_options_t *user_options = hashcat_ctx->user_options; + int hip_devices_cnt = 0; + int hip_devices_active = 0; + if (backend_ctx->hip) + { + // device count - if (backend_ctx->enabled == false) return 0; + if (hc_hipDeviceGetCount (hashcat_ctx, &hip_devices_cnt) == -1) + { + hip_close (hashcat_ctx); + } - u64 size_total_host_all = 0; + backend_ctx->hip_devices_cnt = hip_devices_cnt; - u32 hardware_power_all = 0; + // device specific - for (int backend_devices_idx = 0; backend_devices_idx < backend_ctx->backend_devices_cnt; backend_devices_idx++) - { - /** - * host buffer - */ + for (int hip_devices_idx = 0; hip_devices_idx < hip_devices_cnt; hip_devices_idx++, backend_devices_idx++) + { + const u32 device_id = backend_devices_idx; - hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx]; + hc_device_param_t *device_param = &devices_param[backend_devices_idx]; - if (device_param->skipped == true) continue; + device_param->device_id = device_id; - EVENT_DATA (EVENT_BACKEND_DEVICE_INIT_PRE, &backend_devices_idx, sizeof (int)); + backend_ctx->backend_device_from_hip[hip_devices_idx] = backend_devices_idx; - const int device_id = device_param->device_id; + HIPdevice hip_device; - /** - * module depending checks - */ + if (hc_hipDeviceGet (hashcat_ctx, &hip_device, hip_devices_idx) == -1) return -1; - device_param->skipped_warning = false; + device_param->hip_device = hip_device; - if (module_ctx->module_unstable_warning != MODULE_DEFAULT) - { - const bool unstable_warning = module_ctx->module_unstable_warning (hashconfig, user_options, user_options_extra, device_param); + device_param->is_hip = true; - if ((unstable_warning == true) && (user_options->force == false)) - { - event_log_warning (hashcat_ctx, "* Device #%u: Skipping hash-mode %u - known CUDA/OpenCL Runtime/Driver issue (not a hashcat issue)", device_id + 1, hashconfig->hash_mode); - 
event_log_warning (hashcat_ctx, " You can use --force to override, but do not report related errors."); + device_param->is_opencl = false; - device_param->skipped_warning = true; + device_param->use_opencl12 = false; + device_param->use_opencl20 = false; + device_param->use_opencl21 = false; - continue; - } - } + // device_name - // vector_width + char *device_name = (char *) hcmalloc (HCBUFSIZ_TINY); - int vector_width = 0; + if (hc_hipDeviceGetName (hashcat_ctx, device_name, HCBUFSIZ_TINY, hip_device) == -1) return -1; - if (user_options->backend_vector_width_chgd == false) - { - // tuning db + device_param->device_name = device_name; - tuning_db_entry_t *tuningdb_entry; + hc_string_trim_leading (device_name); - if (user_options->slow_candidates == true) - { - tuningdb_entry = tuning_db_search (hashcat_ctx, device_param->device_name, device_param->opencl_device_type, 0, hashconfig->hash_mode); - } - else - { - tuningdb_entry = tuning_db_search (hashcat_ctx, device_param->device_name, device_param->opencl_device_type, user_options->attack_mode, hashconfig->hash_mode); - } + hc_string_trim_trailing (device_name); - if (tuningdb_entry == NULL || tuningdb_entry->vector_width == -1) - { - if (hashconfig->opti_type & OPTI_TYPE_USES_BITS_64) - { - if (device_param->is_cuda == true) - { - // cuda does not support this query + // device_processors - vector_width = 1; - } + int device_processors = 0; - if (device_param->is_opencl == true) - { - if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG, sizeof (vector_width), &vector_width, NULL) == -1) return -1; - } - } - else - { - if (device_param->is_cuda == true) - { - // cuda does not support this query + if (hc_hipDeviceGetAttribute (hashcat_ctx, &device_processors, HIP_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, hip_device) == -1) return -1; - vector_width = 1; - } + device_param->device_processors = device_processors; - if (device_param->is_opencl == true) - { - if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_NATIVE_VECTOR_WIDTH_INT, sizeof (vector_width), &vector_width, NULL) == -1) return -1; - } - } - } - else - { - vector_width = (cl_uint) tuningdb_entry->vector_width; - } - } - else - { - vector_width = user_options->backend_vector_width; - } + // device_global_mem, device_maxmem_alloc, device_available_mem - // We can't have SIMD in kernels where we have an unknown final password length - // It also turns out that pure kernels (that have a higher register pressure) - // actually run faster on scalar GPU (like 1080) without SIMD + size_t bytes = 0; - if ((hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL) == 0) - { - if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) - { - vector_width = 1; - } - } + if (hc_hipDeviceTotalMem (hashcat_ctx, &bytes, hip_device) == -1) return -1; - if (vector_width > 16) vector_width = 16; + device_param->device_global_mem = (u64) bytes; - device_param->vector_width = vector_width; + device_param->device_maxmem_alloc = (u64) bytes; - /** - * kernel accel and loops tuning db adjustment - */ + device_param->device_available_mem = 0; - device_param->kernel_accel_min = hashconfig->kernel_accel_min; - device_param->kernel_accel_max = hashconfig->kernel_accel_max; - device_param->kernel_loops_min = hashconfig->kernel_loops_min; - device_param->kernel_loops_max = hashconfig->kernel_loops_max; - device_param->kernel_threads_min = hashconfig->kernel_threads_min; - device_param->kernel_threads_max = hashconfig->kernel_threads_max; + // warp 
size - tuning_db_entry_t *tuningdb_entry = NULL; + int hip_warp_size = 0; - if (user_options->slow_candidates == true) - { - tuningdb_entry = tuning_db_search (hashcat_ctx, device_param->device_name, device_param->opencl_device_type, 0, hashconfig->hash_mode); - } - else - { - tuningdb_entry = tuning_db_search (hashcat_ctx, device_param->device_name, device_param->opencl_device_type, user_options->attack_mode, hashconfig->hash_mode); - } + if (hc_hipDeviceGetAttribute (hashcat_ctx, &hip_warp_size, HIP_DEVICE_ATTRIBUTE_WARP_SIZE, hip_device) == -1) return -1; - // user commandline option override tuning db - // but both have to stay inside the boundaries of the module + device_param->hip_warp_size = hip_warp_size; - if (user_options->kernel_accel_chgd == true) - { - const u32 _kernel_accel = user_options->kernel_accel; + // sm_minor, sm_major - if ((_kernel_accel >= device_param->kernel_accel_min) && (_kernel_accel <= device_param->kernel_accel_max)) - { - device_param->kernel_accel_min = _kernel_accel; - device_param->kernel_accel_max = _kernel_accel; - } - } - else - { - if (tuningdb_entry != NULL) - { - const u32 _kernel_accel = tuningdb_entry->kernel_accel; + int sm_major = 0; + int sm_minor = 0; - if (_kernel_accel) - { - if ((_kernel_accel >= device_param->kernel_accel_min) && (_kernel_accel <= device_param->kernel_accel_max)) - { - device_param->kernel_accel_min = _kernel_accel; - device_param->kernel_accel_max = _kernel_accel; - } - } - } - } + if (hc_hipDeviceGetAttribute (hashcat_ctx, &sm_major, HIP_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, hip_device) == -1) return -1; - if (user_options->kernel_loops_chgd == true) - { - const u32 _kernel_loops = user_options->kernel_loops; + if (hc_hipDeviceGetAttribute (hashcat_ctx, &sm_minor, HIP_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, hip_device) == -1) return -1; - if ((_kernel_loops >= device_param->kernel_loops_min) && (_kernel_loops <= device_param->kernel_loops_max)) - { - device_param->kernel_loops_min = _kernel_loops; - device_param->kernel_loops_max = _kernel_loops; - } - } - else - { - if (tuningdb_entry != NULL) - { - u32 _kernel_loops = tuningdb_entry->kernel_loops; + device_param->sm_major = sm_major; + device_param->sm_minor = sm_minor; - if (_kernel_loops) - { - if (user_options->workload_profile == 1) - { - _kernel_loops = (_kernel_loops > 8) ? _kernel_loops / 8 : 1; - } - else if (user_options->workload_profile == 2) - { - _kernel_loops = (_kernel_loops > 4) ? 
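The HIP enumeration mirrors the CUDA one attribute for attribute, but through the new hc_hip* wrappers from ext_hip.h. For reference, the same probe written against ROCm's public HIP API looks roughly like this; the hipDeviceAttribute* enum names are the stock ones and have shifted between ROCm releases, so treat them as an assumption:

    #include <hip/hip_runtime_api.h>
    #include <stdio.h>

    int main (void)
    {
      int cnt = 0;

      if (hipGetDeviceCount (&cnt) != hipSuccess) return 1;

      for (int idx = 0; idx < cnt; idx++)
      {
        hipDevice_t dev;

        if (hipDeviceGet (&dev, idx) != hipSuccess) continue;

        char name[256] = { 0 };

        hipDeviceGetName (name, (int) sizeof (name), dev);

        int mp = 0, warp = 0, cc_major = 0, cc_minor = 0;

        hipDeviceGetAttribute (&mp,       hipDeviceAttributeMultiprocessorCount,    idx);
        hipDeviceGetAttribute (&warp,     hipDeviceAttributeWarpSize,               idx);
        hipDeviceGetAttribute (&cc_major, hipDeviceAttributeComputeCapabilityMajor, idx);
        hipDeviceGetAttribute (&cc_minor, hipDeviceAttributeComputeCapabilityMinor, idx);

        printf ("#%d %s: %d CUs, warp size %d, cc %d.%d\n", idx, name, mp, warp, cc_major, cc_minor);
      }

      return 0;
    }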
_kernel_loops / 4 : 1; - } + // device_maxworkgroup_size - if ((_kernel_loops >= device_param->kernel_loops_min) && (_kernel_loops <= device_param->kernel_loops_max)) - { - device_param->kernel_loops_min = _kernel_loops; - device_param->kernel_loops_max = _kernel_loops; - } - } - } - } + int device_maxworkgroup_size = 0; - // there's no thread column in tuning db, stick to commandline if defined + if (hc_hipDeviceGetAttribute (hashcat_ctx, &device_maxworkgroup_size, HIP_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, hip_device) == -1) return -1; - if (user_options->kernel_threads_chgd == true) - { - const u32 _kernel_threads = user_options->kernel_threads; + device_param->device_maxworkgroup_size = device_maxworkgroup_size; - if ((_kernel_threads >= device_param->kernel_threads_min) && (_kernel_threads <= device_param->kernel_threads_max)) - { - device_param->kernel_threads_min = _kernel_threads; - device_param->kernel_threads_max = _kernel_threads; - } - } + // max_clock_frequency - if (user_options->slow_candidates == true) - { - } - else - { - // we have some absolute limits for fast hashes (because of limit constant memory), make sure not to overstep + int device_maxclock_frequency = 0; - if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL) - { - if (user_options_extra->attack_kern == ATTACK_KERN_STRAIGHT) - { - device_param->kernel_loops_min = MIN (device_param->kernel_loops_min, KERNEL_RULES); - device_param->kernel_loops_max = MIN (device_param->kernel_loops_max, KERNEL_RULES); - } - else if (user_options_extra->attack_kern == ATTACK_KERN_COMBI) - { - device_param->kernel_loops_min = MIN (device_param->kernel_loops_min, KERNEL_COMBS); - device_param->kernel_loops_max = MIN (device_param->kernel_loops_max, KERNEL_COMBS); - } - else if (user_options_extra->attack_kern == ATTACK_KERN_BF) - { - device_param->kernel_loops_min = MIN (device_param->kernel_loops_min, KERNEL_BFS); - device_param->kernel_loops_max = MIN (device_param->kernel_loops_max, KERNEL_BFS); - } - } - } + if (hc_hipDeviceGetAttribute (hashcat_ctx, &device_maxclock_frequency, HIP_DEVICE_ATTRIBUTE_CLOCK_RATE, hip_device) == -1) return -1; - device_param->kernel_loops_min_sav = device_param->kernel_loops_min; - device_param->kernel_loops_max_sav = device_param->kernel_loops_max; + device_param->device_maxclock_frequency = device_maxclock_frequency / 1000; - /** - * device properties - */ + // pcie_bus, pcie_device, pcie_function - const u32 device_processors = device_param->device_processors; + int pci_domain_id_nv = 0; + int pci_bus_id_nv = 0; + int pci_slot_id_nv = 0; - /** - * create context for each device - */ + if (hc_hipDeviceGetAttribute (hashcat_ctx, &pci_domain_id_nv, HIP_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, hip_device) == -1) return -1; - if (device_param->is_cuda == true) - { - if (hc_cuCtxCreate (hashcat_ctx, &device_param->cuda_context, CU_CTX_SCHED_BLOCKING_SYNC, device_param->cuda_device) == -1) return -1; - } + if (hc_hipDeviceGetAttribute (hashcat_ctx, &pci_bus_id_nv, HIP_DEVICE_ATTRIBUTE_PCI_BUS_ID, hip_device) == -1) return -1; - if (device_param->is_opencl == true) - { - /* - cl_context_properties properties[3]; + if (hc_hipDeviceGetAttribute (hashcat_ctx, &pci_slot_id_nv, HIP_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, hip_device) == -1) return -1; - properties[0] = CL_CONTEXT_PLATFORM; - properties[1] = (cl_context_properties) device_param->opencl_platform; - properties[2] = 0; + device_param->pcie_domain = (u8) (pci_domain_id_nv); + device_param->pcie_bus = (u8) (pci_bus_id_nv); + device_param->pcie_device = (u8) 
(pci_slot_id_nv >> 3); + device_param->pcie_function = (u8) (pci_slot_id_nv & 7); - CL_rc = hc_clCreateContext (hashcat_ctx, properties, 1, &device_param->opencl_device, NULL, NULL, &device_param->opencl_context); - */ + // kernel_exec_timeout - if (hc_clCreateContext (hashcat_ctx, NULL, 1, &device_param->opencl_device, NULL, NULL, &device_param->opencl_context) == -1) return -1; + int kernel_exec_timeout = 0; - /** - * create command-queue - */ + if (hc_hipDeviceGetAttribute (hashcat_ctx, &kernel_exec_timeout, HIP_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, hip_device) == -1) return -1; - // not supported with NV - // device_param->opencl_command_queue = hc_clCreateCommandQueueWithProperties (hashcat_ctx, device_param->opencl_device, NULL); + device_param->kernel_exec_timeout = kernel_exec_timeout; - if (hc_clCreateCommandQueue (hashcat_ctx, device_param->opencl_context, device_param->opencl_device, CL_QUEUE_PROFILING_ENABLE, &device_param->opencl_command_queue) == -1) return -1; - } + // max_shared_memory_per_block - /** - * create stream for CUDA devices - */ + int max_shared_memory_per_block = 0; - if (device_param->is_cuda == true) - { - if (hc_cuStreamCreate (hashcat_ctx, &device_param->cuda_stream, CU_STREAM_DEFAULT) == -1) return -1; - } + if (hc_hipDeviceGetAttribute (hashcat_ctx, &max_shared_memory_per_block, HIP_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, hip_device) == -1) return -1; - /** - * create events for CUDA devices - */ + if (max_shared_memory_per_block < 32768) + { + event_log_error (hashcat_ctx, "* Device #%u: This device's shared buffer size is too small.", device_id + 1); - if (device_param->is_cuda == true) - { - if (hc_cuEventCreate (hashcat_ctx, &device_param->cuda_event1, CU_EVENT_DEFAULT) == -1) return -1; + device_param->skipped = true; + } - if (hc_cuEventCreate (hashcat_ctx, &device_param->cuda_event2, CU_EVENT_DEFAULT) == -1) return -1; - } + device_param->device_local_mem_size = max_shared_memory_per_block; - /** - * create input buffers on device : calculate size of fixed memory buffers - */ + // device_max_constant_buffer_size - u64 size_root_css = SP_PW_MAX * sizeof (cs_t); - u64 size_markov_css = SP_PW_MAX * CHARSIZ * sizeof (cs_t); - - device_param->size_root_css = size_root_css; - device_param->size_markov_css = size_markov_css; - - u64 size_results = sizeof (u32); + int device_max_constant_buffer_size = 0; - device_param->size_results = size_results; + if (hc_hipDeviceGetAttribute (hashcat_ctx, &device_max_constant_buffer_size, HIP_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, hip_device) == -1) return -1; + device_max_constant_buffer_size = 65536; // hardcoded override: the queried value is discarded, so the check below can never trigger + if (device_max_constant_buffer_size < 65536) + { + event_log_error (hashcat_ctx, "* Device #%u: This device's constant buffer size is too small.", device_id + 1); - u64 size_rules = (u64) straight_ctx->kernel_rules_cnt * sizeof (kernel_rule_t); - u64 size_rules_c = (u64) KERNEL_RULES * sizeof (kernel_rule_t); + device_param->skipped = true; + } - device_param->size_rules = size_rules; - device_param->size_rules_c = size_rules_c; + // some attributes have to be hardcoded because they are used, for instance, in the build options - u64 size_plains = (u64) hashes->digests_cnt * sizeof (plain_t); - u64 size_salts = (u64) hashes->salts_cnt * sizeof (salt_t); - u64 size_esalts = (u64) hashes->digests_cnt * hashconfig->esalt_size; - u64 size_shown = (u64) hashes->digests_cnt * sizeof (u32); - u64 size_digests = (u64) hashes->digests_cnt * (u64) hashconfig->dgst_size; + device_param->device_local_mem_type = CL_LOCAL; +
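The shift/mask pair above unpacks a PCI "slot" byte into the device/function half of a BDF (bus:device.function) address: the low three bits are the function, the upper five the device. A worked example:

    #include <stdint.h>
    #include <stdio.h>

    int main (void)
    {
      const uint8_t slot = 0x0b; // example: binary 01011

      const uint8_t device   = slot >> 3; // upper 5 bits -> 1
      const uint8_t function = slot &  7; // lower 3 bits -> 3

      printf ("device %u, function %u\n", device, function);

      return 0;
    }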
device_param->opencl_device_type = CL_DEVICE_TYPE_GPU; + device_param->opencl_device_vendor_id = VENDOR_ID_NV; + device_param->opencl_platform_vendor_id = VENDOR_ID_NV; - device_param->size_plains = size_plains; - device_param->size_digests = size_digests; - device_param->size_shown = size_shown; - device_param->size_salts = size_salts; - device_param->size_esalts = size_esalts; + // or in the cached kernel checksum - u64 size_combs = KERNEL_COMBS * sizeof (pw_t); - u64 size_bfs = KERNEL_BFS * sizeof (bf_t); - u64 size_tm = 32 * sizeof (bs_word_t); + device_param->opencl_device_version = ""; + device_param->opencl_driver_version = ""; - device_param->size_bfs = size_bfs; - device_param->size_combs = size_combs; - device_param->size_tm = size_tm; + // or just to make sure they are not NULL - u64 size_st_digests = 1 * hashconfig->dgst_size; - u64 size_st_salts = 1 * sizeof (salt_t); - u64 size_st_esalts = 1 * hashconfig->esalt_size; + device_param->opencl_device_vendor = ""; + device_param->opencl_device_c_version = ""; - device_param->size_st_digests = size_st_digests; - device_param->size_st_salts = size_st_salts; - device_param->size_st_esalts = size_st_esalts; + // skipped - u64 size_extra_buffer = 4; + if ((backend_ctx->backend_devices_filter & (1ULL << device_id)) == 0) + { + device_param->skipped = true; + } - if (module_ctx->module_extra_buffer_size != MODULE_DEFAULT) - { - const u64 extra_buffer_size = module_ctx->module_extra_buffer_size (hashconfig, user_options, user_options_extra, hashes, device_param); + if ((backend_ctx->opencl_device_types_filter & CL_DEVICE_TYPE_GPU) == 0) + { + device_param->skipped = true; + } - if (extra_buffer_size == (u64) -1) + if ((device_param->opencl_platform_vendor_id == VENDOR_ID_NV) && (device_param->opencl_device_vendor_id == VENDOR_ID_NV)) { - event_log_error (hashcat_ctx, "Invalid extra buffer size."); + need_nvml = true; - return -1; + #if defined (_WIN) || defined (__CYGWIN__) + need_nvapi = true; + #endif } - device_param->extra_buffer_size = extra_buffer_size; - - size_extra_buffer = extra_buffer_size; - } + // CPU burning loop damper + // Value is given as number between 0-100 + // By default 8% + // in theory not needed with HIP - // kern type + device_param->spin_damp = (double) user_options->spin_damp / 100; - u32 kern_type = hashconfig->kern_type; + // common driver check - if (module_ctx->module_kern_type_dynamic != MODULE_DEFAULT) - { - if (user_options->benchmark == true) - { - } - else + if (device_param->skipped == false) { - void *digests_buf = hashes->digests_buf; - salt_t *salts_buf = hashes->salts_buf; - void *esalts_buf = hashes->esalts_buf; - void *hook_salts_buf = hashes->hook_salts_buf; - hashinfo_t **hash_info = hashes->hash_info; + if ((user_options->force == false) && (user_options->backend_info == false)) + { + if (device_param->sm_major < 5) + { + if (user_options->quiet == false) event_log_warning (hashcat_ctx, "* Device #%u: This hardware has outdated HIP compute capability (%u.%u).", device_id + 1, device_param->sm_major, device_param->sm_minor); + if (user_options->quiet == false) event_log_warning (hashcat_ctx, " For modern OpenCL performance, upgrade to hardware that supports"); + if (user_options->quiet == false) event_log_warning (hashcat_ctx, " HIP compute capability version 5.0 or higher."); + } - hashinfo_t *hash_info_ptr = NULL; + if (device_param->kernel_exec_timeout != 0) + { + if (user_options->quiet == false) event_log_warning (hashcat_ctx, "* Device #%u: WARNING!
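backend_devices_filter above is a 64-bit mask with bit N set when device N+1 was requested (hashcat's -d option), so an unset bit marks the device as skipped. A self-contained sketch of building and applying such a mask; the parser is an illustrative stand-in, not the code from user_options.c:

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    static uint64_t parse_devices_filter (const char *list)
    {
      uint64_t mask = 0;

      while (*list)
      {
        char *next = NULL;

        const long id = strtol (list, &next, 10); // 1-based device number

        if ((id >= 1) && (id <= 64)) mask |= 1ULL << (id - 1);

        if (*next != ',') break;

        list = next + 1;
      }

      return mask;
    }

    int main (void)
    {
      const uint64_t mask = parse_devices_filter ("1,3");

      for (int device_id = 0; device_id < 4; device_id++)
      {
        const int skipped = ((mask & (1ULL << device_id)) == 0);

        printf ("device #%d: %s\n", device_id + 1, skipped ? "skipped" : "active");
      }

      return 0;
    }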
Kernel exec timeout is not disabled.", device_id + 1); + if (user_options->quiet == false) event_log_warning (hashcat_ctx, " This may cause \"CL_OUT_OF_RESOURCES\" or related errors."); + if (user_options->quiet == false) event_log_warning (hashcat_ctx, " To disable the timeout, see: https://hashcat.net/q/timeoutpatch"); + } + } - if (hash_info) hash_info_ptr = hash_info[0]; + /** + * activate device + */ - kern_type = (u32) module_ctx->module_kern_type_dynamic (hashconfig, digests_buf, salts_buf, esalts_buf, hook_salts_buf, hash_info_ptr); + hip_devices_active++; } - } - // built options + HIPcontext hip_context; - const size_t build_options_sz = 4096; + if (hc_hipCtxCreate (hashcat_ctx, &hip_context, HIP_CTX_SCHED_BLOCKING_SYNC, device_param->hip_device) == -1) return -1; - char *build_options_buf = (char *) hcmalloc (build_options_sz); + if (hc_hipCtxSetCurrent (hashcat_ctx, hip_context) == -1) return -1; - int build_options_len = 0; + // bcrypt optimization? + //const int rc_hipCtxSetCacheConfig = hc_hipCtxSetCacheConfig (hashcat_ctx, HIP_FUNC_CACHE_PREFER_SHARED); + // + //if (rc_hipCtxSetCacheConfig == -1) return -1; - #if defined (_WIN) - build_options_len += snprintf (build_options_buf + build_options_len, build_options_sz - build_options_len, "-D KERNEL_STATIC -I OpenCL -I \"%s\" ", folder_config->cpath_real); - #else - build_options_len += snprintf (build_options_buf + build_options_len, build_options_sz - build_options_len, "-D KERNEL_STATIC -I OpenCL -I %s ", folder_config->cpath_real); - #endif + const int sm = (device_param->sm_major * 10) + device_param->sm_minor; - /* currently disabled, hangs NEO drivers since 20.09. - was required for NEO driver 20.08 to workaround the same issue! - we go with the latest version + device_param->has_add = (sm >= 12) ? true : false; + device_param->has_addc = (sm >= 12) ? false : false; + device_param->has_sub = (sm >= 12) ? true : false; + device_param->has_subc = (sm >= 12) ? false : false; + device_param->has_bfe = (sm >= 20) ? true : false; + device_param->has_lop3 = (sm >= 50) ? true : false; + device_param->has_mov64 = (sm >= 10) ? true : false; + device_param->has_prmt = (sm >= 20) ? 
true : false; - if (device_param->is_opencl == true) - { - if (device_param->use_opencl12 == true) - { - build_options_len += snprintf (build_options_buf + build_options_len, build_options_sz - build_options_len, "-cl-std=CL1.2 "); - } - else if (device_param->use_opencl20 == true) + /* + #define RUN_INSTRUCTION_CHECKS() \ + device_param->has_add = hip_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"add.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }"); \ + device_param->has_addc = hip_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"addc.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }"); \ + device_param->has_sub = hip_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"sub.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }"); \ + device_param->has_subc = hip_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"subc.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }"); \ + device_param->has_bfe = hip_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"bfe.u32 %0, 0, 0, 0;\" : \"=r\"(r)); }"); \ + device_param->has_lop3 = hip_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"lop3.b32 %0, 0, 0, 0, 0;\" : \"=r\"(r)); }"); \ + device_param->has_mov64 = hip_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned long long r; unsigned int a; unsigned int b; asm volatile (\"mov.b64 %0, {%1, %2};\" : \"=l\"(r) : \"r\"(a), \"r\"(b)); }"); \ + device_param->has_prmt = hip_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"prmt.b32 %0, 0, 0, 0;\" : \"=r\"(r)); }"); \ + + if (backend_devices_idx > 0) { - build_options_len += snprintf (build_options_buf + build_options_len, build_options_sz - build_options_len, "-cl-std=CL2.0 "); + hc_device_param_t *device_param_prev = &devices_param[backend_devices_idx - 1]; + + if (is_same_device_type (device_param, device_param_prev) == true) + { + device_param->has_add = device_param_prev->has_add; + device_param->has_addc = device_param_prev->has_addc; + device_param->has_sub = device_param_prev->has_sub; + device_param->has_subc = device_param_prev->has_subc; + device_param->has_bfe = device_param_prev->has_bfe; + device_param->has_lop3 = device_param_prev->has_lop3; + device_param->has_mov64 = device_param_prev->has_mov64; + device_param->has_prmt = device_param_prev->has_prmt; + } + else + { + RUN_INSTRUCTION_CHECKS(); + } } - else if (device_param->use_opencl21 == true) + else { - build_options_len += snprintf (build_options_buf + build_options_len, build_options_sz - build_options_len, "-cl-std=CL2.1 "); + RUN_INSTRUCTION_CHECKS(); } - } - */ - // we don't have sm_* on vendors not NV but it doesn't matter + #undef RUN_INSTRUCTION_CHECKS + */ - #if defined (DEBUG) - build_options_len += snprintf (build_options_buf + build_options_len, build_options_sz - build_options_len, "-D LOCAL_MEM_TYPE=%d -D VENDOR_ID=%u -D CUDA_ARCH=%u -D HAS_ADD=%u -D HAS_ADDC=%u -D HAS_SUB=%u -D HAS_SUBC=%u -D HAS_VADD=%u -D HAS_VADDC=%u -D HAS_VADD_CO=%u -D HAS_VADDC_CO=%u -D HAS_VSUB=%u -D HAS_VSUBB=%u -D HAS_VSUB_CO=%u -D HAS_VSUBB_CO=%u -D HAS_VPERM=%u -D HAS_VADD3=%u -D HAS_VBFE=%u -D HAS_BFE=%u -D HAS_LOP3=%u -D HAS_MOV64=%u -D HAS_PRMT=%u -D VECT_SIZE=%d -D DEVICE_TYPE=%u -D DGST_R0=%u -D 
DGST_R1=%u -D DGST_R2=%u -D DGST_R3=%u -D DGST_ELEM=%u -D KERN_TYPE=%u -D ATTACK_EXEC=%u -D ATTACK_KERN=%u ", device_param->device_local_mem_type, device_param->opencl_platform_vendor_id, (device_param->sm_major * 100) + (device_param->sm_minor * 10), device_param->has_add, device_param->has_addc, device_param->has_sub, device_param->has_subc, device_param->has_vadd, device_param->has_vaddc, device_param->has_vadd_co, device_param->has_vaddc_co, device_param->has_vsub, device_param->has_vsubb, device_param->has_vsub_co, device_param->has_vsubb_co, device_param->has_vperm, device_param->has_vadd3, device_param->has_vbfe, device_param->has_bfe, device_param->has_lop3, device_param->has_mov64, device_param->has_prmt, device_param->vector_width, (u32) device_param->opencl_device_type, hashconfig->dgst_pos0, hashconfig->dgst_pos1, hashconfig->dgst_pos2, hashconfig->dgst_pos3, hashconfig->dgst_size / 4, kern_type, hashconfig->attack_exec, user_options_extra->attack_kern); - #else - build_options_len += snprintf (build_options_buf + build_options_len, build_options_sz - build_options_len, "-D LOCAL_MEM_TYPE=%d -D VENDOR_ID=%u -D CUDA_ARCH=%u -D HAS_ADD=%u -D HAS_ADDC=%u -D HAS_SUB=%u -D HAS_SUBC=%u -D HAS_VADD=%u -D HAS_VADDC=%u -D HAS_VADD_CO=%u -D HAS_VADDC_CO=%u -D HAS_VSUB=%u -D HAS_VSUBB=%u -D HAS_VSUB_CO=%u -D HAS_VSUBB_CO=%u -D HAS_VPERM=%u -D HAS_VADD3=%u -D HAS_VBFE=%u -D HAS_BFE=%u -D HAS_LOP3=%u -D HAS_MOV64=%u -D HAS_PRMT=%u -D VECT_SIZE=%d -D DEVICE_TYPE=%u -D DGST_R0=%u -D DGST_R1=%u -D DGST_R2=%u -D DGST_R3=%u -D DGST_ELEM=%u -D KERN_TYPE=%u -D ATTACK_EXEC=%u -D ATTACK_KERN=%u -w ", device_param->device_local_mem_type, device_param->opencl_platform_vendor_id, (device_param->sm_major * 100) + (device_param->sm_minor * 10), device_param->has_add, device_param->has_addc, device_param->has_sub, device_param->has_subc, device_param->has_vadd, device_param->has_vaddc, device_param->has_vadd_co, device_param->has_vaddc_co, device_param->has_vsub, device_param->has_vsubb, device_param->has_vsub_co, device_param->has_vsubb_co, device_param->has_vperm, device_param->has_vadd3, device_param->has_vbfe, device_param->has_bfe, device_param->has_lop3, device_param->has_mov64, device_param->has_prmt, device_param->vector_width, (u32) device_param->opencl_device_type, hashconfig->dgst_pos0, hashconfig->dgst_pos1, hashconfig->dgst_pos2, hashconfig->dgst_pos3, hashconfig->dgst_size / 4, kern_type, hashconfig->attack_exec, user_options_extra->attack_kern); - #endif + // device_available_mem - build_options_buf[build_options_len] = 0; + size_t free = 0; + size_t total = 0; - /* - if (device_param->opencl_device_type & CL_DEVICE_TYPE_CPU) - { - if (device_param->opencl_platform_vendor_id == VENDOR_ID_INTEL_SDK) - { - strncat (build_options_buf, " -cl-opt-disable", 16); - } + if (hc_hipMemGetInfo (hashcat_ctx, &free, &total) == -1) return -1; + + device_param->device_available_mem = (u64) free; + + if (hc_hipCtxDestroy (hashcat_ctx, hip_context) == -1) return -1; } - */ + } - #if defined (DEBUG) - if (user_options->quiet == false) event_log_warning (hashcat_ctx, "* Device #%u: build_options '%s'", device_id + 1, build_options_buf); - #endif + backend_ctx->hip_devices_cnt = hip_devices_cnt; + backend_ctx->hip_devices_active = hip_devices_active; + + /* + * OpenCL + */ + int opencl_devices_cnt = 0; + int opencl_devices_active = 0; + if (backend_ctx->ocl) + { /** - * device_name_chksum + * OpenCL devices: simply push all devices from all platforms into the same device array */ - char *device_name_chksum = 
(char *) hcmalloc (HCBUFSIZ_TINY); - char *device_name_chksum_amp_mp = (char *) hcmalloc (HCBUFSIZ_TINY); + cl_uint opencl_platforms_cnt = backend_ctx->opencl_platforms_cnt; + cl_device_id **opencl_platforms_devices = backend_ctx->opencl_platforms_devices; + cl_uint *opencl_platforms_devices_cnt = backend_ctx->opencl_platforms_devices_cnt; + cl_uint *opencl_platforms_vendor_id = backend_ctx->opencl_platforms_vendor_id; + char **opencl_platforms_version = backend_ctx->opencl_platforms_version; - const size_t dnclen = snprintf (device_name_chksum, HCBUFSIZ_TINY, "%d-%d-%d-%u-%s-%s-%s-%d-%u", - backend_ctx->comptime, - backend_ctx->cuda_driver_version, - device_param->is_opencl, - device_param->opencl_platform_vendor_id, - device_param->device_name, - device_param->opencl_device_version, - device_param->opencl_driver_version, - device_param->vector_width, - hashconfig->kern_type); + for (u32 opencl_platforms_idx = 0; opencl_platforms_idx < opencl_platforms_cnt; opencl_platforms_idx++) + { + cl_device_id *opencl_platform_devices = opencl_platforms_devices[opencl_platforms_idx]; + cl_uint opencl_platform_devices_cnt = opencl_platforms_devices_cnt[opencl_platforms_idx]; + cl_uint opencl_platform_vendor_id = opencl_platforms_vendor_id[opencl_platforms_idx]; + char *opencl_platform_version = opencl_platforms_version[opencl_platforms_idx]; - const size_t dnclen_amp_mp = snprintf (device_name_chksum_amp_mp, HCBUFSIZ_TINY, "%d-%d-%d-%u-%s-%s-%s", - backend_ctx->comptime, - backend_ctx->cuda_driver_version, - device_param->is_opencl, - device_param->opencl_platform_vendor_id, - device_param->device_name, - device_param->opencl_device_version, - device_param->opencl_driver_version); + for (u32 opencl_platform_devices_idx = 0; opencl_platform_devices_idx < opencl_platform_devices_cnt; opencl_platform_devices_idx++, backend_devices_idx++, opencl_devices_cnt++) + { + const u32 device_id = backend_devices_idx; - md5_ctx_t md5_ctx; + hc_device_param_t *device_param = &devices_param[device_id]; - md5_init (&md5_ctx); - md5_update (&md5_ctx, (u32 *) device_name_chksum, dnclen); - md5_final (&md5_ctx); + device_param->device_id = device_id; - snprintf (device_name_chksum, HCBUFSIZ_TINY, "%08x", md5_ctx.h[0]); + backend_ctx->backend_device_from_opencl[opencl_devices_cnt] = backend_devices_idx; - md5_init (&md5_ctx); - md5_update (&md5_ctx, (u32 *) device_name_chksum_amp_mp, dnclen_amp_mp); - md5_final (&md5_ctx); + backend_ctx->backend_device_from_opencl_platform[opencl_platforms_idx][opencl_platform_devices_idx] = backend_devices_idx; - snprintf (device_name_chksum_amp_mp, HCBUFSIZ_TINY, "%08x", md5_ctx.h[0]); + device_param->opencl_platform_vendor_id = opencl_platform_vendor_id; - /** - * kernel cache - */ + device_param->opencl_device = opencl_platform_devices[opencl_platform_devices_idx]; - bool cache_disable = false; + //device_param->opencl_platform = opencl_platform; - // Seems to be completely broken on Apple + (Intel?) 
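From here on, all three backends share one flat devices_param[] array: CUDA devices first, then HIP, then every OpenCL device of every platform, while the backend_device_from_* tables remember where each API-local index landed. The bookkeeping in miniature, with made-up counts:

    #include <stdio.h>

    int main (void)
    {
      int backend_device_from_opencl[64];

      const int opencl_platforms_cnt           = 2;
      const int opencl_platform_devices_cnt[2] = { 1, 3 }; // example

      int backend_devices_idx = 3; // e.g. 2 CUDA + 1 HIP device already enumerated
      int opencl_devices_cnt  = 0;

      for (int p = 0; p < opencl_platforms_cnt; p++)
      {
        for (int d = 0; d < opencl_platform_devices_cnt[p]; d++, backend_devices_idx++, opencl_devices_cnt++)
        {
          backend_device_from_opencl[opencl_devices_cnt] = backend_devices_idx;
        }
      }

      printf ("%d OpenCL devices mapped to backend ids %d..%d\n", opencl_devices_cnt, backend_device_from_opencl[0], backend_device_from_opencl[opencl_devices_cnt - 1]);

      return 0;
    }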
CPU - // To reproduce set cache_disable to false and run benchmark -b + device_param->is_cuda = false; - if (device_param->opencl_platform_vendor_id == VENDOR_ID_APPLE) - { - if (device_param->opencl_device_type & CL_DEVICE_TYPE_CPU) - { - cache_disable = true; - } - } + device_param->is_opencl = true; - if (module_ctx->module_jit_cache_disable != MODULE_DEFAULT) - { - cache_disable = module_ctx->module_jit_cache_disable (hashconfig, user_options, user_options_extra, hashes, device_param); - } + // store opencl platform i - /** - * shared kernel with no hashconfig dependencies - */ + device_param->opencl_platform_id = opencl_platforms_idx; - { - /** - * kernel shared source filename - */ + // check OpenCL version - char source_file[256] = { 0 }; + device_param->use_opencl12 = false; + device_param->use_opencl20 = false; + device_param->use_opencl21 = false; - generate_source_kernel_shared_filename (folder_config->shared_dir, source_file); + int opencl_version_min = 0; + int opencl_version_maj = 0; - if (hc_path_read (source_file) == false) - { - event_log_error (hashcat_ctx, "%s: %s", source_file, strerror (errno)); + if (sscanf (opencl_platform_version, "OpenCL %d.%d", &opencl_version_min, &opencl_version_maj) == 2) + { + if ((opencl_version_min == 1) && (opencl_version_maj == 2)) + { + device_param->use_opencl12 = true; + } + else if ((opencl_version_min == 2) && (opencl_version_maj == 0)) + { + device_param->use_opencl20 = true; + } + else if ((opencl_version_min == 2) && (opencl_version_maj == 1)) + { + device_param->use_opencl21 = true; + } + } - return -1; - } + size_t param_value_size = 0; - /** - * kernel shared cached filename - */ + // opencl_device_type - char cached_file[256] = { 0 }; + cl_device_type opencl_device_type; - generate_cached_kernel_shared_filename (folder_config->profile_dir, device_name_chksum_amp_mp, cached_file); + if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_TYPE, sizeof (opencl_device_type), &opencl_device_type, NULL) == -1) return -1; - const bool rc_load_kernel = load_kernel (hashcat_ctx, device_param, "shared_kernel", source_file, cached_file, build_options_buf, cache_disable, &device_param->opencl_program_shared, &device_param->cuda_module_shared); + opencl_device_type &= ~CL_DEVICE_TYPE_DEFAULT; - if (rc_load_kernel == false) - { - event_log_error (hashcat_ctx, "* Device #%u: Kernel %s build failed.", device_param->device_id + 1, source_file); + device_param->opencl_device_type = opencl_device_type; - return -1; - } + // device_name - if (device_param->is_cuda == true) - { - // GPU memset + if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_NAME, 0, NULL, ¶m_value_size) == -1) return -1; - if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function_memset, device_param->cuda_module_shared, "gpu_memset") == -1) return -1; + char *device_name = (char *) hcmalloc (param_value_size); - if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function_memset, &device_param->kernel_wgs_memset) == -1) return -1; + if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_NAME, param_value_size, device_name, NULL) == -1) return -1; - if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function_memset, &device_param->kernel_local_mem_size_memset) == -1) return -1; + device_param->device_name = device_name; - if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function_memset, &device_param->kernel_dynamic_local_mem_size_memset) == -1) return 
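CL_PLATFORM_VERSION strings are specified to begin with "OpenCL <major>.<minor>", which is why one sscanf is enough to pick the use_opencl* flag. Note that the code above stores the major number in a variable named opencl_version_min and the minor in opencl_version_maj; it is consistent with its own comparisons, just confusingly named. A minimal sketch with conventional names:

    #include <stdio.h>

    int main (void)
    {
      const char *opencl_platform_version = "OpenCL 2.1 CUDA 11.2.109"; // example string

      int major = 0;
      int minor = 0;

      if (sscanf (opencl_platform_version, "OpenCL %d.%d", &major, &minor) == 2)
      {
        printf ("parsed OpenCL %d.%d\n", major, minor);
      }

      return 0;
    }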
-1; + hc_string_trim_leading (device_param->device_name); - device_param->kernel_preferred_wgs_multiple_memset = device_param->cuda_warp_size; + hc_string_trim_trailing (device_param->device_name); - //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_memset, 0, sizeof (cl_mem), device_param->kernel_params_memset[0]); if (CL_rc == -1) return -1; - //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_memset, 1, sizeof (cl_uint), device_param->kernel_params_memset[1]); if (CL_rc == -1) return -1; - //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_memset, 2, sizeof (cl_ulong), device_param->kernel_params_memset[2]); if (CL_rc == -1) return -1; + // device_vendor - // GPU autotune init + if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_VENDOR, 0, NULL, ¶m_value_size) == -1) return -1; - if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function_atinit, device_param->cuda_module_shared, "gpu_atinit") == -1) return -1; + char *opencl_device_vendor = (char *) hcmalloc (param_value_size); - if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function_atinit, &device_param->kernel_wgs_atinit) == -1) return -1; + if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_VENDOR, param_value_size, opencl_device_vendor, NULL) == -1) return -1; - if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function_atinit, &device_param->kernel_local_mem_size_atinit) == -1) return -1; + device_param->opencl_device_vendor = opencl_device_vendor; - if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function_atinit, &device_param->kernel_dynamic_local_mem_size_atinit) == -1) return -1; + cl_uint opencl_device_vendor_id = 0; - device_param->kernel_preferred_wgs_multiple_atinit = device_param->cuda_warp_size; + if (strcmp (opencl_device_vendor, CL_VENDOR_AMD1) == 0) + { + opencl_device_vendor_id = VENDOR_ID_AMD; + } + else if (strcmp (opencl_device_vendor, CL_VENDOR_AMD2) == 0) + { + opencl_device_vendor_id = VENDOR_ID_AMD; + } + else if (strcmp (opencl_device_vendor, CL_VENDOR_AMD_USE_INTEL) == 0) + { + opencl_device_vendor_id = VENDOR_ID_AMD_USE_INTEL; + } + else if (strcmp (opencl_device_vendor, CL_VENDOR_APPLE) == 0) + { + opencl_device_vendor_id = VENDOR_ID_APPLE; + } + else if (strcmp (opencl_device_vendor, CL_VENDOR_APPLE_USE_AMD) == 0) + { + opencl_device_vendor_id = VENDOR_ID_AMD; + } + else if (strcmp (opencl_device_vendor, CL_VENDOR_APPLE_USE_NV) == 0) + { + opencl_device_vendor_id = VENDOR_ID_NV; + } + else if (strcmp (opencl_device_vendor, CL_VENDOR_APPLE_USE_INTEL) == 0) + { + opencl_device_vendor_id = VENDOR_ID_INTEL_SDK; + } + else if (strcmp (opencl_device_vendor, CL_VENDOR_INTEL_BEIGNET) == 0) + { + opencl_device_vendor_id = VENDOR_ID_INTEL_BEIGNET; + } + else if (strcmp (opencl_device_vendor, CL_VENDOR_INTEL_SDK) == 0) + { + opencl_device_vendor_id = VENDOR_ID_INTEL_SDK; + } + else if (strcmp (opencl_device_vendor, CL_VENDOR_MESA) == 0) + { + opencl_device_vendor_id = VENDOR_ID_MESA; + } + else if (strcmp (opencl_device_vendor, CL_VENDOR_NV) == 0) + { + opencl_device_vendor_id = VENDOR_ID_NV; + } + else if (strcmp (opencl_device_vendor, CL_VENDOR_POCL) == 0) + { + opencl_device_vendor_id = VENDOR_ID_POCL; + } + else + { + opencl_device_vendor_id = VENDOR_ID_GENERIC; + } - // CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_atinit, 0, sizeof (cl_mem), device_param->kernel_params_atinit[0]); if (CL_rc == -1) return -1; - // 
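The strcmp ladder above maps the CL_DEVICE_VENDOR string onto an internal vendor id. The same mapping, table-driven; the strings and numeric ids here are illustrative stand-ins for hashcat's CL_VENDOR_* / VENDOR_ID_* constants:

    #include <stdio.h>
    #include <string.h>

    typedef struct
    {
      const char *name; // CL_DEVICE_VENDOR string
      unsigned    id;   // internal vendor id
    } vendor_map_t;

    static unsigned vendor_id_from_name (const char *name)
    {
      static const vendor_map_t map[] =
      {
        { "Advanced Micro Devices, Inc.", 1 },
        { "NVIDIA Corporation",           2 },
        { "Intel(R) Corporation",         3 },
      };

      for (size_t i = 0; i < sizeof (map) / sizeof (map[0]); i++)
      {
        if (strcmp (name, map[i].name) == 0) return map[i].id;
      }

      return 0; // generic fallback
    }

    int main (void)
    {
      printf ("%u\n", vendor_id_from_name ("NVIDIA Corporation"));

      return 0;
    }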
CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_atinit, 1, sizeof (cl_ulong), device_param->kernel_params_atinit[1]); if (CL_rc == -1) return -1; + device_param->opencl_device_vendor_id = opencl_device_vendor_id; - // GPU decompress + // device_version - if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function_decompress, device_param->cuda_module_shared, "gpu_decompress") == -1) return -1; + if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_VERSION, 0, NULL, ¶m_value_size) == -1) return -1; - if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function_decompress, &device_param->kernel_wgs_decompress) == -1) return -1; + char *opencl_device_version = (char *) hcmalloc (param_value_size); - if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function_decompress, &device_param->kernel_local_mem_size_decompress) == -1) return -1; + if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_VERSION, param_value_size, opencl_device_version, NULL) == -1) return -1; - if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function_decompress, &device_param->kernel_dynamic_local_mem_size_decompress) == -1) return -1; + device_param->opencl_device_version = opencl_device_version; - device_param->kernel_preferred_wgs_multiple_decompress = device_param->cuda_warp_size; - } + // opencl_device_c_version - if (device_param->is_opencl == true) - { - // GPU memset + if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_OPENCL_C_VERSION, 0, NULL, ¶m_value_size) == -1) return -1; - if (hc_clCreateKernel (hashcat_ctx, device_param->opencl_program_shared, "gpu_memset", &device_param->opencl_kernel_memset) == -1) return -1; + char *opencl_device_c_version = (char *) hcmalloc (param_value_size); - if (get_opencl_kernel_wgs (hashcat_ctx, device_param, device_param->opencl_kernel_memset, &device_param->kernel_wgs_memset) == -1) return -1; + if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_OPENCL_C_VERSION, param_value_size, opencl_device_c_version, NULL) == -1) return -1; - if (get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_memset, &device_param->kernel_local_mem_size_memset) == -1) return -1; + device_param->opencl_device_c_version = opencl_device_c_version; - if (get_opencl_kernel_dynamic_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_memset, &device_param->kernel_dynamic_local_mem_size_memset) == -1) return -1; + // max_compute_units - if (get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel_memset, &device_param->kernel_preferred_wgs_multiple_memset) == -1) return -1; + cl_uint device_processors = 0; - // GPU autotune init + if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof (device_processors), &device_processors, NULL) == -1) return -1; - if (hc_clCreateKernel (hashcat_ctx, device_param->opencl_program_shared, "gpu_atinit", &device_param->opencl_kernel_atinit) == -1) return -1; + device_param->device_processors = device_processors; - if (get_opencl_kernel_wgs (hashcat_ctx, device_param, device_param->opencl_kernel_atinit, &device_param->kernel_wgs_atinit) == -1) return -1; + // device_global_mem - if (get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_atinit, &device_param->kernel_local_mem_size_atinit) == -1) return -1; + cl_ulong 
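Every string property here follows the same OpenCL three-step: query the size, allocate, query the value. Wrapped into a helper it reads like this; the patch keeps the steps inline and uses the hc_* wrappers with hashcat's error events, so this is only an illustration:

    #include <CL/cl.h>
    #include <stdlib.h>

    // caller frees the returned buffer; NULL on any failure
    static char *get_device_info_str (cl_device_id dev, cl_device_info param)
    {
      size_t size = 0;

      if (clGetDeviceInfo (dev, param, 0, NULL, &size) != CL_SUCCESS) return NULL;

      char *buf = (char *) malloc (size);

      if (buf == NULL) return NULL;

      if (clGetDeviceInfo (dev, param, size, buf, NULL) != CL_SUCCESS)
      {
        free (buf);

        return NULL;
      }

      return buf;
    }

Called, for example, as get_device_info_str (dev, CL_DEVICE_NAME) or with CL_DEVICE_VENDOR, since OpenCL reports the exact byte count (including the terminating NUL) in the first call.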
device_global_mem = 0; - if (get_opencl_kernel_dynamic_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_atinit, &device_param->kernel_dynamic_local_mem_size_atinit) == -1) return -1; + if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof (device_global_mem), &device_global_mem, NULL) == -1) return -1; - if (get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel_atinit, &device_param->kernel_preferred_wgs_multiple_atinit) == -1) return -1; + device_param->device_global_mem = device_global_mem; - // GPU decompress + device_param->device_available_mem = 0; - if (hc_clCreateKernel (hashcat_ctx, device_param->opencl_program_shared, "gpu_decompress", &device_param->opencl_kernel_decompress) == -1) return -1; + // device_maxmem_alloc - if (get_opencl_kernel_wgs (hashcat_ctx, device_param, device_param->opencl_kernel_decompress, &device_param->kernel_wgs_decompress) == -1) return -1; + cl_ulong device_maxmem_alloc = 0; - if (get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_decompress, &device_param->kernel_local_mem_size_decompress) == -1) return -1; + if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof (device_maxmem_alloc), &device_maxmem_alloc, NULL) == -1) return -1; - if (get_opencl_kernel_dynamic_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_decompress, &device_param->kernel_dynamic_local_mem_size_decompress) == -1) return -1; + device_param->device_maxmem_alloc = device_maxmem_alloc; - if (get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel_decompress, &device_param->kernel_preferred_wgs_multiple_decompress) == -1) return -1; - } - } + // note we'll limit to 2gb, otherwise this causes all kinds of weird errors because of possible integer overflows in opencl runtimes + // testwise disabling that + //device_param->device_maxmem_alloc = MIN (device_maxmem_alloc, 0x7fffffff); - /** - * main kernel - */ + // max_work_group_size - { - char *build_options_module_buf = (char *) hcmalloc (build_options_sz); + size_t device_maxworkgroup_size = 0; - int build_options_module_len = 0; + if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof (device_maxworkgroup_size), &device_maxworkgroup_size, NULL) == -1) return -1; - build_options_module_len += snprintf (build_options_module_buf + build_options_module_len, build_options_sz - build_options_module_len, "%s ", build_options_buf); + device_param->device_maxworkgroup_size = device_maxworkgroup_size; - if (module_ctx->module_jit_build_options != MODULE_DEFAULT) - { - char *jit_build_options = module_ctx->module_jit_build_options (hashconfig, user_options, user_options_extra, hashes, device_param); + // max_clock_frequency - if (jit_build_options != NULL) - { - build_options_module_len += snprintf (build_options_module_buf + build_options_module_len, build_options_sz - build_options_module_len, "%s", jit_build_options); + cl_uint device_maxclock_frequency = 0; - // this is a bit ugly - // would be better to have the module return the value as value + if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof (device_maxclock_frequency), &device_maxclock_frequency, NULL) == -1) return -1; - u32 fixed_local_size = 0; + device_param->device_maxclock_frequency = device_maxclock_frequency; - if (sscanf 
(jit_build_options, "-D FIXED_LOCAL_SIZE=%u", &fixed_local_size) == 1) - { - device_param->kernel_threads_min = fixed_local_size; - device_param->kernel_threads_max = fixed_local_size; - } - } - } + // device_endian_little - build_options_module_buf[build_options_module_len] = 0; + cl_bool device_endian_little = CL_FALSE; - #if defined (DEBUG) - if (user_options->quiet == false) event_log_warning (hashcat_ctx, "* Device #%u: build_options_module '%s'", device_id + 1, build_options_module_buf); - #endif + if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_ENDIAN_LITTLE, sizeof (device_endian_little), &device_endian_little, NULL) == -1) return -1; - /** - * kernel source filename - */ + if (device_endian_little == CL_FALSE) + { + event_log_error (hashcat_ctx, "* Device #%u: This device is not little-endian.", device_id + 1); - char source_file[256] = { 0 }; + device_param->skipped = true; + } - generate_source_kernel_filename (user_options->slow_candidates, hashconfig->attack_exec, user_options_extra->attack_kern, kern_type, hashconfig->opti_type, folder_config->shared_dir, source_file); + // device_available - if (hc_path_read (source_file) == false) - { - event_log_error (hashcat_ctx, "%s: %s", source_file, strerror (errno)); - - return -1; - } - - /** - * kernel cached filename - */ + cl_bool device_available = CL_FALSE; - char cached_file[256] = { 0 }; + if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_AVAILABLE, sizeof (device_available), &device_available, NULL) == -1) return -1; - generate_cached_kernel_filename (user_options->slow_candidates, hashconfig->attack_exec, user_options_extra->attack_kern, kern_type, hashconfig->opti_type, folder_config->profile_dir, device_name_chksum, cached_file); + if (device_available == CL_FALSE) + { + event_log_error (hashcat_ctx, "* Device #%u: This device is not available.", device_id + 1); - /** - * load kernel - */ + device_param->skipped = true; + } - const bool rc_load_kernel = load_kernel (hashcat_ctx, device_param, "main_kernel", source_file, cached_file, build_options_module_buf, cache_disable, &device_param->opencl_program, &device_param->cuda_module); + // device_compiler_available - if (rc_load_kernel == false) - { - event_log_error (hashcat_ctx, "* Device #%u: Kernel %s build failed.", device_param->device_id + 1, source_file); + cl_bool device_compiler_available = CL_FALSE; - return -1; - } + if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_COMPILER_AVAILABLE, sizeof (device_compiler_available), &device_compiler_available, NULL) == -1) return -1; - hcfree (build_options_module_buf); - } + if (device_compiler_available == CL_FALSE) + { + event_log_error (hashcat_ctx, "* Device #%u: No compiler is available for this device.", device_id + 1); - /** - * word generator kernel - */ + device_param->skipped = true; + } - if (user_options->slow_candidates == true) - { - } - else - { - if (user_options->attack_mode != ATTACK_MODE_STRAIGHT) - { - /** - * kernel mp source filename - */ + // device_execution_capabilities - char source_file[256] = { 0 }; + cl_device_exec_capabilities device_execution_capabilities; - generate_source_kernel_mp_filename (hashconfig->opti_type, hashconfig->opts_type, folder_config->shared_dir, source_file); + if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_EXECUTION_CAPABILITIES, sizeof (device_execution_capabilities), &device_execution_capabilities, NULL) == -1) return -1; - if (hc_path_read (source_file) == false) 
+ if ((device_execution_capabilities & CL_EXEC_KERNEL) == 0) { - event_log_error (hashcat_ctx, "%s: %s", source_file, strerror (errno)); + event_log_error (hashcat_ctx, "* Device #%u: This device does not support executing kernels.", device_id + 1); - return -1; + device_param->skipped = true; } - /** - * kernel mp cached filename - */ + // device_extensions - char cached_file[256] = { 0 }; + size_t device_extensions_size; - generate_cached_kernel_mp_filename (hashconfig->opti_type, hashconfig->opts_type, folder_config->profile_dir, device_name_chksum_amp_mp, cached_file); + if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_EXTENSIONS, 0, NULL, &device_extensions_size) == -1) return -1; + + char *device_extensions = (char *) hcmalloc (device_extensions_size + 1); - const bool rc_load_kernel = load_kernel (hashcat_ctx, device_param, "mp_kernel", source_file, cached_file, build_options_buf, cache_disable, &device_param->opencl_program_mp, &device_param->cuda_module_mp); + if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_EXTENSIONS, device_extensions_size, device_extensions, NULL) == -1) return -1; - if (rc_load_kernel == false) + if (strstr (device_extensions, "base_atomics") == 0) { - event_log_error (hashcat_ctx, "* Device #%u: Kernel %s build failed.", device_param->device_id + 1, source_file); + event_log_error (hashcat_ctx, "* Device #%u: This device does not support base atomics.", device_id + 1); - return -1; + device_param->skipped = true; } - } - } - - /** - * amplifier kernel - */ - if (user_options->slow_candidates == true) - { - } - else - { - if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL) - { + if (strstr (device_extensions, "byte_addressable_store") == 0) + { + event_log_error (hashcat_ctx, "* Device #%u: This device does not support byte-addressable store.", device_id + 1); - } - else - { - /** - * kernel amp source filename - */ + device_param->skipped = true; + } - char source_file[256] = { 0 }; + hcfree (device_extensions); - generate_source_kernel_amp_filename (user_options_extra->attack_kern, folder_config->shared_dir, source_file); + // device_local_mem_type - if (hc_path_read (source_file) == false) - { - event_log_error (hashcat_ctx, "%s: %s", source_file, strerror (errno)); + cl_device_local_mem_type device_local_mem_type; - return -1; - } + if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_LOCAL_MEM_TYPE, sizeof (device_local_mem_type), &device_local_mem_type, NULL) == -1) return -1; - /** - * kernel amp cached filename - */ + device_param->device_local_mem_type = device_local_mem_type; - char cached_file[256] = { 0 }; + // device_max_constant_buffer_size - generate_cached_kernel_amp_filename (user_options_extra->attack_kern, folder_config->profile_dir, device_name_chksum_amp_mp, cached_file); + cl_ulong device_max_constant_buffer_size; - const bool rc_load_kernel = load_kernel (hashcat_ctx, device_param, "amp_kernel", source_file, cached_file, build_options_buf, cache_disable, &device_param->opencl_program_amp, &device_param->cuda_module_amp); + if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof (device_max_constant_buffer_size), &device_max_constant_buffer_size, NULL) == -1) return -1; - if (rc_load_kernel == false) + if (device_local_mem_type == CL_LOCAL) { - event_log_error (hashcat_ctx, "* Device #%u: Kernel %s build failed.", device_param->device_id + 1, source_file); + if (device_max_constant_buffer_size < 65536) + 
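CL_DEVICE_EXTENSIONS arrives as a single space-separated string, so the two gates above are plain substring tests. A self-contained version of the same check:

    #include <stdio.h>
    #include <string.h>

    int main (void)
    {
      const char *exts = "cl_khr_global_int32_base_atomics cl_khr_byte_addressable_store"; // example

      const int has_atomics = (strstr (exts, "base_atomics")           != NULL);
      const int has_bas     = (strstr (exts, "byte_addressable_store") != NULL);

      printf ("base atomics: %d, byte-addressable store: %d\n", has_atomics, has_bas);

      return 0;
    }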
{ + event_log_error (hashcat_ctx, "* Device #%u: This device's constant buffer size is too small.", device_id + 1); - return -1; + device_param->skipped = true; + } } - hcfree (build_options_buf); - } - } + // device_local_mem_size - hcfree (device_name_chksum); - hcfree (device_name_chksum_amp_mp); + cl_ulong device_local_mem_size = 0; - // some algorithm collide too fast, make that impossible + if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof (device_local_mem_size), &device_local_mem_size, NULL) == -1) return -1; - if (user_options->benchmark == true) - { - ((u32 *) hashes->digests_buf)[0] = -1U; - ((u32 *) hashes->digests_buf)[1] = -1U; - ((u32 *) hashes->digests_buf)[2] = -1U; - ((u32 *) hashes->digests_buf)[3] = -1U; - } + if (device_local_mem_type == CL_LOCAL) + { + if (device_local_mem_size < 32768) + { + event_log_error (hashcat_ctx, "* Device #%u: This device's local mem size is too small.", device_id + 1); - /** - * global buffers - */ + device_param->skipped = true; + } + } - const u64 size_total_fixed - = bitmap_ctx->bitmap_size - + bitmap_ctx->bitmap_size - + bitmap_ctx->bitmap_size - + bitmap_ctx->bitmap_size - + bitmap_ctx->bitmap_size - + bitmap_ctx->bitmap_size - + bitmap_ctx->bitmap_size - + bitmap_ctx->bitmap_size - + size_plains - + size_digests - + size_shown - + size_salts - + size_results - + size_extra_buffer - + size_st_digests - + size_st_salts - + size_st_esalts - + size_esalts - + size_markov_css - + size_root_css - + size_rules - + size_rules_c - + size_tm; + device_param->device_local_mem_size = device_local_mem_size; - if (size_total_fixed > device_param->device_available_mem) - { - event_log_error (hashcat_ctx, "* Device #%u: Not enough allocatable device memory for this hashlist and/or ruleset.", device_id + 1); + // older POCL version and older LLVM versions are known to fail compiling kernels + // we need to inform the user to update + // https://github.com/hashcat/hashcat/issues/2344 - return -1; - } + if (opencl_platform_vendor_id == VENDOR_ID_POCL) + { + char *pocl_version_ptr = strstr (opencl_platform_version, "pocl "); + char *llvm_version_ptr = strstr (opencl_platform_version, "LLVM "); - if (device_param->is_cuda == true) - { - if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_bitmap_s1_a, bitmap_ctx->bitmap_size) == -1) return -1; - if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_bitmap_s1_b, bitmap_ctx->bitmap_size) == -1) return -1; - if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_bitmap_s1_c, bitmap_ctx->bitmap_size) == -1) return -1; - if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_bitmap_s1_d, bitmap_ctx->bitmap_size) == -1) return -1; - if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_bitmap_s2_a, bitmap_ctx->bitmap_size) == -1) return -1; - if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_bitmap_s2_b, bitmap_ctx->bitmap_size) == -1) return -1; - if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_bitmap_s2_c, bitmap_ctx->bitmap_size) == -1) return -1; - if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_bitmap_s2_d, bitmap_ctx->bitmap_size) == -1) return -1; - if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_plain_bufs, size_plains) == -1) return -1; - if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_digests_buf, size_digests) == -1) return -1; - if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_digests_shown, size_shown) == -1) return -1; - if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_salt_bufs, size_salts) == -1) return -1; - 
if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_result, size_results) == -1) return -1; - if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_extra0_buf, size_extra_buffer / 4) == -1) return -1; - if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_extra1_buf, size_extra_buffer / 4) == -1) return -1; - if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_extra2_buf, size_extra_buffer / 4) == -1) return -1; - if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_extra3_buf, size_extra_buffer / 4) == -1) return -1; - if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_st_digests_buf, size_st_digests) == -1) return -1; - if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_st_salts_buf, size_st_salts) == -1) return -1; + if ((pocl_version_ptr != NULL) && (llvm_version_ptr != NULL)) + { + bool pocl_skip = false; - if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_bitmap_s1_a, bitmap_ctx->bitmap_s1_a, bitmap_ctx->bitmap_size) == -1) return -1; - if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_bitmap_s1_b, bitmap_ctx->bitmap_s1_b, bitmap_ctx->bitmap_size) == -1) return -1; - if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_bitmap_s1_c, bitmap_ctx->bitmap_s1_c, bitmap_ctx->bitmap_size) == -1) return -1; - if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_bitmap_s1_d, bitmap_ctx->bitmap_s1_d, bitmap_ctx->bitmap_size) == -1) return -1; - if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_bitmap_s2_a, bitmap_ctx->bitmap_s2_a, bitmap_ctx->bitmap_size) == -1) return -1; - if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_bitmap_s2_b, bitmap_ctx->bitmap_s2_b, bitmap_ctx->bitmap_size) == -1) return -1; - if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_bitmap_s2_c, bitmap_ctx->bitmap_s2_c, bitmap_ctx->bitmap_size) == -1) return -1; - if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_bitmap_s2_d, bitmap_ctx->bitmap_s2_d, bitmap_ctx->bitmap_size) == -1) return -1; - if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_digests_buf, hashes->digests_buf, size_digests) == -1) return -1; - if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_salt_bufs, hashes->salts_buf, size_salts) == -1) return -1; + int pocl_maj = 0; + int pocl_min = 0; - /** - * special buffers - */ + const int res1 = sscanf (pocl_version_ptr, "pocl %d.%d", &pocl_maj, &pocl_min); - if (user_options->slow_candidates == true) - { - if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_rules_c, size_rules_c) == -1) return -1; - } - else - { - if (user_options_extra->attack_kern == ATTACK_KERN_STRAIGHT) - { - if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_rules, size_rules) == -1) return -1; + if (res1 == 2) + { + const int pocl_version = (pocl_maj * 100) + pocl_min; - if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL) - { - size_t dummy = 0; + if (pocl_version < 105) + { + pocl_skip = true; + } + } - if (hc_cuModuleGetGlobal (hashcat_ctx, &device_param->cuda_d_rules_c, &dummy, device_param->cuda_module, "generic_constant") == -1) return -1; - } - else + int llvm_maj = 0; + int llvm_min = 0; + + const int res2 = sscanf (llvm_version_ptr, "LLVM %d.%d", &llvm_maj, &llvm_min); + + if (res2 == 2) + { + const int llvm_version = (llvm_maj * 100) + llvm_min; + + if (llvm_version < 900) + { + pocl_skip = true; + } + } + + if (pocl_skip == true) + { + if (user_options->force == false) + { + event_log_error (hashcat_ctx, "* Device #%u: Outdated POCL OpenCL driver detected!", device_id + 1); + + if (user_options->quiet == false) event_log_warning (hashcat_ctx, "This OpenCL driver 
has been marked as likely to fail kernel compilation or to produce false negatives.");
+          if (user_options->quiet == false) event_log_warning (hashcat_ctx, "You can use --force to override this, but do not report related errors.");
+          if (user_options->quiet == false) event_log_warning (hashcat_ctx, NULL);
+
+          device_param->skipped = true;
+        }
+      }
+    }
+  }
+
+  char *opencl_device_version_lower = hcstrdup (opencl_device_version);
+
+  lowercase ((u8 *) opencl_device_version_lower, strlen (opencl_device_version_lower));
+
+  if ((strstr (opencl_device_version_lower, "neo "))
+   || (strstr (opencl_device_version_lower, " neo"))
+   || (strstr (opencl_device_version_lower, "beignet "))
+   || (strstr (opencl_device_version_lower, " beignet"))
+   || (strstr (opencl_device_version_lower, "mesa "))
+   || (strstr (opencl_device_version_lower, " mesa")))
+  {
+    // NEO: https://github.com/hashcat/hashcat/issues/2342
+    // BEIGNET: https://github.com/hashcat/hashcat/issues/2243
+    // MESA: https://github.com/hashcat/hashcat/issues/2269
+
+    if (user_options->force == false)
+    {
+      event_log_error (hashcat_ctx, "* Device #%u: Unstable OpenCL driver detected!", device_id + 1);
+
+      if (user_options->quiet == false) event_log_warning (hashcat_ctx, "This OpenCL driver has been marked as likely to fail kernel compilation or to produce false negatives.");
+      if (user_options->quiet == false) event_log_warning (hashcat_ctx, "You can use --force to override this, but do not report related errors.");
+      if (user_options->quiet == false) event_log_warning (hashcat_ctx, NULL);
+
+      device_param->skipped = true;
+    }
+  }
+
+  hcfree (opencl_device_version_lower);
+
+  // Sometimes we get reports from users about hashcat not working, with error messages like:
+  // CL_INVALID_COMMAND_QUEUE and CL_OUT_OF_RESOURCES
+  // It turns out that this is caused by the Intel OpenCL runtime's handling of its GPU devices
+  // Disable such devices unless the user forces their use
+  // This was successfully worked around with the new threading model and new memory management
+  // Tested on Windows 10
+  // OpenCL.Version.: OpenCL C 2.1
+  // Driver.Version.: 23.20.16.4973
+
+  /*
+  #if !defined (__APPLE__)
+  if (opencl_device_type & CL_DEVICE_TYPE_GPU)
+  {
+    if ((device_param->opencl_device_vendor_id == VENDOR_ID_INTEL_SDK) || (device_param->opencl_device_vendor_id == VENDOR_ID_INTEL_BEIGNET))
+    {
+      if (user_options->force == false)
+      {
+        if (user_options->quiet == false) event_log_warning (hashcat_ctx, "* Device #%u: Intel's OpenCL runtime (GPU only) is currently broken.", device_id + 1);
+        if (user_options->quiet == false) event_log_warning (hashcat_ctx, " We are waiting for updated OpenCL drivers from Intel.");
+        if (user_options->quiet == false) event_log_warning (hashcat_ctx, " You can use --force to override, but do not report related errors.");
+
+        device_param->skipped = true;
+      }
+    }
+  }
+  #endif // __APPLE__
+  */
+
+  // skipped
+
+  if ((backend_ctx->backend_devices_filter & (1ULL << device_id)) == 0)
+  {
+    device_param->skipped = true;
+  }
+
+  if ((backend_ctx->opencl_device_types_filter & (opencl_device_type)) == 0)
+  {
+    device_param->skipped = true;
+  }
+
+  // driver_version
+
+  if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DRIVER_VERSION, 0, NULL, &param_value_size) == -1) return -1;
+
+  char *opencl_driver_version = (char *) hcmalloc (param_value_size);
+
+  if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DRIVER_VERSION, param_value_size, opencl_driver_version, NULL) == -1) return -1;
+
+  
device_param->opencl_driver_version = opencl_driver_version; + + // vendor specific + + if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) + { + if ((device_param->opencl_platform_vendor_id == VENDOR_ID_AMD) && (device_param->opencl_device_vendor_id == VENDOR_ID_AMD)) + { + need_adl = true; + + #if defined (__linux__) + need_sysfs = true; + #endif + } + + if ((device_param->opencl_platform_vendor_id == VENDOR_ID_NV) && (device_param->opencl_device_vendor_id == VENDOR_ID_NV)) + { + need_nvml = true; + + #if defined (_WIN) || defined (__CYGWIN__) + need_nvapi = true; + #endif + } + } + + if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) + { + if ((device_param->opencl_platform_vendor_id == VENDOR_ID_AMD) && (device_param->opencl_device_vendor_id == VENDOR_ID_AMD)) + { + cl_device_topology_amd amdtopo; + + if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_TOPOLOGY_AMD, sizeof (amdtopo), &amdtopo, NULL) == -1) return -1; + + device_param->pcie_domain = 0; // no attribute to query + device_param->pcie_bus = amdtopo.pcie.bus; + device_param->pcie_device = amdtopo.pcie.device; + device_param->pcie_function = amdtopo.pcie.function; + } + + if ((device_param->opencl_platform_vendor_id == VENDOR_ID_NV) && (device_param->opencl_device_vendor_id == VENDOR_ID_NV)) + { + cl_uint pci_bus_id_nv; // is cl_uint the right type for them?? + cl_uint pci_slot_id_nv; + + if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_PCI_BUS_ID_NV, sizeof (pci_bus_id_nv), &pci_bus_id_nv, NULL) == -1) return -1; + + if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_PCI_SLOT_ID_NV, sizeof (pci_slot_id_nv), &pci_slot_id_nv, NULL) == -1) return -1; + + device_param->pcie_domain = 0; // no attribute to query + device_param->pcie_bus = (u8) (pci_bus_id_nv); + device_param->pcie_device = (u8) (pci_slot_id_nv >> 3); + device_param->pcie_function = (u8) (pci_slot_id_nv & 7); + + int sm_minor = 0; + int sm_major = 0; + + if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, sizeof (sm_minor), &sm_minor, NULL) == -1) return -1; + + if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, sizeof (sm_major), &sm_major, NULL) == -1) return -1; + + device_param->sm_minor = sm_minor; + device_param->sm_major = sm_major; + + cl_uint kernel_exec_timeout = 0; + + if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV, sizeof (kernel_exec_timeout), &kernel_exec_timeout, NULL) == -1) return -1; + + device_param->kernel_exec_timeout = kernel_exec_timeout; + + // CPU burning loop damper + // Value is given as number between 0-100 + // By default 8% + + device_param->spin_damp = (double) user_options->spin_damp / 100; + + // recommend CUDA + + if ((backend_ctx->cuda == NULL) || (backend_ctx->nvrtc == NULL)) + { + event_log_warning (hashcat_ctx, "* Device #%u: CUDA SDK Toolkit installation NOT detected.", device_id + 1); + event_log_warning (hashcat_ctx, " CUDA SDK Toolkit installation required for proper device support and utilization"); + event_log_warning (hashcat_ctx, " Falling back to OpenCL Runtime"); + + event_log_warning (hashcat_ctx, NULL); + } + } + } + + // common driver check + + if (device_param->skipped == false) + { + if ((user_options->force == false) && (user_options->backend_info == false)) + { + if (opencl_device_type & CL_DEVICE_TYPE_CPU) + { + if (device_param->opencl_platform_vendor_id == 
VENDOR_ID_INTEL_SDK)
+          {
+            bool intel_warn = false;
+
+            // Intel OpenCL runtime 18
+
+            int opencl_driver1 = 0;
+            int opencl_driver2 = 0;
+            int opencl_driver3 = 0;
+            int opencl_driver4 = 0;
+
+            const int res18 = sscanf (device_param->opencl_driver_version, "%d.%d.%d.%d", &opencl_driver1, &opencl_driver2, &opencl_driver3, &opencl_driver4);
+
+            if (res18 == 4)
+            {
+              // so far all versions 18 are ok
+            }
+            else
+            {
+              // Intel OpenCL runtime 16
+
+              float opencl_version = 0;
+              int opencl_build = 0;
+
+              const int res16 = sscanf (device_param->opencl_device_version, "OpenCL %f (Build %d)", &opencl_version, &opencl_build);
+
+              if (res16 == 2)
+              {
+                if (opencl_build < 25) intel_warn = true;
+              }
+            }
+
+            if (intel_warn == true)
+            {
+              event_log_error (hashcat_ctx, "* Device #%u: Outdated or broken Intel OpenCL runtime '%s' detected!", device_id + 1, device_param->opencl_driver_version);
+
+              event_log_warning (hashcat_ctx, "You are STRONGLY encouraged to use the officially supported Intel OpenCL runtime.");
+              event_log_warning (hashcat_ctx, "See hashcat.net for officially supported Intel OpenCL runtime.");
+              event_log_warning (hashcat_ctx, "See also: https://hashcat.net/faq/wrongdriver");
+              event_log_warning (hashcat_ctx, "You can use --force to override this, but do not report related errors.");
+              event_log_warning (hashcat_ctx, NULL);
+
+              return -1;
+            }
+          }
+        }
+        else if (opencl_device_type & CL_DEVICE_TYPE_GPU)
+        {
+          if (device_param->opencl_platform_vendor_id == VENDOR_ID_AMD)
+          {
+            bool amd_warn = true;
+
+            #if defined (__linux__)
+            // AMDGPU-PRO Driver 16.40 and higher
+            if (strtoul (device_param->opencl_driver_version, NULL, 10) >= 2117) amd_warn = false;
+            // AMDGPU-PRO Driver 16.50 is known to be broken
+            if (strtoul (device_param->opencl_driver_version, NULL, 10) == 2236) amd_warn = true;
+            // AMDGPU-PRO Driver 16.60 is known to be broken
+            if (strtoul (device_param->opencl_driver_version, NULL, 10) == 2264) amd_warn = true;
+            // AMDGPU-PRO Driver 17.10 is known to be broken
+            if (strtoul (device_param->opencl_driver_version, NULL, 10) == 2348) amd_warn = true;
+            // AMDGPU-PRO Driver 17.20 (2416) is fine and doesn't need a check; it will match >= 2117
+            #elif defined (_WIN)
+            // AMD Radeon Software 14.9 and higher, should be updated to 15.12
+            if (strtoul (device_param->opencl_driver_version, NULL, 10) >= 1573) amd_warn = false;
+            #else
+            // we have no information about other OSes
+            if (amd_warn == true) amd_warn = false;
+            #endif
+
+            if (amd_warn == true)
+            {
+              event_log_error (hashcat_ctx, "* Device #%u: Outdated or broken AMD driver '%s' detected!", device_id + 1, device_param->opencl_driver_version);
+
+              event_log_warning (hashcat_ctx, "You are STRONGLY encouraged to use the officially supported AMD driver.");
+              event_log_warning (hashcat_ctx, "See hashcat.net for officially supported AMD drivers.");
+              event_log_warning (hashcat_ctx, "See also: https://hashcat.net/faq/wrongdriver");
+              event_log_warning (hashcat_ctx, "You can use --force to override this, but do not report related errors.");
+              event_log_warning (hashcat_ctx, NULL);
+
+              return -1;
+            }
+          }
+
+          if (device_param->opencl_platform_vendor_id == VENDOR_ID_NV)
+          {
+            int nv_warn = true;
+
+            int version_maj = 0;
+            int version_min = 0;
+
+            const int r = sscanf (device_param->opencl_driver_version, "%d.%d", &version_maj, &version_min);
+
+            if (r == 2)
+            {
+              // nvidia 441.x looks ok
+
+              if (version_maj == 440)
+              {
+                if (version_min >= 64)
+                {
+                  nv_warn = false;
+                }
+              }
+              else
+              {
+                // unknown version scheme, probably new driver version
+
+                nv_warn = 
false;
+              }
+            }
+            else
+            {
+              // unknown version scheme, probably new driver version
+
+              nv_warn = false;
+            }
+
+            if (nv_warn == true)
+            {
+              event_log_warning (hashcat_ctx, "* Device #%u: Outdated or broken NVIDIA driver '%s' detected!", device_id + 1, device_param->opencl_driver_version);
+              event_log_warning (hashcat_ctx, NULL);
+
+              event_log_warning (hashcat_ctx, "You are STRONGLY encouraged to use the officially supported NVIDIA driver.");
+              event_log_warning (hashcat_ctx, "See hashcat's homepage for officially supported NVIDIA drivers.");
+              event_log_warning (hashcat_ctx, "See also: https://hashcat.net/faq/wrongdriver");
+              event_log_warning (hashcat_ctx, "You can use --force to override this, but do not report related errors.");
+              event_log_warning (hashcat_ctx, NULL);
+
+              return -1;
+            }
+
+            if (device_param->sm_major < 5)
+            {
+              if (user_options->quiet == false) event_log_warning (hashcat_ctx, "* Device #%u: This hardware has outdated CUDA compute capability (%u.%u).", device_id + 1, device_param->sm_major, device_param->sm_minor);
+              if (user_options->quiet == false) event_log_warning (hashcat_ctx, " For modern OpenCL performance, upgrade to hardware that supports");
+              if (user_options->quiet == false) event_log_warning (hashcat_ctx, " CUDA compute capability version 5.0 (Maxwell) or higher.");
+            }
+
+            if (device_param->kernel_exec_timeout != 0)
+            {
+              if (user_options->quiet == false) event_log_warning (hashcat_ctx, "* Device #%u: WARNING! Kernel exec timeout is not disabled.", device_id + 1);
+              if (user_options->quiet == false) event_log_warning (hashcat_ctx, " This may cause \"CL_OUT_OF_RESOURCES\" or related errors.");
+              if (user_options->quiet == false) event_log_warning (hashcat_ctx, " To disable the timeout, see: https://hashcat.net/q/timeoutpatch");
+            }
+          }
+        }
+      }
+
+      /**
+       * activate device
+       */
+
+      opencl_devices_active++;
+    }
+
+    /**
+     * create context for each device
+     */
+
+    cl_context context;
+
+    /*
+    cl_context_properties properties[3];
+
+    properties[0] = CL_CONTEXT_PLATFORM;
+    properties[1] = (cl_context_properties) device_param->opencl_platform;
+    properties[2] = 0;
+
+    CL_rc = hc_clCreateContext (hashcat_ctx, properties, 1, &device_param->opencl_device, NULL, NULL, &context);
+    */
+
+    if (hc_clCreateContext (hashcat_ctx, NULL, 1, &device_param->opencl_device, NULL, NULL, &context) == -1) return -1;
+
+    /**
+     * create command-queue
+     */
+
+    cl_command_queue command_queue;
+
+    if (hc_clCreateCommandQueue (hashcat_ctx, context, device_param->opencl_device, 0, &command_queue) == -1) return -1;
+
+    if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) && (device_param->opencl_platform_vendor_id == VENDOR_ID_AMD))
+    {
+      #define RUN_INSTRUCTION_CHECKS() \
+      device_param->has_vadd = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADD_U32 %0, vcc, 0, 0;\" : \"=v\"(r1)); }"); \
+      device_param->has_vaddc = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADDC_U32 %0, vcc, 0, 0, vcc;\" : \"=v\"(r1)); }"); \
+      device_param->has_vadd_co = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADD_CO_U32 %0, vcc, 0, 0;\" : \"=v\"(r1)); }"); \
+      device_param->has_vaddc_co = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADDC_CO_U32 %0, vcc, 
0, 0, vcc;\" : \"=v\"(r1)); }"); \ + device_param->has_vsub = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_SUB_U32 %0, vcc, 0, 0;\" : \"=v\"(r1)); }"); \ + device_param->has_vsubb = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_SUBB_U32 %0, vcc, 0, 0, vcc;\" : \"=v\"(r1)); }"); \ + device_param->has_vsub_co = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_SUB_CO_U32 %0, vcc, 0, 0;\" : \"=v\"(r1)); }"); \ + device_param->has_vsubb_co = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_SUBB_CO_U32 %0, vcc, 0, 0, vcc;\" : \"=v\"(r1)); }"); \ + device_param->has_vadd3 = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADD3_U32 %0, 0, 0, 0;\" : \"=v\"(r1)); }"); \ + device_param->has_vbfe = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_BFE_U32 %0, 0, 0, 0;\" : \"=v\"(r1)); }"); \ + device_param->has_vperm = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_PERM_B32 %0, 0, 0, 0;\" : \"=v\"(r1)); }"); \ + + if (backend_devices_idx > 0) + { + hc_device_param_t *device_param_prev = &devices_param[backend_devices_idx - 1]; + + if (is_same_device_type (device_param, device_param_prev) == true) + { + device_param->has_vadd = device_param_prev->has_vadd; + device_param->has_vaddc = device_param_prev->has_vaddc; + device_param->has_vadd_co = device_param_prev->has_vadd_co; + device_param->has_vaddc_co = device_param_prev->has_vaddc_co; + device_param->has_vsub = device_param_prev->has_vsub; + device_param->has_vsubb = device_param_prev->has_vsubb; + device_param->has_vsub_co = device_param_prev->has_vsub_co; + device_param->has_vsubb_co = device_param_prev->has_vsubb_co; + device_param->has_vadd3 = device_param_prev->has_vadd3; + device_param->has_vbfe = device_param_prev->has_vbfe; + device_param->has_vperm = device_param_prev->has_vperm; + } + else + { + RUN_INSTRUCTION_CHECKS(); + } + } + else + { + RUN_INSTRUCTION_CHECKS(); + } + + #undef RUN_INSTRUCTION_CHECKS + } + + if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) && (device_param->opencl_platform_vendor_id == VENDOR_ID_NV)) + { + const int sm = (device_param->sm_major * 10) + device_param->sm_minor; + + device_param->has_add = (sm >= 12) ? true : false; + device_param->has_addc = (sm >= 12) ? true : false; + device_param->has_sub = (sm >= 12) ? true : false; + device_param->has_subc = (sm >= 12) ? true : false; + device_param->has_bfe = (sm >= 20) ? true : false; + device_param->has_lop3 = (sm >= 50) ? true : false; + device_param->has_mov64 = (sm >= 10) ? true : false; + device_param->has_prmt = (sm >= 20) ? 
true : false; + + /* + #define RUN_INSTRUCTION_CHECKS() \ + device_param->has_add = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"add.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }"); \ + device_param->has_addc = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"addc.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }"); \ + device_param->has_sub = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"sub.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }"); \ + device_param->has_subc = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"subc.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }"); \ + device_param->has_bfe = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"bfe.u32 %0, 0, 0, 0;\" : \"=r\"(r)); }"); \ + device_param->has_lop3 = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"lop3.b32 %0, 0, 0, 0, 0;\" : \"=r\"(r)); }"); \ + device_param->has_mov64 = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { ulong r; uint a; uint b; asm volatile (\"mov.b64 %0, {%1, %2};\" : \"=l\"(r) : \"r\"(a), \"r\"(b)); }"); \ + device_param->has_prmt = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"prmt.b32 %0, 0, 0, 0;\" : \"=r\"(r)); }"); \ + + if (backend_devices_idx > 0) + { + hc_device_param_t *device_param_prev = &devices_param[backend_devices_idx - 1]; + + if (is_same_device_type (device_param, device_param_prev) == true) + { + device_param->has_add = device_param_prev->has_add; + device_param->has_addc = device_param_prev->has_addc; + device_param->has_sub = device_param_prev->has_sub; + device_param->has_subc = device_param_prev->has_subc; + device_param->has_bfe = device_param_prev->has_bfe; + device_param->has_lop3 = device_param_prev->has_lop3; + device_param->has_mov64 = device_param_prev->has_mov64; + device_param->has_prmt = device_param_prev->has_prmt; + } + else + { + RUN_INSTRUCTION_CHECKS(); + } + } + else + { + RUN_INSTRUCTION_CHECKS(); + } + + #undef RUN_INSTRUCTION_CHECKS + */ + } + + // device_available_mem + + #define MAX_ALLOC_CHECKS_CNT 8192 + #define MAX_ALLOC_CHECKS_SIZE (64 * 1024 * 1024) + + device_param->device_available_mem = device_param->device_global_mem - MAX_ALLOC_CHECKS_SIZE; + + #if defined (_WIN) + if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) && (device_param->opencl_platform_vendor_id == VENDOR_ID_NV)) + #else + if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) && ((device_param->opencl_platform_vendor_id == VENDOR_ID_NV) || (device_param->opencl_platform_vendor_id == VENDOR_ID_AMD))) + #endif + { + // OK, so the problem here is the following: + // There's just CL_DEVICE_GLOBAL_MEM_SIZE to ask OpenCL about the total memory on the device, + // but there's no way to ask for available memory on the device. + // In combination, most OpenCL runtimes implementation of clCreateBuffer() + // are doing so called lazy memory allocation on the device. 
+      // Now, if the user has X11 (or a game or anything that takes a lot of GPU memory)
+      // running on the host we end up with an error type of this:
+      // clEnqueueNDRangeKernel(): CL_MEM_OBJECT_ALLOCATION_FAILURE
+      // The error shows up in clEnqueueNDRangeKernel() exactly because of the lazy allocation
+      // The best way to work around this problem would be to ask for the available memory directly,
+      // so the idea here is to estimate the available memory by allocating it until it errors
+
+      cl_mem *tmp_device = (cl_mem *) hccalloc (MAX_ALLOC_CHECKS_CNT, sizeof (cl_mem));
+
+      u64 c;
+
+      for (c = 0; c < MAX_ALLOC_CHECKS_CNT; c++)
+      {
+        if (((c + 1 + 1) * MAX_ALLOC_CHECKS_SIZE) >= device_param->device_global_mem) break;
+
+        cl_int CL_err;
+
+        OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl;
+
+        tmp_device[c] = ocl->clCreateBuffer (context, CL_MEM_READ_WRITE, MAX_ALLOC_CHECKS_SIZE, NULL, &CL_err);
+
+        if (CL_err != CL_SUCCESS)
+        {
+          c--;
+
+          break;
+        }
+
+        // transferring only a few bytes should be enough to force the runtime to actually allocate the memory
+
+        u8 tmp_host[8];
+
+        if (ocl->clEnqueueReadBuffer (command_queue, tmp_device[c], CL_TRUE, 0, sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break;
+
+        if (ocl->clEnqueueWriteBuffer (command_queue, tmp_device[c], CL_TRUE, 0, sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break;
+
+        if (ocl->clEnqueueReadBuffer (command_queue, tmp_device[c], CL_TRUE, MAX_ALLOC_CHECKS_SIZE - sizeof (tmp_host), sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break;
+
+        if (ocl->clEnqueueWriteBuffer (command_queue, tmp_device[c], CL_TRUE, MAX_ALLOC_CHECKS_SIZE - sizeof (tmp_host), sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break;
+      }
+
+      device_param->device_available_mem = MAX_ALLOC_CHECKS_SIZE;
+      if (c > 0)
+      {
+        device_param->device_available_mem *= c;
+      }
+
+      // clean up
+
+      for (c = 0; c < MAX_ALLOC_CHECKS_CNT; c++)
+      {
+        if (((c + 1 + 1) * MAX_ALLOC_CHECKS_SIZE) >= device_param->device_global_mem) break;
+
+        if (tmp_device[c] != NULL)
+        {
+          if (hc_clReleaseMemObject (hashcat_ctx, tmp_device[c]) == -1) return -1;
+        }
+      }
+
+      hcfree (tmp_device);
+    }
+
+    hc_clReleaseCommandQueue (hashcat_ctx, command_queue);
+
+    hc_clReleaseContext (hashcat_ctx, context);
+  }
+  }
+  }
+
+  backend_ctx->opencl_devices_cnt = opencl_devices_cnt;
+  backend_ctx->opencl_devices_active = opencl_devices_active;
+
+  // all devices combined go into backend_* variables
+
+  backend_ctx->backend_devices_cnt = cuda_devices_cnt + hip_devices_cnt + opencl_devices_cnt;
+  backend_ctx->backend_devices_active = cuda_devices_active + hip_devices_active + opencl_devices_active;
+
+  // find duplicate devices
+
+  //if ((cuda_devices_cnt > 0) && (opencl_devices_cnt > 0))
+  //{
+    // using force here enables both devices, which is the worst possible outcome
+    // many users force by default, so this is not a good idea
+
+    //if (user_options->force == false)
+    //{
+      backend_ctx_find_alias_devices (hashcat_ctx);
+    //}
+  //}
+
+  if (backend_ctx->backend_devices_active == 0)
+  {
+    event_log_error (hashcat_ctx, "No devices found/left.");
+
+    return -1;
+  }
+
+  // now we can calculate the number of parallel running hook threads based on
+  // the number of CPU cores and the number of active compute devices
+  // unless overwritten by the user
+
+  if (user_options->hook_threads == HOOK_THREADS)
+  {
+    const u32 processor_count = hc_get_processor_count ();
+
+    const u32 processor_count_cu = CEILDIV (processor_count, backend_ctx->backend_devices_active); // should never reach 0
+
+    
user_options->hook_threads = processor_count_cu; + } + + // additional check to see if the user has chosen a device that is not within the range of available devices (i.e. larger than devices_cnt) + + if (backend_ctx->backend_devices_filter != (u64) -1) + { + const u64 backend_devices_cnt_mask = ~(((u64) -1 >> backend_ctx->backend_devices_cnt) << backend_ctx->backend_devices_cnt); + + if (backend_ctx->backend_devices_filter > backend_devices_cnt_mask) + { + event_log_error (hashcat_ctx, "An invalid device was specified using the --backend-devices parameter."); + event_log_error (hashcat_ctx, "The specified device was higher than the number of available devices (%u).", backend_ctx->backend_devices_cnt); + + return -1; + } + } + + backend_ctx->target_msec = TARGET_MSEC_PROFILE[user_options->workload_profile - 1]; + + backend_ctx->need_adl = need_adl; + backend_ctx->need_nvml = need_nvml; + backend_ctx->need_nvapi = need_nvapi; + backend_ctx->need_sysfs = need_sysfs; + + backend_ctx->comptime = comptime; + + return 0; +} + +void backend_ctx_devices_destroy (hashcat_ctx_t *hashcat_ctx) +{ + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; + + if (backend_ctx->enabled == false) return; + + for (u32 opencl_platforms_idx = 0; opencl_platforms_idx < backend_ctx->opencl_platforms_cnt; opencl_platforms_idx++) + { + hcfree (backend_ctx->opencl_platforms_devices[opencl_platforms_idx]); + hcfree (backend_ctx->opencl_platforms_name[opencl_platforms_idx]); + hcfree (backend_ctx->opencl_platforms_vendor[opencl_platforms_idx]); + hcfree (backend_ctx->opencl_platforms_version[opencl_platforms_idx]); + } + + for (int backend_devices_idx = 0; backend_devices_idx < backend_ctx->backend_devices_cnt; backend_devices_idx++) + { + hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx]; + + if (device_param->skipped == true) continue; + + hcfree (device_param->device_name); + + if (device_param->is_opencl == true) + { + hcfree (device_param->opencl_driver_version); + hcfree (device_param->opencl_device_version); + hcfree (device_param->opencl_device_c_version); + hcfree (device_param->opencl_device_vendor); + } + } + + backend_ctx->backend_devices_cnt = 0; + backend_ctx->backend_devices_active = 0; + backend_ctx->cuda_devices_cnt = 0; + backend_ctx->cuda_devices_active = 0; + backend_ctx->hip_devices_cnt = 0; + backend_ctx->hip_devices_active = 0; + backend_ctx->opencl_devices_cnt = 0; + backend_ctx->opencl_devices_active = 0; + + backend_ctx->need_adl = false; + backend_ctx->need_nvml = false; + backend_ctx->need_nvapi = false; + backend_ctx->need_sysfs = false; +} + +void backend_ctx_devices_sync_tuning (hashcat_ctx_t *hashcat_ctx) +{ + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; + + if (backend_ctx->enabled == false) return; + + for (int backend_devices_cnt_src = 0; backend_devices_cnt_src < backend_ctx->backend_devices_cnt; backend_devices_cnt_src++) + { + hc_device_param_t *device_param_src = &backend_ctx->devices_param[backend_devices_cnt_src]; + + if (device_param_src->skipped == true) continue; + + if (device_param_src->skipped_warning == true) continue; + + for (int backend_devices_cnt_dst = backend_devices_cnt_src + 1; backend_devices_cnt_dst < backend_ctx->backend_devices_cnt; backend_devices_cnt_dst++) + { + hc_device_param_t *device_param_dst = &backend_ctx->devices_param[backend_devices_cnt_dst]; + + if (device_param_dst->skipped == true) continue; + + if (device_param_dst->skipped_warning == true) continue; + + if (is_same_device_type (device_param_src, 
device_param_dst) == false) continue; + + device_param_dst->kernel_accel = device_param_src->kernel_accel; + device_param_dst->kernel_loops = device_param_src->kernel_loops; + device_param_dst->kernel_threads = device_param_src->kernel_threads; + + const u32 hardware_power = device_param_dst->device_processors * device_param_dst->kernel_threads; + + device_param_dst->hardware_power = hardware_power; + + const u32 kernel_power = device_param_dst->hardware_power * device_param_dst->kernel_accel; + + device_param_dst->kernel_power = kernel_power; + } + } +} + +void backend_ctx_devices_update_power (hashcat_ctx_t *hashcat_ctx) +{ + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; + status_ctx_t *status_ctx = hashcat_ctx->status_ctx; + user_options_extra_t *user_options_extra = hashcat_ctx->user_options_extra; + user_options_t *user_options = hashcat_ctx->user_options; + + if (backend_ctx->enabled == false) return; + + u32 kernel_power_all = 0; + + for (int backend_devices_idx = 0; backend_devices_idx < backend_ctx->backend_devices_cnt; backend_devices_idx++) + { + hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx]; + + if (device_param->skipped == true) continue; + + if (device_param->skipped_warning == true) continue; + + kernel_power_all += device_param->kernel_power; + } + + backend_ctx->kernel_power_all = kernel_power_all; + + /* + * Inform user about possible slow speeds + */ + + if ((user_options_extra->wordlist_mode == WL_MODE_FILE) || (user_options_extra->wordlist_mode == WL_MODE_MASK)) + { + if (status_ctx->words_base < kernel_power_all) + { + if (user_options->quiet == false) + { + event_log_advice (hashcat_ctx, "The wordlist or mask that you are using is too small."); + event_log_advice (hashcat_ctx, "This means that hashcat cannot use the full parallel power of your device(s)."); + event_log_advice (hashcat_ctx, "Unless you supply more work, your cracking speed will drop."); + event_log_advice (hashcat_ctx, "For tips on supplying more work, see: https://hashcat.net/faq/morework"); + event_log_advice (hashcat_ctx, NULL); + } + } + } +} + +void backend_ctx_devices_kernel_loops (hashcat_ctx_t *hashcat_ctx) +{ + combinator_ctx_t *combinator_ctx = hashcat_ctx->combinator_ctx; + hashconfig_t *hashconfig = hashcat_ctx->hashconfig; + hashes_t *hashes = hashcat_ctx->hashes; + mask_ctx_t *mask_ctx = hashcat_ctx->mask_ctx; + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; + straight_ctx_t *straight_ctx = hashcat_ctx->straight_ctx; + user_options_t *user_options = hashcat_ctx->user_options; + user_options_extra_t *user_options_extra = hashcat_ctx->user_options_extra; + + if (backend_ctx->enabled == false) return; + + for (int backend_devices_idx = 0; backend_devices_idx < backend_ctx->backend_devices_cnt; backend_devices_idx++) + { + hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx]; + + if (device_param->skipped == true) continue; + + if (device_param->skipped_warning == true) continue; + + device_param->kernel_loops_min = device_param->kernel_loops_min_sav; + device_param->kernel_loops_max = device_param->kernel_loops_max_sav; + + if (device_param->kernel_loops_min < device_param->kernel_loops_max) + { + u32 innerloop_cnt = 0; + + if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL) + { + if (user_options->slow_candidates == true) + { + innerloop_cnt = 1; + } + else + { + if (user_options_extra->attack_kern == ATTACK_KERN_STRAIGHT) innerloop_cnt = MIN (KERNEL_RULES, (u32) straight_ctx->kernel_rules_cnt); + 
else if (user_options_extra->attack_kern == ATTACK_KERN_COMBI) innerloop_cnt = MIN (KERNEL_COMBS, (u32) combinator_ctx->combs_cnt); + else if (user_options_extra->attack_kern == ATTACK_KERN_BF) innerloop_cnt = MIN (KERNEL_BFS, (u32) mask_ctx->bfs_cnt); + } + } + else + { + innerloop_cnt = hashes->salts_buf[0].salt_iter; + } + + if ((innerloop_cnt >= device_param->kernel_loops_min) && + (innerloop_cnt <= device_param->kernel_loops_max)) + { + device_param->kernel_loops_max = innerloop_cnt; + } + } + } +} + +static int get_cuda_kernel_wgs (hashcat_ctx_t *hashcat_ctx, CUfunction function, u32 *result) +{ + int max_threads_per_block; + + if (hc_cuFuncGetAttribute (hashcat_ctx, &max_threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function) == -1) return -1; + + *result = (u32) max_threads_per_block; + + return 0; +} + +static int get_hip_kernel_wgs (hashcat_ctx_t *hashcat_ctx, HIPfunction function, u32 *result) +{ + int max_threads_per_block; + + if (hc_hipFuncGetAttribute (hashcat_ctx, &max_threads_per_block, HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function) == -1) return -1; + + *result = (u32) max_threads_per_block; + + return 0; +} + +static int get_cuda_kernel_local_mem_size (hashcat_ctx_t *hashcat_ctx, CUfunction function, u64 *result) +{ + int shared_size_bytes; + + if (hc_cuFuncGetAttribute (hashcat_ctx, &shared_size_bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, function) == -1) return -1; + + *result = (u64) shared_size_bytes; + + return 0; +} + +static int get_hip_kernel_local_mem_size (hashcat_ctx_t *hashcat_ctx, HIPfunction function, u64 *result) +{ + int shared_size_bytes; + + if (hc_hipFuncGetAttribute (hashcat_ctx, &shared_size_bytes, HIP_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, function) == -1) return -1; + + *result = (u64) shared_size_bytes; + + return 0; +} + +static int get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx_t *hashcat_ctx, CUfunction function, u64 *result) +{ + // AFAIK there's no way to query the maximum value for dynamic shared memory available (because it depends on kernel code). + // let's brute force it, therefore workaround the hashcat wrapper of cuFuncSetAttribute() + + #define MAX_ASSUMED_SHARED (1024 * 1024) + + u64 dynamic_shared_size_bytes = 0; + + for (int i = 1; i <= MAX_ASSUMED_SHARED; i++) + { + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; + + CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; + + const CUresult CU_err = cuda->cuFuncSetAttribute (function, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, i); + + if (CU_err == CUDA_SUCCESS) + { + dynamic_shared_size_bytes = i; + + continue; + } + + break; + } + + *result = dynamic_shared_size_bytes; + + if (hc_cuFuncSetAttribute (hashcat_ctx, function, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, 0) == -1) return -1; + + return 0; +} + +static int get_hip_kernel_dynamic_local_mem_size (hashcat_ctx_t *hashcat_ctx, HIPfunction function, u64 *result) +{ + // AFAIK there's no way to query the maximum value for dynamic shared memory available (because it depends on kernel code). 
+ // let's brute force it, therefore workaround the hashcat wrapper of hipFuncSetAttribute() + + #define MAX_ASSUMED_SHARED (1024 * 1024) + + u64 dynamic_shared_size_bytes = 0; + + for (int i = 1; i <= MAX_ASSUMED_SHARED; i++) + { + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; + + HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip; + + const HIPresult HIP_err = hip->hipFuncSetAttribute (function, HIP_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, i); + + if (HIP_err == HIP_SUCCESS) + { + dynamic_shared_size_bytes = i; + + continue; + } + + break; + } + + *result = dynamic_shared_size_bytes; + + if (hc_hipFuncSetAttribute (hashcat_ctx, function, HIP_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, 0) == -1) return -1; + + return 0; +} + +static int get_opencl_kernel_wgs (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, cl_kernel kernel, u32 *result) +{ + size_t work_group_size = 0; + + if (hc_clGetKernelWorkGroupInfo (hashcat_ctx, kernel, device_param->opencl_device, CL_KERNEL_WORK_GROUP_SIZE, sizeof (work_group_size), &work_group_size, NULL) == -1) return -1; + + u32 kernel_threads = (u32) work_group_size; + + size_t compile_work_group_size[3] = { 0, 0, 0 }; + + if (hc_clGetKernelWorkGroupInfo (hashcat_ctx, kernel, device_param->opencl_device, CL_KERNEL_COMPILE_WORK_GROUP_SIZE, sizeof (compile_work_group_size), &compile_work_group_size, NULL) == -1) return -1; + + const size_t cwgs_total = compile_work_group_size[0] * compile_work_group_size[1] * compile_work_group_size[2]; + + if (cwgs_total > 0) + { + kernel_threads = MIN (kernel_threads, (u32) cwgs_total); + } + + *result = kernel_threads; + + return 0; +} + +static int get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, cl_kernel kernel, u32 *result) +{ + size_t preferred_work_group_size_multiple = 0; + + if (hc_clGetKernelWorkGroupInfo (hashcat_ctx, kernel, device_param->opencl_device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof (preferred_work_group_size_multiple), &preferred_work_group_size_multiple, NULL) == -1) return -1; + + *result = (u32) preferred_work_group_size_multiple; + + return 0; +} + +static int get_opencl_kernel_local_mem_size (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, cl_kernel kernel, u64 *result) +{ + cl_ulong local_mem_size = 0; + + if (hc_clGetKernelWorkGroupInfo (hashcat_ctx, kernel, device_param->opencl_device, CL_KERNEL_LOCAL_MEM_SIZE, sizeof (local_mem_size), &local_mem_size, NULL) == -1) return -1; + + *result = local_mem_size; + + return 0; +} + +static int get_opencl_kernel_dynamic_local_mem_size (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, cl_kernel kernel, u64 *result) +{ + cl_ulong dynamic_local_mem_size = 0; + + if (hc_clGetKernelWorkGroupInfo (hashcat_ctx, kernel, device_param->opencl_device, CL_KERNEL_LOCAL_MEM_SIZE, sizeof (dynamic_local_mem_size), &dynamic_local_mem_size, NULL) == -1) return -1; + + // unknown how to query this information in OpenCL + // we therefore reset to zero + // the above call to hc_clGetKernelWorkGroupInfo() is just to avoid compiler warnings + + dynamic_local_mem_size = 0; + + *result = dynamic_local_mem_size; + + return 0; +} + +static u32 get_kernel_threads (const hc_device_param_t *device_param) +{ + // this is an upper limit, a good start, since our strategy is to reduce thread counts only. 
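+  // for example (illustrative numbers, not from a real device): a module may pin
+  // kernel_threads_min to 64 while the CPU branch below clamps kernel_threads_max
+  // down to 1; the final MAX () then returns 64, so the module minimum wins.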
+
+  u32 kernel_threads_min = device_param->kernel_threads_min;
+  u32 kernel_threads_max = device_param->kernel_threads_max;
+
+  // the changes we do here are just optimizations, since the module always has priority.
+
+  const u32 device_maxworkgroup_size = (const u32) device_param->device_maxworkgroup_size;
+
+  kernel_threads_max = MIN (kernel_threads_max, device_maxworkgroup_size);
+
+  if (device_param->opencl_device_type & CL_DEVICE_TYPE_CPU)
+  {
+    // for all CPU we just do 1 ...
+
+    const u32 cpu_prefered_thread_count = 1;
+
+    kernel_threads_max = MIN (kernel_threads_max, cpu_prefered_thread_count);
+  }
+  else if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU)
+  {
+    // for GPU we need to distinguish by vendor
+
+    if (device_param->opencl_device_vendor_id == VENDOR_ID_INTEL_SDK)
+    {
+      const u32 gpu_prefered_thread_count = 8;
+
+      kernel_threads_max = MIN (kernel_threads_max, gpu_prefered_thread_count);
+    }
+    else if (device_param->opencl_device_vendor_id == VENDOR_ID_AMD)
+    {
+      const u32 gpu_prefered_thread_count = 64;
+
+      kernel_threads_max = MIN (kernel_threads_max, gpu_prefered_thread_count);
+    }
+  }
+
+  // this is intentional! at this point, kernel_threads_min can be higher than kernel_threads_max.
+  // in this case we actually want kernel_threads_min selected.
+
+  const u32 kernel_threads = MAX (kernel_threads_min, kernel_threads_max);
+
+  return kernel_threads;
+}
+
+static bool load_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const char *kernel_name, char *source_file, char *cached_file, const char *build_options_buf, const bool cache_disable, cl_program *opencl_program, CUmodule *cuda_module, HIPmodule *hip_module)
+{
+  const hashconfig_t *hashconfig = hashcat_ctx->hashconfig;
+
+  bool cached = true;
+
+  if (cache_disable == true)
+  {
+    cached = false;
+  }
+
+  if (hc_path_read (cached_file) == false)
+  {
+    cached = false;
+  }
+
+  if (hc_path_is_empty (cached_file) == true)
+  {
+    cached = false;
+  }
+
+  /**
+   * kernel compile or load
+   */
+
+  size_t kernel_lengths_buf = 0;
+
+  size_t *kernel_lengths = &kernel_lengths_buf;
+
+  char *kernel_sources_buf = NULL;
+
+  char **kernel_sources = &kernel_sources_buf;
+
+  if (cached == false)
+  {
+    #if defined (DEBUG)
+    const user_options_t *user_options = hashcat_ctx->user_options;
+
+    if (user_options->quiet == false) event_log_warning (hashcat_ctx, "* Device #%u: Kernel %s not found in cache! Building may take a while...", device_param->device_id + 1, filename_from_filepath (cached_file));
+    #endif
+
+    if (read_kernel_binary (hashcat_ctx, source_file, kernel_lengths, kernel_sources) == false) return false;
+
+    if (device_param->is_cuda == true)
+    {
+      nvrtcProgram program;
+
+      if (hc_nvrtcCreateProgram (hashcat_ctx, &program, kernel_sources[0], kernel_name, 0, NULL, NULL) == -1) return false;
+
+      char **nvrtc_options = (char **) hccalloc (4 + strlen (build_options_buf) + 1, sizeof (char *)); // ...
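+      // sizing note: 4 fixed options plus, as a safe upper bound, one option pointer
+      // per character of build_options_buf once it is split into tokens further below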
+ + nvrtc_options[0] = "--restrict"; + nvrtc_options[1] = "--device-as-default-execution-space"; + nvrtc_options[2] = "--gpu-architecture"; + + hc_asprintf (&nvrtc_options[3], "compute_%d%d", device_param->sm_major, device_param->sm_minor); + + char *nvrtc_options_string = hcstrdup (build_options_buf); + + const int num_options = 4 + nvrtc_make_options_array_from_string (nvrtc_options_string, nvrtc_options + 4); + + const int rc_nvrtcCompileProgram = hc_nvrtcCompileProgram (hashcat_ctx, program, num_options, (const char * const *) nvrtc_options); + + size_t build_log_size = 0; + + hc_nvrtcGetProgramLogSize (hashcat_ctx, program, &build_log_size); + + #if defined (DEBUG) + if ((build_log_size > 1) || (rc_nvrtcCompileProgram == -1)) + #else + if (rc_nvrtcCompileProgram == -1) + #endif + { + char *build_log = (char *) hcmalloc (build_log_size + 1); + + if (hc_nvrtcGetProgramLog (hashcat_ctx, program, build_log) == -1) return false; + + puts (build_log); + + hcfree (build_log); + } + + if (rc_nvrtcCompileProgram == -1) + { + event_log_error (hashcat_ctx, "* Device #%u: Kernel %s build failed.", device_param->device_id + 1, source_file); + + return false; + } + + hcfree (nvrtc_options); + hcfree (nvrtc_options_string); + + size_t binary_size = 0; + + if (hc_nvrtcGetPTXSize (hashcat_ctx, program, &binary_size) == -1) return false; + + char *binary = (char *) hcmalloc (binary_size); + + if (hc_nvrtcGetPTX (hashcat_ctx, program, binary) == -1) return false; + + if (hc_nvrtcDestroyProgram (hashcat_ctx, &program) == -1) return false; + + #define LOG_SIZE 8192 + + char *mod_info_log = (char *) hcmalloc (LOG_SIZE + 1); + char *mod_error_log = (char *) hcmalloc (LOG_SIZE + 1); + + int mod_cnt = 6; + + CUjit_option mod_opts[7]; + void *mod_vals[7]; + + mod_opts[0] = CU_JIT_TARGET_FROM_CUCONTEXT; + mod_vals[0] = (void *) 0; + + mod_opts[1] = CU_JIT_LOG_VERBOSE; + mod_vals[1] = (void *) 1; + + mod_opts[2] = CU_JIT_INFO_LOG_BUFFER; + mod_vals[2] = (void *) mod_info_log; + + mod_opts[3] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; + mod_vals[3] = (void *) LOG_SIZE; + + mod_opts[4] = CU_JIT_ERROR_LOG_BUFFER; + mod_vals[4] = (void *) mod_error_log; + + mod_opts[5] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES; + mod_vals[5] = (void *) LOG_SIZE; + + if (hashconfig->opti_type & OPTI_TYPE_REGISTER_LIMIT) + { + mod_opts[6] = CU_JIT_MAX_REGISTERS; + mod_vals[6] = (void *) 128; + + mod_cnt++; + } + + #if defined (WITH_CUBIN) + + char *jit_info_log = (char *) hcmalloc (LOG_SIZE + 1); + char *jit_error_log = (char *) hcmalloc (LOG_SIZE + 1); + + int jit_cnt = 6; + + CUjit_option jit_opts[7]; + void *jit_vals[7]; + + jit_opts[0] = CU_JIT_TARGET_FROM_CUCONTEXT; + jit_vals[0] = (void *) 0; + + jit_opts[1] = CU_JIT_LOG_VERBOSE; + jit_vals[1] = (void *) 1; + + jit_opts[2] = CU_JIT_INFO_LOG_BUFFER; + jit_vals[2] = (void *) jit_info_log; + + jit_opts[3] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; + jit_vals[3] = (void *) LOG_SIZE; + + jit_opts[4] = CU_JIT_ERROR_LOG_BUFFER; + jit_vals[4] = (void *) jit_error_log; + + jit_opts[5] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES; + jit_vals[5] = (void *) LOG_SIZE; + + if (hashconfig->opti_type & OPTI_TYPE_REGISTER_LIMIT) + { + jit_opts[6] = CU_JIT_MAX_REGISTERS; + jit_vals[6] = (void *) 128; + + jit_cnt++; + } + + CUlinkState state; + + if (hc_cuLinkCreate (hashcat_ctx, jit_cnt, jit_opts, jit_vals, &state) == -1) + { + event_log_error (hashcat_ctx, "* Device #%u: Kernel %s link failed. 
Error Log:", device_param->device_id + 1, source_file); + event_log_error (hashcat_ctx, "%s", jit_error_log); + event_log_error (hashcat_ctx, NULL); + + return false; + } + + if (hc_cuLinkAddData (hashcat_ctx, state, CU_JIT_INPUT_PTX, binary, binary_size, kernel_name, 0, NULL, NULL) == -1) + { + event_log_error (hashcat_ctx, "* Device #%u: Kernel %s link failed. Error Log:", device_param->device_id + 1, source_file); + event_log_error (hashcat_ctx, "%s", jit_error_log); + event_log_error (hashcat_ctx, NULL); + + return false; + } + + void *cubin = NULL; + + size_t cubin_size = 0; + + if (hc_cuLinkComplete (hashcat_ctx, state, &cubin, &cubin_size) == -1) + { + event_log_error (hashcat_ctx, "* Device #%u: Kernel %s link failed. Error Log:", device_param->device_id + 1, source_file); + event_log_error (hashcat_ctx, "%s", jit_error_log); + event_log_error (hashcat_ctx, NULL); + + return false; + } + + #if defined (DEBUG) + event_log_info (hashcat_ctx, "* Device #%u: Kernel %s link successful. Info Log:", device_param->device_id + 1, source_file); + event_log_info (hashcat_ctx, "%s", jit_info_log); + event_log_info (hashcat_ctx, NULL); + #endif + + if (hc_cuModuleLoadDataEx (hashcat_ctx, cuda_module, cubin, mod_cnt, mod_opts, mod_vals) == -1) + { + event_log_error (hashcat_ctx, "* Device #%u: Kernel %s load failed. Error Log:", device_param->device_id + 1, source_file); + event_log_error (hashcat_ctx, "%s", mod_error_log); + event_log_error (hashcat_ctx, NULL); + + return false; + } + + #if defined (DEBUG) + event_log_info (hashcat_ctx, "* Device #%u: Kernel %s load successful. Info Log:", device_param->device_id + 1, source_file); + event_log_info (hashcat_ctx, "%s", mod_info_log); + event_log_info (hashcat_ctx, NULL); + #endif + + if (cache_disable == false) + { + if (write_kernel_binary (hashcat_ctx, cached_file, cubin, cubin_size) == false) return false; + } + + if (hc_cuLinkDestroy (hashcat_ctx, state) == -1) return false; + + hcfree (jit_info_log); + hcfree (jit_error_log); + + #else + + if (hc_cuModuleLoadDataEx (hashcat_ctx, cuda_module, binary, mod_cnt, mod_opts, mod_vals) == -1) + { + event_log_error (hashcat_ctx, "* Device #%u: Kernel %s load failed. Error Log:", device_param->device_id + 1, source_file); + event_log_error (hashcat_ctx, "%s", mod_error_log); + event_log_error (hashcat_ctx, NULL); + + return false; + } + + #if defined (DEBUG) + event_log_info (hashcat_ctx, "* Device #%u: Kernel %s load successful. Info Log:", device_param->device_id + 1, source_file); + event_log_info (hashcat_ctx, "%s", mod_info_log); + event_log_info (hashcat_ctx, NULL); + #endif + + if (cache_disable == false) + { + if (write_kernel_binary (hashcat_ctx, cached_file, binary, binary_size) == false) return false; + } + + #endif + + hcfree (mod_info_log); + hcfree (mod_error_log); + + hcfree (binary); + } + + /* + * HIP + */ + if (device_param->is_hip == true) + { + hiprtcProgram program; + + if (hc_hiprtcCreateProgram (hashcat_ctx, &program, kernel_sources[0], kernel_name, 0, NULL, NULL) == -1) return false; + + char **hiprtc_options = (char **) hccalloc (4 + strlen (build_options_buf) + 1, sizeof (char *)); // ... 
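+      // sizing mirrors the CUDA path above; the first four option slots are still
+      // placeholders in this first HIP draft and can later carry hiprtc-specific flags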
+ + hiprtc_options[0] = ""; + hiprtc_options[1] = ""; + hiprtc_options[2] = ""; + + hc_asprintf (&hiprtc_options[3], " "); + + char *hiprtc_options_string = hcstrdup (build_options_buf); + + const int num_options = 4 + hiprtc_make_options_array_from_string (hiprtc_options_string, hiprtc_options + 4); + + const int rc_hiprtcCompileProgram = hc_hiprtcCompileProgram (hashcat_ctx, program, num_options, (const char * const *) hiprtc_options); + + size_t build_log_size = 0; + + hc_hiprtcGetProgramLogSize (hashcat_ctx, program, &build_log_size); + + #if defined (DEBUG) + if ((build_log_size > 1) || (rc_hiprtcCompileProgram == -1)) + #else + if (rc_hiprtcCompileProgram == -1) + #endif + { + char *build_log = (char *) hcmalloc (build_log_size + 1); + + if (hc_hiprtcGetProgramLog (hashcat_ctx, program, build_log) == -1) return false; + + puts (build_log); + + hcfree (build_log); + } + + if (rc_hiprtcCompileProgram == -1) + { + event_log_error (hashcat_ctx, "* Device #%u: Kernel %s build failed.", device_param->device_id + 1, source_file); + + return false; + } + + hcfree (hiprtc_options); + hcfree (hiprtc_options_string); + + size_t binary_size = 0; + + if (hc_hiprtcGetCodeSize (hashcat_ctx, program, &binary_size) == -1) return false; + + char *binary = (char *) hcmalloc (binary_size); + + if (hc_hiprtcGetCode (hashcat_ctx, program, binary) == -1) return false; + + if (hc_hiprtcDestroyProgram (hashcat_ctx, &program) == -1) return false; + + #define LOG_SIZE 8192 + + char *mod_info_log = (char *) hcmalloc (LOG_SIZE + 1); + char *mod_error_log = (char *) hcmalloc (LOG_SIZE + 1); + + int mod_cnt = 6; + + HIPjit_option mod_opts[7]; + void *mod_vals[7]; + + mod_opts[0] = HIP_JIT_TARGET_FROM_HIPCONTEXT; + mod_vals[0] = (void *) 0; + + mod_opts[1] = HIP_JIT_LOG_VERBOSE; + mod_vals[1] = (void *) 1; + + mod_opts[2] = HIP_JIT_INFO_LOG_BUFFER; + mod_vals[2] = (void *) mod_info_log; + + mod_opts[3] = HIP_JIT_INFO_LOG_BUFFER_SIZE_BYTES; + mod_vals[3] = (void *) LOG_SIZE; + + mod_opts[4] = HIP_JIT_ERROR_LOG_BUFFER; + mod_vals[4] = (void *) mod_error_log; + + mod_opts[5] = HIP_JIT_ERROR_LOG_BUFFER_SIZE_BYTES; + mod_vals[5] = (void *) LOG_SIZE; + + if (hashconfig->opti_type & OPTI_TYPE_REGISTER_LIMIT) + { + mod_opts[6] = HIP_JIT_MAX_REGISTERS; + mod_vals[6] = (void *) 128; + + mod_cnt++; + } + + #if defined (WITH_HIPBIN) + + char *jit_info_log = (char *) hcmalloc (LOG_SIZE + 1); + char *jit_error_log = (char *) hcmalloc (LOG_SIZE + 1); + + int jit_cnt = 6; + + HIPjit_option jit_opts[7]; + void *jit_vals[7]; + + jit_opts[0] = HIP_JIT_TARGET_FROM_HIPCONTEXT; + jit_vals[0] = (void *) 0; + + jit_opts[1] = HIP_JIT_LOG_VERBOSE; + jit_vals[1] = (void *) 1; + + jit_opts[2] = HIP_JIT_INFO_LOG_BUFFER; + jit_vals[2] = (void *) jit_info_log; + + jit_opts[3] = HIP_JIT_INFO_LOG_BUFFER_SIZE_BYTES; + jit_vals[3] = (void *) LOG_SIZE; + + jit_opts[4] = HIP_JIT_ERROR_LOG_BUFFER; + jit_vals[4] = (void *) jit_error_log; + + jit_opts[5] = HIP_JIT_ERROR_LOG_BUFFER_SIZE_BYTES; + jit_vals[5] = (void *) LOG_SIZE; + + if (hashconfig->opti_type & OPTI_TYPE_REGISTER_LIMIT) + { + jit_opts[6] = HIP_JIT_MAX_REGISTERS; + jit_vals[6] = (void *) 128; + + jit_cnt++; + } + + HIPlinkState state; + + if (hc_hipLinkCreate (hashcat_ctx, jit_cnt, jit_opts, jit_vals, &state) == -1) + { + event_log_error (hashcat_ctx, "* Device #%u: Kernel %s link failed. 
Error Log:", device_param->device_id + 1, source_file); + event_log_error (hashcat_ctx, "%s", jit_error_log); + event_log_error (hashcat_ctx, NULL); + + return false; + } + + if (hc_hipLinkAddData (hashcat_ctx, state, HIP_JIT_INPUT_PTX, binary, binary_size, kernel_name, 0, NULL, NULL) == -1) + { + event_log_error (hashcat_ctx, "* Device #%u: Kernel %s link failed. Error Log:", device_param->device_id + 1, source_file); + event_log_error (hashcat_ctx, "%s", jit_error_log); + event_log_error (hashcat_ctx, NULL); + + return false; + } + + void *hipbin = NULL; + + size_t hipbin_size = 0; + + if (hc_hipLinkComplete (hashcat_ctx, state, &hipbin, &hipbin_size) == -1) + { + event_log_error (hashcat_ctx, "* Device #%u: Kernel %s link failed. Error Log:", device_param->device_id + 1, source_file); + event_log_error (hashcat_ctx, "%s", jit_error_log); + event_log_error (hashcat_ctx, NULL); + + return false; + } + + #if defined (DEBUG) + event_log_info (hashcat_ctx, "* Device #%u: Kernel %s link successful. Info Log:", device_param->device_id + 1, source_file); + event_log_info (hashcat_ctx, "%s", jit_info_log); + event_log_info (hashcat_ctx, NULL); + #endif + + if (hc_hipModuleLoadDataEx (hashcat_ctx, hip_module, hipbin, mod_cnt, mod_opts, mod_vals) == -1) + { + event_log_error (hashcat_ctx, "* Device #%u: Kernel %s load failed. Error Log:", device_param->device_id + 1, source_file); + event_log_error (hashcat_ctx, "%s", mod_error_log); + event_log_error (hashcat_ctx, NULL); + + return false; + } + + #if defined (DEBUG) + event_log_info (hashcat_ctx, "* Device #%u: Kernel %s load successful. Info Log:", device_param->device_id + 1, source_file); + event_log_info (hashcat_ctx, "%s", mod_info_log); + event_log_info (hashcat_ctx, NULL); + #endif + + if (cache_disable == false) + { + if (write_kernel_binary (hashcat_ctx, cached_file, hipbin, hipbin_size) == false) return false; + } + + if (hc_hipLinkDestroy (hashcat_ctx, state) == -1) return false; + + hcfree (jit_info_log); + hcfree (jit_error_log); + + #else + + if (hc_hipModuleLoadDataEx (hashcat_ctx, hip_module, binary, mod_cnt, mod_opts, mod_vals) == -1) + { + event_log_error (hashcat_ctx, "* Device #%u: Kernel %s load failed. Error Log:", device_param->device_id + 1, source_file); + event_log_error (hashcat_ctx, "%s", mod_error_log); + event_log_error (hashcat_ctx, NULL); + + return false; + } + + #if defined (DEBUG) + event_log_info (hashcat_ctx, "* Device #%u: Kernel %s load successful. 
Info Log:", device_param->device_id + 1, source_file); + event_log_info (hashcat_ctx, "%s", mod_info_log); + event_log_info (hashcat_ctx, NULL); + #endif + + if (cache_disable == false) + { + if (write_kernel_binary (hashcat_ctx, cached_file, binary, binary_size) == false) return false; + } + + #endif + + hcfree (mod_info_log); + hcfree (mod_error_log); + + hcfree (binary); + } + + /* + * OCL + */ + if (device_param->is_opencl == true) + { + if (hc_clCreateProgramWithSource (hashcat_ctx, device_param->opencl_context, 1, (const char **) kernel_sources, NULL, opencl_program) == -1) return false; + + const int CL_rc = hc_clBuildProgram (hashcat_ctx, *opencl_program, 1, &device_param->opencl_device, build_options_buf, NULL, NULL); + + //if (CL_rc == -1) return -1; + + size_t build_log_size = 0; + + hc_clGetProgramBuildInfo (hashcat_ctx, *opencl_program, device_param->opencl_device, CL_PROGRAM_BUILD_LOG, 0, NULL, &build_log_size); + + //if (CL_rc == -1) return -1; + + #if defined (DEBUG) + if ((build_log_size > 1) || (CL_rc == -1)) + #else + if (CL_rc == -1) + #endif + { + char *build_log = (char *) hcmalloc (build_log_size + 1); + + const int rc_clGetProgramBuildInfo = hc_clGetProgramBuildInfo (hashcat_ctx, *opencl_program, device_param->opencl_device, CL_PROGRAM_BUILD_LOG, build_log_size, build_log, NULL); + + if (rc_clGetProgramBuildInfo == -1) return false; + + puts (build_log); + + hcfree (build_log); + } + + if (CL_rc == -1) return false; + + if (cache_disable == false) + { + size_t binary_size; + + if (hc_clGetProgramInfo (hashcat_ctx, *opencl_program, CL_PROGRAM_BINARY_SIZES, sizeof (size_t), &binary_size, NULL) == -1) return false; + + char *binary = (char *) hcmalloc (binary_size); + + if (hc_clGetProgramInfo (hashcat_ctx, *opencl_program, CL_PROGRAM_BINARIES, sizeof (char *), &binary, NULL) == -1) return false; + + if (write_kernel_binary (hashcat_ctx, cached_file, binary, binary_size) == false) return false; + + hcfree (binary); + } + } + } + else + { + if (read_kernel_binary (hashcat_ctx, cached_file, kernel_lengths, kernel_sources) == false) return false; + + if (device_param->is_cuda == true) + { + #define LOG_SIZE 8192 + + char *mod_info_log = (char *) hcmalloc (LOG_SIZE + 1); + char *mod_error_log = (char *) hcmalloc (LOG_SIZE + 1); + + int mod_cnt = 6; + + CUjit_option mod_opts[7]; + void *mod_vals[7]; + + mod_opts[0] = CU_JIT_TARGET_FROM_CUCONTEXT; + mod_vals[0] = (void *) 0; + + mod_opts[1] = CU_JIT_LOG_VERBOSE; + mod_vals[1] = (void *) 1; + + mod_opts[2] = CU_JIT_INFO_LOG_BUFFER; + mod_vals[2] = (void *) mod_info_log; + + mod_opts[3] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; + mod_vals[3] = (void *) LOG_SIZE; + + mod_opts[4] = CU_JIT_ERROR_LOG_BUFFER; + mod_vals[4] = (void *) mod_error_log; + + mod_opts[5] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES; + mod_vals[5] = (void *) LOG_SIZE; + + if (hashconfig->opti_type & OPTI_TYPE_REGISTER_LIMIT) + { + mod_opts[6] = CU_JIT_MAX_REGISTERS; + mod_vals[6] = (void *) 128; + + mod_cnt++; + } + + if (hc_cuModuleLoadDataEx (hashcat_ctx, cuda_module, kernel_sources[0], mod_cnt, mod_opts, mod_vals) == -1) + { + event_log_error (hashcat_ctx, "* Device #%u: Kernel %s load failed. Error Log:", device_param->device_id + 1, source_file); + event_log_error (hashcat_ctx, "%s", mod_error_log); + event_log_error (hashcat_ctx, NULL); + + return false; + } + + #if defined (DEBUG) + event_log_info (hashcat_ctx, "* Device #%u: Kernel %s load successful. 
Info Log:", device_param->device_id + 1, source_file); + event_log_info (hashcat_ctx, "%s", mod_info_log); + event_log_info (hashcat_ctx, NULL); + #endif + + hcfree (mod_info_log); + hcfree (mod_error_log); + } + + /* + * HIP + */ + if (device_param->is_hip == true) + { + #define LOG_SIZE 8192 + + char *mod_info_log = (char *) hcmalloc (LOG_SIZE + 1); + char *mod_error_log = (char *) hcmalloc (LOG_SIZE + 1); + + int mod_cnt = 6; + + HIPjit_option mod_opts[7]; + void *mod_vals[7]; + + mod_opts[0] = HIP_JIT_TARGET_FROM_HIPCONTEXT; + mod_vals[0] = (void *) 0; + + mod_opts[1] = HIP_JIT_LOG_VERBOSE; + mod_vals[1] = (void *) 1; + + mod_opts[2] = HIP_JIT_INFO_LOG_BUFFER; + mod_vals[2] = (void *) mod_info_log; + + mod_opts[3] = HIP_JIT_INFO_LOG_BUFFER_SIZE_BYTES; + mod_vals[3] = (void *) LOG_SIZE; + + mod_opts[4] = HIP_JIT_ERROR_LOG_BUFFER; + mod_vals[4] = (void *) mod_error_log; + + mod_opts[5] = HIP_JIT_ERROR_LOG_BUFFER_SIZE_BYTES; + mod_vals[5] = (void *) LOG_SIZE; + + if (hashconfig->opti_type & OPTI_TYPE_REGISTER_LIMIT) + { + mod_opts[6] = HIP_JIT_MAX_REGISTERS; + mod_vals[6] = (void *) 128; + + mod_cnt++; + } + + if (hc_hipModuleLoadDataEx (hashcat_ctx, hip_module, kernel_sources[0], mod_cnt, mod_opts, mod_vals) == -1) + { + event_log_error (hashcat_ctx, "* Device #%u: Kernel %s load failed. Error Log:", device_param->device_id + 1, source_file); + event_log_error (hashcat_ctx, "%s", mod_error_log); + event_log_error (hashcat_ctx, NULL); + + return false; + } + + #if defined (DEBUG) + event_log_info (hashcat_ctx, "* Device #%u: Kernel %s Ctx %p load successful. Info Log:", device_param->device_id + 1, source_file, device_param->hip_context); + event_log_info (hashcat_ctx, "%s", mod_info_log); + event_log_info (hashcat_ctx, NULL); + #endif + + hcfree (mod_info_log); + hcfree (mod_error_log); + } + + /* + * OCL + */ + if (device_param->is_opencl == true) + { + if (hc_clCreateProgramWithBinary (hashcat_ctx, device_param->opencl_context, 1, &device_param->opencl_device, kernel_lengths, (const unsigned char **) kernel_sources, NULL, opencl_program) == -1) return false; + + if (hc_clBuildProgram (hashcat_ctx, *opencl_program, 1, &device_param->opencl_device, build_options_buf, NULL, NULL) == -1) return false; + } + } + + hcfree (kernel_sources[0]); + + return true; +} + +int backend_session_begin (hashcat_ctx_t *hashcat_ctx) +{ + const bitmap_ctx_t *bitmap_ctx = hashcat_ctx->bitmap_ctx; + const folder_config_t *folder_config = hashcat_ctx->folder_config; + const hashconfig_t *hashconfig = hashcat_ctx->hashconfig; + const hashes_t *hashes = hashcat_ctx->hashes; + const module_ctx_t *module_ctx = hashcat_ctx->module_ctx; + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; + const straight_ctx_t *straight_ctx = hashcat_ctx->straight_ctx; + const user_options_extra_t *user_options_extra = hashcat_ctx->user_options_extra; + const user_options_t *user_options = hashcat_ctx->user_options; + + if (backend_ctx->enabled == false) return 0; + + u64 size_total_host_all = 0; + + u32 hardware_power_all = 0; + + for (int backend_devices_idx = 0; backend_devices_idx < backend_ctx->backend_devices_cnt; backend_devices_idx++) + { + /** + * host buffer + */ + + hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx]; + + if (device_param->skipped == true) continue; + + EVENT_DATA (EVENT_BACKEND_DEVICE_INIT_PRE, &backend_devices_idx, sizeof (int)); + + const int device_id = device_param->device_id; + + /** + * module depending checks + */ + + device_param->skipped_warning = false; + + 
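+ // module-level checks: a hash-mode module can veto a device/runtime combination it knows to be broken, and --force overrides the veto at the user's own risk.
+ // A module opts in by implementing module_unstable_warning, e.g. (sketch only, not part of this patch):
+ // bool module_unstable_warning (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, MAYBE_UNUSED const hc_device_param_t *device_param)
+ // {
+ //   return (device_param->is_hip == true); // e.g. veto HIP devices while the port is a draft
+ // }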
if (module_ctx->module_unstable_warning != MODULE_DEFAULT) + { + const bool unstable_warning = module_ctx->module_unstable_warning (hashconfig, user_options, user_options_extra, device_param); +
+ if ((unstable_warning == true) && (user_options->force == false)) + { + event_log_warning (hashcat_ctx, "* Device #%u: Skipping hash-mode %u - known CUDA/HIP/OpenCL Runtime/Driver issue (not a hashcat issue)", device_id + 1, hashconfig->hash_mode); + event_log_warning (hashcat_ctx, " You can use --force to override, but do not report related errors."); +
+ device_param->skipped_warning = true; +
+ continue; + } + } +
+ // vector_width +
+ int vector_width = 0; +
+ if (user_options->backend_vector_width_chgd == false) + { + // tuning db +
+ tuning_db_entry_t *tuningdb_entry; +
+ if (user_options->slow_candidates == true) + { + tuningdb_entry = tuning_db_search (hashcat_ctx, device_param->device_name, device_param->opencl_device_type, 0, hashconfig->hash_mode); + } + else + { + tuningdb_entry = tuning_db_search (hashcat_ctx, device_param->device_name, device_param->opencl_device_type, user_options->attack_mode, hashconfig->hash_mode); + } +
+ if (tuningdb_entry == NULL || tuningdb_entry->vector_width == -1) + { + if (hashconfig->opti_type & OPTI_TYPE_USES_BITS_64) + { + if (device_param->is_cuda == true) + { + // cuda does not support this query +
+ vector_width = 1; + } +
+ if (device_param->is_hip == true) + { + // hip does not support this query either +
+ vector_width = 1; + } +
+ if (device_param->is_opencl == true) + { + if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG, sizeof (vector_width), &vector_width, NULL) == -1) return -1; + } + } + else + { + if (device_param->is_cuda == true) + { + // cuda does not support this query +
+ vector_width = 1; + } +
+ if (device_param->is_hip == true) + { + // hip does not support this query either +
+ vector_width = 1; + } +
+ if (device_param->is_opencl == true) + { + if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_NATIVE_VECTOR_WIDTH_INT, sizeof (vector_width), &vector_width, NULL) == -1) return -1; + } + } + } + else + { + vector_width = (cl_uint) tuningdb_entry->vector_width; + } + } + else + { + vector_width = user_options->backend_vector_width; + } +
+ // We can't have SIMD in kernels where we have an unknown final password length + // It also turns out that pure kernels (that have a higher register pressure) + // actually run faster on a scalar GPU (like a 1080) without SIMD +
+ if ((hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL) == 0) + { + if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) + { + vector_width = 1; + } + } +
+ if (vector_width > 16) vector_width = 16; +
+ device_param->vector_width = vector_width; +
+ /** + * kernel accel and loops tuning db adjustment + */ +
+ device_param->kernel_accel_min = hashconfig->kernel_accel_min; + device_param->kernel_accel_max = hashconfig->kernel_accel_max; + device_param->kernel_loops_min = hashconfig->kernel_loops_min; + device_param->kernel_loops_max = hashconfig->kernel_loops_max; + device_param->kernel_threads_min = hashconfig->kernel_threads_min; + device_param->kernel_threads_max = hashconfig->kernel_threads_max; +
+ tuning_db_entry_t *tuningdb_entry = NULL; +
+ if (user_options->slow_candidates == true) + { + tuningdb_entry = tuning_db_search (hashcat_ctx, device_param->device_name, device_param->opencl_device_type, 0, hashconfig->hash_mode); + } + else + { + tuningdb_entry = tuning_db_search (hashcat_ctx, device_param->device_name, device_param->opencl_device_type, user_options->attack_mode, 
hashconfig->hash_mode); + } +
+ // user commandline options override the tuning db + // but both have to stay inside the boundaries of the module +
+ if (user_options->kernel_accel_chgd == true) + { + const u32 _kernel_accel = user_options->kernel_accel; +
+ if ((_kernel_accel >= device_param->kernel_accel_min) && (_kernel_accel <= device_param->kernel_accel_max)) + { + device_param->kernel_accel_min = _kernel_accel; + device_param->kernel_accel_max = _kernel_accel; + } + } + else + { + if (tuningdb_entry != NULL) + { + const u32 _kernel_accel = tuningdb_entry->kernel_accel; +
+ if (_kernel_accel) + { + if ((_kernel_accel >= device_param->kernel_accel_min) && (_kernel_accel <= device_param->kernel_accel_max)) + { + device_param->kernel_accel_min = _kernel_accel; + device_param->kernel_accel_max = _kernel_accel; + } + } + } + } +
+ if (user_options->kernel_loops_chgd == true) + { + const u32 _kernel_loops = user_options->kernel_loops; +
+ if ((_kernel_loops >= device_param->kernel_loops_min) && (_kernel_loops <= device_param->kernel_loops_max)) + { + device_param->kernel_loops_min = _kernel_loops; + device_param->kernel_loops_max = _kernel_loops; + } + } + else + { + if (tuningdb_entry != NULL) + { + u32 _kernel_loops = tuningdb_entry->kernel_loops; +
+ if (_kernel_loops) + { + if (user_options->workload_profile == 1) + { + _kernel_loops = (_kernel_loops > 8) ? _kernel_loops / 8 : 1; + } + else if (user_options->workload_profile == 2) + { + _kernel_loops = (_kernel_loops > 4) ? _kernel_loops / 4 : 1; + } +
+ if ((_kernel_loops >= device_param->kernel_loops_min) && (_kernel_loops <= device_param->kernel_loops_max)) + { + device_param->kernel_loops_min = _kernel_loops; + device_param->kernel_loops_max = _kernel_loops; + } + } + } + } +
+ // there's no thread column in the tuning db, stick to the commandline value if defined +
+ if (user_options->kernel_threads_chgd == true) + { + const u32 _kernel_threads = user_options->kernel_threads; +
+ if ((_kernel_threads >= device_param->kernel_threads_min) && (_kernel_threads <= device_param->kernel_threads_max)) + { + device_param->kernel_threads_min = _kernel_threads; + device_param->kernel_threads_max = _kernel_threads; + } + } +
+ if (user_options->slow_candidates == true) + { + } + else + { + // we have some absolute limits for fast hashes (because of limited constant memory), make sure not to overstep +
+ if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL) + { + if (user_options_extra->attack_kern == ATTACK_KERN_STRAIGHT) + { + device_param->kernel_loops_min = MIN (device_param->kernel_loops_min, KERNEL_RULES); + device_param->kernel_loops_max = MIN (device_param->kernel_loops_max, KERNEL_RULES); + } + else if (user_options_extra->attack_kern == ATTACK_KERN_COMBI) + { + device_param->kernel_loops_min = MIN (device_param->kernel_loops_min, KERNEL_COMBS); + device_param->kernel_loops_max = MIN (device_param->kernel_loops_max, KERNEL_COMBS); + } + else if (user_options_extra->attack_kern == ATTACK_KERN_BF) + { + device_param->kernel_loops_min = MIN (device_param->kernel_loops_min, KERNEL_BFS); + device_param->kernel_loops_max = MIN (device_param->kernel_loops_max, KERNEL_BFS); + } + } + } +
+ device_param->kernel_loops_min_sav = device_param->kernel_loops_min; + device_param->kernel_loops_max_sav = device_param->kernel_loops_max; +
+ /** + * device properties + */ +
+ const u32 device_processors = device_param->device_processors; +
+ /** + * create context for each device + */ +
+ if (device_param->is_cuda == true) + { + if (hc_cuCtxCreate (hashcat_ctx, 
&device_param->cuda_context, CU_CTX_SCHED_BLOCKING_SYNC, device_param->cuda_device) == -1) return -1; + } + + if (device_param->is_hip == true) + { + if (hc_hipCtxCreate (hashcat_ctx, &device_param->hip_context, HIP_CTX_SCHED_BLOCKING_SYNC, device_param->hip_device) == -1) return -1; + } + + if (device_param->is_opencl == true) + { + /* + cl_context_properties properties[3]; + + properties[0] = CL_CONTEXT_PLATFORM; + properties[1] = (cl_context_properties) device_param->opencl_platform; + properties[2] = 0; + + CL_rc = hc_clCreateContext (hashcat_ctx, properties, 1, &device_param->opencl_device, NULL, NULL, &device_param->opencl_context); + */ + + if (hc_clCreateContext (hashcat_ctx, NULL, 1, &device_param->opencl_device, NULL, NULL, &device_param->opencl_context) == -1) return -1; + + /** + * create command-queue + */ + + // not supported with NV + // device_param->opencl_command_queue = hc_clCreateCommandQueueWithProperties (hashcat_ctx, device_param->opencl_device, NULL); + + if (hc_clCreateCommandQueue (hashcat_ctx, device_param->opencl_context, device_param->opencl_device, CL_QUEUE_PROFILING_ENABLE, &device_param->opencl_command_queue) == -1) return -1; + } + + /** + * create stream for CUDA devices + */ + + if (device_param->is_cuda == true) + { + if (hc_cuStreamCreate (hashcat_ctx, &device_param->cuda_stream, CU_STREAM_DEFAULT) == -1) return -1; + } + + /** + * create stream for HIP devices + */ + + if (device_param->is_hip == true) + { + if (hc_hipStreamCreate (hashcat_ctx, &device_param->hip_stream, HIP_STREAM_DEFAULT) == -1) return -1; + } + + /** + * create events for CUDA devices + */ + + if (device_param->is_cuda == true) + { + if (hc_cuEventCreate (hashcat_ctx, &device_param->cuda_event1, CU_EVENT_DEFAULT) == -1) return -1; + + if (hc_cuEventCreate (hashcat_ctx, &device_param->cuda_event2, CU_EVENT_DEFAULT) == -1) return -1; + } + + /** + * create events for HIP devices + */ + + if (device_param->is_hip == true) + { + if (hc_hipEventCreate (hashcat_ctx, &device_param->hip_event1, HIP_EVENT_DEFAULT) == -1) return -1; + + if (hc_hipEventCreate (hashcat_ctx, &device_param->hip_event2, HIP_EVENT_DEFAULT) == -1) return -1; + } + + /** + * create input buffers on device : calculate size of fixed memory buffers + */ + + u64 size_root_css = SP_PW_MAX * sizeof (cs_t); + u64 size_markov_css = SP_PW_MAX * CHARSIZ * sizeof (cs_t); + + device_param->size_root_css = size_root_css; + device_param->size_markov_css = size_markov_css; + + u64 size_results = sizeof (u32); + + device_param->size_results = size_results; + + u64 size_rules = (u64) straight_ctx->kernel_rules_cnt * sizeof (kernel_rule_t); + u64 size_rules_c = (u64) KERNEL_RULES * sizeof (kernel_rule_t); + + device_param->size_rules = size_rules; + device_param->size_rules_c = size_rules_c; + + u64 size_plains = (u64) hashes->digests_cnt * sizeof (plain_t); + u64 size_salts = (u64) hashes->salts_cnt * sizeof (salt_t); + u64 size_esalts = (u64) hashes->digests_cnt * hashconfig->esalt_size; + u64 size_shown = (u64) hashes->digests_cnt * sizeof (u32); + u64 size_digests = (u64) hashes->digests_cnt * (u64) hashconfig->dgst_size; + + device_param->size_plains = size_plains; + device_param->size_digests = size_digests; + device_param->size_shown = size_shown; + device_param->size_salts = size_salts; + device_param->size_esalts = size_esalts; + + u64 size_combs = KERNEL_COMBS * sizeof (pw_t); + u64 size_bfs = KERNEL_BFS * sizeof (bf_t); + u64 size_tm = 32 * sizeof (bs_word_t); + + device_param->size_bfs = size_bfs; + 
device_param->size_combs = size_combs; + device_param->size_tm = size_tm; +
+ u64 size_st_digests = 1 * hashconfig->dgst_size; + u64 size_st_salts = 1 * sizeof (salt_t); + u64 size_st_esalts = 1 * hashconfig->esalt_size; +
+ device_param->size_st_digests = size_st_digests; + device_param->size_st_salts = size_st_salts; + device_param->size_st_esalts = size_st_esalts; +
+ u64 size_extra_buffer = 4; +
+ if (module_ctx->module_extra_buffer_size != MODULE_DEFAULT) + { + const u64 extra_buffer_size = module_ctx->module_extra_buffer_size (hashconfig, user_options, user_options_extra, hashes, device_param); +
+ if (extra_buffer_size == (u64) -1) + { + event_log_error (hashcat_ctx, "Invalid extra buffer size."); +
+ return -1; + } +
+ device_param->extra_buffer_size = extra_buffer_size; +
+ size_extra_buffer = extra_buffer_size; + } +
+ // kern type +
+ u32 kern_type = hashconfig->kern_type; +
+ if (module_ctx->module_kern_type_dynamic != MODULE_DEFAULT) + { + if (user_options->benchmark == true) + { + } + else + { + void *digests_buf = hashes->digests_buf; + salt_t *salts_buf = hashes->salts_buf; + void *esalts_buf = hashes->esalts_buf; + void *hook_salts_buf = hashes->hook_salts_buf; + hashinfo_t **hash_info = hashes->hash_info; +
+ hashinfo_t *hash_info_ptr = NULL; +
+ if (hash_info) hash_info_ptr = hash_info[0]; +
+ kern_type = (u32) module_ctx->module_kern_type_dynamic (hashconfig, digests_buf, salts_buf, esalts_buf, hook_salts_buf, hash_info_ptr); + } + } +
+ // build options +
+ const size_t build_options_sz = 4096; +
+ char *build_options_buf = (char *) hcmalloc (build_options_sz); +
+ int build_options_len = 0; +
+ // NOTE: the HIP include path below is hardcoded to the default ROCm install location and is only added on non-Windows builds +
+ #if defined (_WIN) + build_options_len += snprintf (build_options_buf + build_options_len, build_options_sz - build_options_len, "-D KERNEL_STATIC -I OpenCL -I \"%s\" ", folder_config->cpath_real); + #else + build_options_len += snprintf (build_options_buf + build_options_len, build_options_sz - build_options_len, "-D KERNEL_STATIC -I /opt/rocm/hip/include/hip/ -I OpenCL -I %s ", folder_config->cpath_real); + #endif +
+ /* currently disabled, hangs NEO drivers since 20.09. + was required for NEO driver 20.08 to workaround the same issue! 
+ we go with the latest version +
+ if (device_param->is_opencl == true) + { + if (device_param->use_opencl12 == true) + { + build_options_len += snprintf (build_options_buf + build_options_len, build_options_sz - build_options_len, "-cl-std=CL1.2 "); + } + else if (device_param->use_opencl20 == true) + { + build_options_len += snprintf (build_options_buf + build_options_len, build_options_sz - build_options_len, "-cl-std=CL2.0 "); + } + else if (device_param->use_opencl21 == true) + { + build_options_len += snprintf (build_options_buf + build_options_len, build_options_sz - build_options_len, "-cl-std=CL2.1 "); + } + } + */ +
+ // we don't have sm_* on vendors not NV but it doesn't matter +
+ // NOTE: the HIP offload architecture is hardcoded to gfx908 (MI100) in this draft; _XXX_CUDA_ARCH consumes the sm_* argument without defining CUDA_ARCH +
+ #if defined (DEBUG)
+ build_options_len += snprintf (build_options_buf + build_options_len, build_options_sz - build_options_len, "-O3 -march=native -minline-all-stringops -ftracer -funroll-loops -fprefetch-loop-arrays -ffast-math -fno-stack-protector -Wno-error -Wall -Wextra -Wshadow -Wno-unused-function -Wno-unused-parameter -Wno-unused-local-typedefs -Wno-unknown-pragmas -Wno-write-strings -fno-gpu-rdc -Wno-invalid-command-line-argument -Wno-unused-command-line-argument -Wno-invalid-constexpr -Wno-ignored-optimization-argument -Wno-unused-private-field -D LOCAL_MEM_TYPE=%d -D VENDOR_ID=%u --amdgpu-target=gfx908 -D _XXX_CUDA_ARCH=%u -D HAS_ADD=%u -D HAS_ADDC=%u -D HAS_SUB=%u -D HAS_SUBC=%u -D HAS_VADD=%u -D HAS_VADDC=%u -D HAS_VADD_CO=%u -D HAS_VADDC_CO=%u -D HAS_VSUB=%u -D HAS_VSUBB=%u -D HAS_VSUB_CO=%u -D HAS_VSUBB_CO=%u -D HAS_VPERM=%u -D HAS_VADD3=%u -D HAS_VBFE=%u -D HAS_BFE=%u -D HAS_LOP3=%u -D HAS_MOV64=%u -D HAS_PRMT=%u -D VECT_SIZE=%d -D DEVICE_TYPE=%u -D DGST_R0=%u -D DGST_R1=%u -D DGST_R2=%u -D DGST_R3=%u -D DGST_ELEM=%u -D KERN_TYPE=%u -D ATTACK_EXEC=%u -D ATTACK_KERN=%u ", device_param->device_local_mem_type, device_param->opencl_platform_vendor_id, (device_param->sm_major * 100) + (device_param->sm_minor * 10), device_param->has_add, device_param->has_addc, device_param->has_sub, device_param->has_subc, device_param->has_vadd, device_param->has_vaddc, device_param->has_vadd_co, device_param->has_vaddc_co, device_param->has_vsub, device_param->has_vsubb, device_param->has_vsub_co, device_param->has_vsubb_co, device_param->has_vperm, device_param->has_vadd3, device_param->has_vbfe, device_param->has_bfe, device_param->has_lop3, device_param->has_mov64, device_param->has_prmt, device_param->vector_width, (u32) device_param->opencl_device_type, hashconfig->dgst_pos0, hashconfig->dgst_pos1, hashconfig->dgst_pos2, hashconfig->dgst_pos3, hashconfig->dgst_size / 4, kern_type, hashconfig->attack_exec, user_options_extra->attack_kern);
+ //build_options_len += snprintf (build_options_buf + build_options_len, build_options_sz - build_options_len, "-D LOCAL_MEM_TYPE=%d -D VENDOR_ID=%u -D CUDA_ARCH=%u -D HAS_ADD=%u -D HAS_ADDC=%u -D HAS_SUB=%u -D HAS_SUBC=%u -D HAS_VADD=%u -D HAS_VADDC=%u -D HAS_VADD_CO=%u -D HAS_VADDC_CO=%u -D HAS_VSUB=%u -D HAS_VSUBB=%u -D HAS_VSUB_CO=%u -D HAS_VSUBB_CO=%u -D HAS_VPERM=%u -D HAS_VADD3=%u -D HAS_VBFE=%u -D HAS_BFE=%u -D HAS_LOP3=%u -D HAS_MOV64=%u -D HAS_PRMT=%u -D VECT_SIZE=%d -D DEVICE_TYPE=%u -D DGST_R0=%u -D DGST_R1=%u -D DGST_R2=%u -D DGST_R3=%u -D DGST_ELEM=%u -D KERN_TYPE=%u -D ATTACK_EXEC=%u -D ATTACK_KERN=%u ", device_param->device_local_mem_type, device_param->opencl_platform_vendor_id, (device_param->sm_major * 100) + (device_param->sm_minor * 10), device_param->has_add, device_param->has_addc, device_param->has_sub, device_param->has_subc, device_param->has_vadd, device_param->has_vaddc, device_param->has_vadd_co, device_param->has_vaddc_co, device_param->has_vsub, device_param->has_vsubb, device_param->has_vsub_co, device_param->has_vsubb_co, device_param->has_vperm, device_param->has_vadd3, device_param->has_vbfe, device_param->has_bfe, device_param->has_lop3, device_param->has_mov64, device_param->has_prmt, device_param->vector_width, (u32) device_param->opencl_device_type, hashconfig->dgst_pos0, hashconfig->dgst_pos1, hashconfig->dgst_pos2, hashconfig->dgst_pos3, hashconfig->dgst_size / 4, kern_type, hashconfig->attack_exec, user_options_extra->attack_kern);
+ #else
+ //build_options_len += snprintf (build_options_buf + build_options_len, build_options_sz - build_options_len, "-D LOCAL_MEM_TYPE=%d -D VENDOR_ID=%u -D CUDA_ARCH=%u -D HAS_ADD=%u -D HAS_ADDC=%u -D HAS_SUB=%u -D HAS_SUBC=%u -D HAS_VADD=%u -D HAS_VADDC=%u -D HAS_VADD_CO=%u -D HAS_VADDC_CO=%u -D HAS_VSUB=%u -D HAS_VSUBB=%u -D HAS_VSUB_CO=%u -D HAS_VSUBB_CO=%u -D HAS_VPERM=%u -D HAS_VADD3=%u -D HAS_VBFE=%u -D HAS_BFE=%u -D HAS_LOP3=%u -D HAS_MOV64=%u -D HAS_PRMT=%u -D VECT_SIZE=%d -D DEVICE_TYPE=%u -D DGST_R0=%u -D DGST_R1=%u -D DGST_R2=%u -D DGST_R3=%u -D DGST_ELEM=%u -D KERN_TYPE=%u -D ATTACK_EXEC=%u -D ATTACK_KERN=%u -w ", device_param->device_local_mem_type, device_param->opencl_platform_vendor_id, (device_param->sm_major * 100) + (device_param->sm_minor * 10), device_param->has_add, device_param->has_addc, device_param->has_sub, device_param->has_subc, device_param->has_vadd, device_param->has_vaddc, device_param->has_vadd_co, device_param->has_vaddc_co, device_param->has_vsub, device_param->has_vsubb, device_param->has_vsub_co, device_param->has_vsubb_co, device_param->has_vperm, device_param->has_vadd3, device_param->has_vbfe, device_param->has_bfe, device_param->has_lop3, device_param->has_mov64, device_param->has_prmt, device_param->vector_width, (u32) device_param->opencl_device_type, hashconfig->dgst_pos0, hashconfig->dgst_pos1, hashconfig->dgst_pos2, hashconfig->dgst_pos3, hashconfig->dgst_size / 4, kern_type, hashconfig->attack_exec, user_options_extra->attack_kern);
+ build_options_len += snprintf (build_options_buf + build_options_len, build_options_sz - build_options_len, "-mllvm -amdgpu-spill-vgpr-to-agpr=false -O3 -march=native -minline-all-stringops -ftracer -funroll-loops -fprefetch-loop-arrays -ffast-math -fno-stack-protector -Wno-error -Wall -Wextra -Wshadow -Wno-unused-function -Wno-unused-parameter -Wno-unused-local-typedefs -Wno-unknown-pragmas -Wno-write-strings -fno-gpu-rdc -Wno-invalid-command-line-argument -Wno-unused-command-line-argument -Wno-invalid-constexpr -Wno-ignored-optimization-argument -Wno-unused-private-field -D IS_HIP -D LOCAL_MEM_TYPE=%d -D VENDOR_ID=%u --amdgpu-target=gfx908 -D _XXX_CUDA_ARCH=%u -D HAS_ADD=%u -D HAS_ADDC=%u -D HAS_SUB=%u -D HAS_SUBC=%u -D HAS_VADD=%u -D HAS_VADDC=%u -D HAS_VADD_CO=%u -D HAS_VADDC_CO=%u -D HAS_VSUB=%u -D HAS_VSUBB=%u -D HAS_VSUB_CO=%u -D HAS_VSUBB_CO=%u -D HAS_VPERM=%u -D HAS_VADD3=%u -D HAS_VBFE=%u -D HAS_BFE=%u -D HAS_LOP3=%u -D HAS_MOV64=%u -D HAS_PRMT=%u -D VECT_SIZE=%d -D DEVICE_TYPE=%u -D DGST_R0=%u -D DGST_R1=%u -D DGST_R2=%u -D DGST_R3=%u -D DGST_ELEM=%u -D KERN_TYPE=%u -D ATTACK_EXEC=%u -D ATTACK_KERN=%u -w ", device_param->device_local_mem_type, device_param->opencl_platform_vendor_id, (device_param->sm_major * 100) + (device_param->sm_minor * 10), device_param->has_add, device_param->has_addc, device_param->has_sub, device_param->has_subc, device_param->has_vadd, device_param->has_vaddc, device_param->has_vadd_co, device_param->has_vaddc_co, device_param->has_vsub, device_param->has_vsubb, device_param->has_vsub_co, device_param->has_vsubb_co, device_param->has_vperm, device_param->has_vadd3, device_param->has_vbfe, device_param->has_bfe, device_param->has_lop3, device_param->has_mov64, device_param->has_prmt, device_param->vector_width, (u32) device_param->opencl_device_type, hashconfig->dgst_pos0, hashconfig->dgst_pos1, hashconfig->dgst_pos2, hashconfig->dgst_pos3, hashconfig->dgst_size / 4, kern_type, hashconfig->attack_exec, user_options_extra->attack_kern);
+ #endif +
+ build_options_buf[build_options_len] = 0; +
+ /* + if (device_param->opencl_device_type & CL_DEVICE_TYPE_CPU) + { + if (device_param->opencl_platform_vendor_id == VENDOR_ID_INTEL_SDK) + { + strncat (build_options_buf, " -cl-opt-disable", 16); + } + } + */ +
+ #if defined (DEBUG) + if (user_options->quiet == false) event_log_warning (hashcat_ctx, "* Device #%u: build_options '%s'", device_id + 1, build_options_buf); + #endif +
+ /** + * device_name_chksum + */ +
+ char *device_name_chksum = (char *) hcmalloc (HCBUFSIZ_TINY); + char *device_name_chksum_amp_mp = (char *) hcmalloc (HCBUFSIZ_TINY); +
+ const size_t dnclen = snprintf (device_name_chksum, HCBUFSIZ_TINY, "%d-%d-%d-%d-%u-%s-%s-%s-%d-%u", + backend_ctx->comptime, + backend_ctx->cuda_driver_version, + backend_ctx->hip_driver_version, + device_param->is_opencl, + device_param->opencl_platform_vendor_id, + device_param->device_name, + device_param->opencl_device_version, + device_param->opencl_driver_version, + device_param->vector_width, + hashconfig->kern_type); +
+ const size_t dnclen_amp_mp = snprintf (device_name_chksum_amp_mp, HCBUFSIZ_TINY, "%d-%d-%d-%d-%u-%s-%s-%s", + backend_ctx->comptime, + backend_ctx->cuda_driver_version, + backend_ctx->hip_driver_version, + device_param->is_opencl, + device_param->opencl_platform_vendor_id, + device_param->device_name, + device_param->opencl_device_version, + device_param->opencl_driver_version); +
+ md5_ctx_t md5_ctx; +
+ md5_init (&md5_ctx); + md5_update (&md5_ctx, (u32 *) device_name_chksum, dnclen); + md5_final (&md5_ctx); +
+ snprintf (device_name_chksum, HCBUFSIZ_TINY, "%08x", md5_ctx.h[0]); +
+ md5_init (&md5_ctx); + md5_update (&md5_ctx, (u32 *) device_name_chksum_amp_mp, dnclen_amp_mp); + md5_final (&md5_ctx); +
+ snprintf (device_name_chksum_amp_mp, HCBUFSIZ_TINY, "%08x", md5_ctx.h[0]); +
+ /** + * kernel cache + */ +
+ bool cache_disable = false; +
+ // Seems to be completely broken on Apple + (Intel?) 
CPU + // To reproduce set cache_disable to false and run benchmark -b + + if (device_param->opencl_platform_vendor_id == VENDOR_ID_APPLE) + { + if (device_param->opencl_device_type & CL_DEVICE_TYPE_CPU) + { + cache_disable = true; + } + } + + if (module_ctx->module_jit_cache_disable != MODULE_DEFAULT) + { + cache_disable = module_ctx->module_jit_cache_disable (hashconfig, user_options, user_options_extra, hashes, device_param); + } + + /** + * shared kernel with no hashconfig dependencies + */ + + { + /** + * kernel shared source filename + */ + + char source_file[256] = { 0 }; + + generate_source_kernel_shared_filename (folder_config->shared_dir, source_file); + + if (hc_path_read (source_file) == false) + { + event_log_error (hashcat_ctx, "%s: %s", source_file, strerror (errno)); + + return -1; + } + + /** + * kernel shared cached filename + */ + + char cached_file[256] = { 0 }; + + generate_cached_kernel_shared_filename (folder_config->profile_dir, device_name_chksum_amp_mp, cached_file); + + const bool rc_load_kernel = load_kernel (hashcat_ctx, device_param, "shared_kernel", source_file, cached_file, build_options_buf, cache_disable, &device_param->opencl_program_shared, &device_param->cuda_module_shared, &device_param->hip_module_shared); + + if (rc_load_kernel == false) + { + event_log_error (hashcat_ctx, "* Device #%u: Kernel %s build failed.", device_param->device_id + 1, source_file); + + return -1; + } + + if (device_param->is_cuda == true) + { + // GPU memset + + if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function_memset, device_param->cuda_module_shared, "gpu_memset") == -1) return -1; + + if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function_memset, &device_param->kernel_wgs_memset) == -1) return -1; + + if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function_memset, &device_param->kernel_local_mem_size_memset) == -1) return -1; + + if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function_memset, &device_param->kernel_dynamic_local_mem_size_memset) == -1) return -1; + + device_param->kernel_preferred_wgs_multiple_memset = device_param->cuda_warp_size; + + //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_memset, 0, sizeof (cl_mem), device_param->kernel_params_memset[0]); if (CL_rc == -1) return -1; + //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_memset, 1, sizeof (cl_uint), device_param->kernel_params_memset[1]); if (CL_rc == -1) return -1; + //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_memset, 2, sizeof (cl_ulong), device_param->kernel_params_memset[2]); if (CL_rc == -1) return -1; + + // GPU autotune init + + if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function_atinit, device_param->cuda_module_shared, "gpu_atinit") == -1) return -1; + + if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function_atinit, &device_param->kernel_wgs_atinit) == -1) return -1; + + if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function_atinit, &device_param->kernel_local_mem_size_atinit) == -1) return -1; + + if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function_atinit, &device_param->kernel_dynamic_local_mem_size_atinit) == -1) return -1; + + device_param->kernel_preferred_wgs_multiple_atinit = device_param->cuda_warp_size; + + // CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_atinit, 0, sizeof (cl_mem), device_param->kernel_params_atinit[0]); if (CL_rc == 
-1) return -1; + // CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_atinit, 1, sizeof (cl_ulong), device_param->kernel_params_atinit[1]); if (CL_rc == -1) return -1; + + // GPU decompress + + if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function_decompress, device_param->cuda_module_shared, "gpu_decompress") == -1) return -1; + + if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function_decompress, &device_param->kernel_wgs_decompress) == -1) return -1; + + if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function_decompress, &device_param->kernel_local_mem_size_decompress) == -1) return -1; + + if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function_decompress, &device_param->kernel_dynamic_local_mem_size_decompress) == -1) return -1; + + device_param->kernel_preferred_wgs_multiple_decompress = device_param->cuda_warp_size; + } + + /* + * HIP + */ + if (device_param->is_hip == true) + { + // GPU memset + + if (hc_hipModuleGetFunction (hashcat_ctx, &device_param->hip_function_memset, device_param->hip_module_shared, "gpu_memset") == -1) return -1; + + if (get_hip_kernel_wgs (hashcat_ctx, device_param->hip_function_memset, &device_param->kernel_wgs_memset) == -1) return -1; + + if (get_hip_kernel_local_mem_size (hashcat_ctx, device_param->hip_function_memset, &device_param->kernel_local_mem_size_memset) == -1) return -1; + + if (get_hip_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->hip_function_memset, &device_param->kernel_dynamic_local_mem_size_memset) == -1) return -1; + + device_param->kernel_preferred_wgs_multiple_memset = device_param->hip_warp_size; + + //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_memset, 0, sizeof (cl_mem), device_param->kernel_params_memset[0]); if (CL_rc == -1) return -1; + //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_memset, 1, sizeof (cl_uint), device_param->kernel_params_memset[1]); if (CL_rc == -1) return -1; + //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_memset, 2, sizeof (cl_ulong), device_param->kernel_params_memset[2]); if (CL_rc == -1) return -1; + + // GPU autotune init + + if (hc_hipModuleGetFunction (hashcat_ctx, &device_param->hip_function_atinit, device_param->hip_module_shared, "gpu_atinit") == -1) return -1; + + if (get_hip_kernel_wgs (hashcat_ctx, device_param->hip_function_atinit, &device_param->kernel_wgs_atinit) == -1) return -1; + + if (get_hip_kernel_local_mem_size (hashcat_ctx, device_param->hip_function_atinit, &device_param->kernel_local_mem_size_atinit) == -1) return -1; + + if (get_hip_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->hip_function_atinit, &device_param->kernel_dynamic_local_mem_size_atinit) == -1) return -1; + + device_param->kernel_preferred_wgs_multiple_atinit = device_param->hip_warp_size; + + // CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_atinit, 0, sizeof (cl_mem), device_param->kernel_params_atinit[0]); if (CL_rc == -1) return -1; + // CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_atinit, 1, sizeof (cl_ulong), device_param->kernel_params_atinit[1]); if (CL_rc == -1) return -1; + + // GPU decompress + + if (hc_hipModuleGetFunction (hashcat_ctx, &device_param->hip_function_decompress, device_param->hip_module_shared, "gpu_decompress") == -1) return -1; + + if (get_hip_kernel_wgs (hashcat_ctx, device_param->hip_function_decompress, &device_param->kernel_wgs_decompress) == -1) return -1; + 
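+ // as on the CUDA path above, the static and the dynamic local (shared) memory footprints are queried separately; both are reused later when the launch dimensions are chosen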
+ if (get_hip_kernel_local_mem_size (hashcat_ctx, device_param->hip_function_decompress, &device_param->kernel_local_mem_size_decompress) == -1) return -1; + + if (get_hip_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->hip_function_decompress, &device_param->kernel_dynamic_local_mem_size_decompress) == -1) return -1; + + device_param->kernel_preferred_wgs_multiple_decompress = device_param->hip_warp_size; + } + + /* + * OCL + */ + if (device_param->is_opencl == true) + { + // GPU memset + + if (hc_clCreateKernel (hashcat_ctx, device_param->opencl_program_shared, "gpu_memset", &device_param->opencl_kernel_memset) == -1) return -1; + + if (get_opencl_kernel_wgs (hashcat_ctx, device_param, device_param->opencl_kernel_memset, &device_param->kernel_wgs_memset) == -1) return -1; + + if (get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_memset, &device_param->kernel_local_mem_size_memset) == -1) return -1; + + if (get_opencl_kernel_dynamic_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_memset, &device_param->kernel_dynamic_local_mem_size_memset) == -1) return -1; + + if (get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel_memset, &device_param->kernel_preferred_wgs_multiple_memset) == -1) return -1; + + // GPU autotune init + + if (hc_clCreateKernel (hashcat_ctx, device_param->opencl_program_shared, "gpu_atinit", &device_param->opencl_kernel_atinit) == -1) return -1; + + if (get_opencl_kernel_wgs (hashcat_ctx, device_param, device_param->opencl_kernel_atinit, &device_param->kernel_wgs_atinit) == -1) return -1; + + if (get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_atinit, &device_param->kernel_local_mem_size_atinit) == -1) return -1; + + if (get_opencl_kernel_dynamic_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_atinit, &device_param->kernel_dynamic_local_mem_size_atinit) == -1) return -1; + + if (get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel_atinit, &device_param->kernel_preferred_wgs_multiple_atinit) == -1) return -1; + + // GPU decompress + + if (hc_clCreateKernel (hashcat_ctx, device_param->opencl_program_shared, "gpu_decompress", &device_param->opencl_kernel_decompress) == -1) return -1; + + if (get_opencl_kernel_wgs (hashcat_ctx, device_param, device_param->opencl_kernel_decompress, &device_param->kernel_wgs_decompress) == -1) return -1; + + if (get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_decompress, &device_param->kernel_local_mem_size_decompress) == -1) return -1; + + if (get_opencl_kernel_dynamic_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_decompress, &device_param->kernel_dynamic_local_mem_size_decompress) == -1) return -1; + + if (get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel_decompress, &device_param->kernel_preferred_wgs_multiple_decompress) == -1) return -1; + } + } + + /** + * main kernel + */ + + { + char *build_options_module_buf = (char *) hcmalloc (build_options_sz); + + int build_options_module_len = 0; + + build_options_module_len += snprintf (build_options_module_buf + build_options_module_len, build_options_sz - build_options_module_len, "%s ", build_options_buf); + + if (module_ctx->module_jit_build_options != MODULE_DEFAULT) + { + char *jit_build_options = module_ctx->module_jit_build_options (hashconfig, user_options, 
user_options_extra, hashes, device_param); + + if (jit_build_options != NULL) + { + build_options_module_len += snprintf (build_options_module_buf + build_options_module_len, build_options_sz - build_options_module_len, "%s", jit_build_options); + + // this is a bit ugly + // would be better to have the module return the value as value + + u32 fixed_local_size = 0; + + if (sscanf (jit_build_options, "-D FIXED_LOCAL_SIZE=%u", &fixed_local_size) == 1) + { + device_param->kernel_threads_min = fixed_local_size; + device_param->kernel_threads_max = fixed_local_size; + } + } + } + + build_options_module_buf[build_options_module_len] = 0; + + #if defined (DEBUG) + if (user_options->quiet == false) event_log_warning (hashcat_ctx, "* Device #%u: build_options_module '%s'", device_id + 1, build_options_module_buf); + #endif + + /** + * kernel source filename + */ + + char source_file[256] = { 0 }; + + generate_source_kernel_filename (user_options->slow_candidates, hashconfig->attack_exec, user_options_extra->attack_kern, kern_type, hashconfig->opti_type, folder_config->shared_dir, source_file); + + if (hc_path_read (source_file) == false) + { + event_log_error (hashcat_ctx, "%s: %s", source_file, strerror (errno)); + + return -1; + } + + /** + * kernel cached filename + */ + + char cached_file[256] = { 0 }; + + generate_cached_kernel_filename (user_options->slow_candidates, hashconfig->attack_exec, user_options_extra->attack_kern, kern_type, hashconfig->opti_type, folder_config->profile_dir, device_name_chksum, cached_file); + + /** + * load kernel + */ + + const bool rc_load_kernel = load_kernel (hashcat_ctx, device_param, "main_kernel", source_file, cached_file, build_options_module_buf, cache_disable, &device_param->opencl_program, &device_param->cuda_module, &device_param->hip_module); + + if (rc_load_kernel == false) + { + event_log_error (hashcat_ctx, "* Device #%u: Kernel %s build failed.", device_param->device_id + 1, source_file); + + return -1; + } + + hcfree (build_options_module_buf); + } + + /** + * word generator kernel + */ + + if (user_options->slow_candidates == true) + { + } + else + { + if (user_options->attack_mode != ATTACK_MODE_STRAIGHT) + { + /** + * kernel mp source filename + */ + + char source_file[256] = { 0 }; + + generate_source_kernel_mp_filename (hashconfig->opti_type, hashconfig->opts_type, folder_config->shared_dir, source_file); + + if (hc_path_read (source_file) == false) + { + event_log_error (hashcat_ctx, "%s: %s", source_file, strerror (errno)); + + return -1; + } + + /** + * kernel mp cached filename + */ + + char cached_file[256] = { 0 }; + + generate_cached_kernel_mp_filename (hashconfig->opti_type, hashconfig->opts_type, folder_config->profile_dir, device_name_chksum_amp_mp, cached_file); + + const bool rc_load_kernel = load_kernel (hashcat_ctx, device_param, "mp_kernel", source_file, cached_file, build_options_buf, cache_disable, &device_param->opencl_program_mp, &device_param->cuda_module_mp, &device_param->hip_module_mp); + + if (rc_load_kernel == false) + { + event_log_error (hashcat_ctx, "* Device #%u: Kernel %s build failed.", device_param->device_id + 1, source_file); + + return -1; + } + } + } + + /** + * amplifier kernel + */ + + if (user_options->slow_candidates == true) + { + } + else + { + if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL) + { + + } + else + { + /** + * kernel amp source filename + */ + + char source_file[256] = { 0 }; + + generate_source_kernel_amp_filename (user_options_extra->attack_kern, folder_config->shared_dir, 
source_file); +
+ if (hc_path_read (source_file) == false) + { + event_log_error (hashcat_ctx, "%s: %s", source_file, strerror (errno)); +
+ return -1; + } +
+ /** + * kernel amp cached filename + */ +
+ char cached_file[256] = { 0 }; +
+ generate_cached_kernel_amp_filename (user_options_extra->attack_kern, folder_config->profile_dir, device_name_chksum_amp_mp, cached_file); +
+ const bool rc_load_kernel = load_kernel (hashcat_ctx, device_param, "amp_kernel", source_file, cached_file, build_options_buf, cache_disable, &device_param->opencl_program_amp, &device_param->cuda_module_amp, &device_param->hip_module_amp); +
+ if (rc_load_kernel == false) + { + event_log_error (hashcat_ctx, "* Device #%u: Kernel %s build failed.", device_param->device_id + 1, source_file); +
+ return -1; + } + } + } +
+ // free the shared build options here so the buffer is released on every code path, not only when the amp kernel was built +
+ hcfree (build_options_buf); +
+ hcfree (device_name_chksum); + hcfree (device_name_chksum_amp_mp); +
+ // some algorithms collide too fast, make that impossible +
+ if (user_options->benchmark == true) + { + ((u32 *) hashes->digests_buf)[0] = -1U; + ((u32 *) hashes->digests_buf)[1] = -1U; + ((u32 *) hashes->digests_buf)[2] = -1U; + ((u32 *) hashes->digests_buf)[3] = -1U; + } +
+ /** + * global buffers + */ +
+ const u64 size_total_fixed + = bitmap_ctx->bitmap_size + + bitmap_ctx->bitmap_size + + bitmap_ctx->bitmap_size + + bitmap_ctx->bitmap_size + + bitmap_ctx->bitmap_size + + bitmap_ctx->bitmap_size + + bitmap_ctx->bitmap_size + + bitmap_ctx->bitmap_size + + size_plains + + size_digests + + size_shown + + size_salts + + size_results + + size_extra_buffer + + size_st_digests + + size_st_salts + + size_st_esalts + + size_esalts + + size_markov_css + + size_root_css + + size_rules + + size_rules_c + + size_tm; +
+ if (size_total_fixed > device_param->device_available_mem) + { + event_log_error (hashcat_ctx, "* Device #%u: Not enough allocatable device memory for this hashlist and/or ruleset.", device_id + 1); +
+ return -1; + } +
+ if (device_param->is_cuda == true) + { + if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_bitmap_s1_a, bitmap_ctx->bitmap_size) == -1) return -1; + if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_bitmap_s1_b, bitmap_ctx->bitmap_size) == -1) return -1; + if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_bitmap_s1_c, bitmap_ctx->bitmap_size) == -1) return -1; + if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_bitmap_s1_d, bitmap_ctx->bitmap_size) == -1) return -1; + if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_bitmap_s2_a, bitmap_ctx->bitmap_size) == -1) return -1; + if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_bitmap_s2_b, bitmap_ctx->bitmap_size) == -1) return -1; + if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_bitmap_s2_c, bitmap_ctx->bitmap_size) == -1) return -1; + if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_bitmap_s2_d, bitmap_ctx->bitmap_size) == -1) return -1; + if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_plain_bufs, size_plains) == -1) return -1; + if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_digests_buf, size_digests) == -1) return -1; + if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_digests_shown, size_shown) == -1) return -1; + if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_salt_bufs, size_salts) == -1) return -1; + if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_result, size_results) == -1) return -1; + if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_extra0_buf, size_extra_buffer / 4) == -1) return -1; + if (hc_cuMemAlloc (hashcat_ctx, 
&device_param->cuda_d_extra1_buf, size_extra_buffer / 4) == -1) return -1; + if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_extra2_buf, size_extra_buffer / 4) == -1) return -1; + if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_extra3_buf, size_extra_buffer / 4) == -1) return -1; + if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_st_digests_buf, size_st_digests) == -1) return -1; + if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_st_salts_buf, size_st_salts) == -1) return -1; + + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_bitmap_s1_a, bitmap_ctx->bitmap_s1_a, bitmap_ctx->bitmap_size) == -1) return -1; + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_bitmap_s1_b, bitmap_ctx->bitmap_s1_b, bitmap_ctx->bitmap_size) == -1) return -1; + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_bitmap_s1_c, bitmap_ctx->bitmap_s1_c, bitmap_ctx->bitmap_size) == -1) return -1; + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_bitmap_s1_d, bitmap_ctx->bitmap_s1_d, bitmap_ctx->bitmap_size) == -1) return -1; + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_bitmap_s2_a, bitmap_ctx->bitmap_s2_a, bitmap_ctx->bitmap_size) == -1) return -1; + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_bitmap_s2_b, bitmap_ctx->bitmap_s2_b, bitmap_ctx->bitmap_size) == -1) return -1; + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_bitmap_s2_c, bitmap_ctx->bitmap_s2_c, bitmap_ctx->bitmap_size) == -1) return -1; + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_bitmap_s2_d, bitmap_ctx->bitmap_s2_d, bitmap_ctx->bitmap_size) == -1) return -1; + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_digests_buf, hashes->digests_buf, size_digests) == -1) return -1; + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_salt_bufs, hashes->salts_buf, size_salts) == -1) return -1; + + /** + * special buffers + */ + + if (user_options->slow_candidates == true) + { + if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_rules_c, size_rules_c) == -1) return -1; + } + else + { + if (user_options_extra->attack_kern == ATTACK_KERN_STRAIGHT) + { + if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_rules, size_rules) == -1) return -1; + + if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL) + { + size_t dummy = 0; + + if (hc_cuModuleGetGlobal (hashcat_ctx, &device_param->cuda_d_rules_c, &dummy, device_param->cuda_module, "generic_constant") == -1) return -1; + } + else + { + if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_rules_c, size_rules_c) == -1) return -1; + } + + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_rules, straight_ctx->kernel_rules_buf, size_rules) == -1) return -1; + } + else if (user_options_extra->attack_kern == ATTACK_KERN_COMBI) + { + if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_combs, size_combs) == -1) return -1; + if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_combs_c, size_combs) == -1) return -1; + if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_root_css_buf, size_root_css) == -1) return -1; + if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_markov_css_buf, size_markov_css) == -1) return -1; + } + else if (user_options_extra->attack_kern == ATTACK_KERN_BF) + { + if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_bfs, size_bfs) == -1) return -1; + if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_root_css_buf, size_root_css) == -1) return -1; + if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_markov_css_buf, size_markov_css) == -1) return -1; + + if (hashconfig->attack_exec 
== ATTACK_EXEC_INSIDE_KERNEL) + { + size_t dummy = 0; + + if (hc_cuModuleGetGlobal (hashcat_ctx, &device_param->cuda_d_bfs_c, &dummy, device_param->cuda_module, "generic_constant") == -1) return -1; + + if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_tm_c, size_tm) == -1) return -1; + } + else + { + if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_bfs_c, size_bfs) == -1) return -1; + if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_tm_c, size_tm) == -1) return -1; + } + } + } + + if (size_esalts) + { + if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_esalt_bufs, size_esalts) == -1) return -1; + + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_esalt_bufs, hashes->esalts_buf, size_esalts) == -1) return -1; + } + + if (hashconfig->st_hash != NULL) + { + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_st_digests_buf, hashes->st_digests_buf, size_st_digests) == -1) return -1; + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_st_salts_buf, hashes->st_salts_buf, size_st_salts) == -1) return -1; + + if (size_esalts) + { + if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_st_esalts_buf, size_st_esalts) == -1) return -1; + + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_st_esalts_buf, hashes->st_esalts_buf, size_st_esalts) == -1) return -1; + } + } + } + + /* + * HIP + */ + if (device_param->is_hip == true) + { + if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_bitmap_s1_a, bitmap_ctx->bitmap_size) == -1) return -1; + if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_bitmap_s1_b, bitmap_ctx->bitmap_size) == -1) return -1; + if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_bitmap_s1_c, bitmap_ctx->bitmap_size) == -1) return -1; + if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_bitmap_s1_d, bitmap_ctx->bitmap_size) == -1) return -1; + if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_bitmap_s2_a, bitmap_ctx->bitmap_size) == -1) return -1; + if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_bitmap_s2_b, bitmap_ctx->bitmap_size) == -1) return -1; + if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_bitmap_s2_c, bitmap_ctx->bitmap_size) == -1) return -1; + if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_bitmap_s2_d, bitmap_ctx->bitmap_size) == -1) return -1; + if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_plain_bufs, size_plains) == -1) return -1; + if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_digests_buf, size_digests) == -1) return -1; + if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_digests_shown, size_shown) == -1) return -1; + if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_salt_bufs, size_salts) == -1) return -1; + if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_result, size_results) == -1) return -1; + if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_extra0_buf, size_extra_buffer / 4) == -1) return -1; + if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_extra1_buf, size_extra_buffer / 4) == -1) return -1; + if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_extra2_buf, size_extra_buffer / 4) == -1) return -1; + if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_extra3_buf, size_extra_buffer / 4) == -1) return -1; + if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_st_digests_buf, size_st_digests) == -1) return -1; + if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_st_salts_buf, size_st_salts) == -1) return -1; + + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_bitmap_s1_a, bitmap_ctx->bitmap_s1_a, bitmap_ctx->bitmap_size) == -1) return -1; 
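+ // the remaining bitmap segments plus the digest and salt data are uploaded with the same synchronous host-to-device copies as on the CUDA path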
+ if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_bitmap_s1_b, bitmap_ctx->bitmap_s1_b, bitmap_ctx->bitmap_size) == -1) return -1; + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_bitmap_s1_c, bitmap_ctx->bitmap_s1_c, bitmap_ctx->bitmap_size) == -1) return -1; + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_bitmap_s1_d, bitmap_ctx->bitmap_s1_d, bitmap_ctx->bitmap_size) == -1) return -1; + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_bitmap_s2_a, bitmap_ctx->bitmap_s2_a, bitmap_ctx->bitmap_size) == -1) return -1; + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_bitmap_s2_b, bitmap_ctx->bitmap_s2_b, bitmap_ctx->bitmap_size) == -1) return -1; + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_bitmap_s2_c, bitmap_ctx->bitmap_s2_c, bitmap_ctx->bitmap_size) == -1) return -1; + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_bitmap_s2_d, bitmap_ctx->bitmap_s2_d, bitmap_ctx->bitmap_size) == -1) return -1; + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_digests_buf, hashes->digests_buf, size_digests) == -1) return -1; + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_salt_bufs, hashes->salts_buf, size_salts) == -1) return -1; + + /** + * special buffers + */ + + if (user_options->slow_candidates == true) + { + if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_rules_c, size_rules_c) == -1) return -1; + } + else + { + if (user_options_extra->attack_kern == ATTACK_KERN_STRAIGHT) + { + if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_rules, size_rules) == -1) return -1; + + if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL) + { + size_t dummy = 0; + + if (hc_hipModuleGetGlobal (hashcat_ctx, &device_param->hip_d_rules_c, &dummy, device_param->hip_module, "generic_constant") == -1) return -1; + } + else + { + if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_rules_c, size_rules_c) == -1) return -1; + } + + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_rules, straight_ctx->kernel_rules_buf, size_rules) == -1) return -1; + } + else if (user_options_extra->attack_kern == ATTACK_KERN_COMBI) + { + if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_combs, size_combs) == -1) return -1; + if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_combs_c, size_combs) == -1) return -1; + if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_root_css_buf, size_root_css) == -1) return -1; + if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_markov_css_buf, size_markov_css) == -1) return -1; + } + else if (user_options_extra->attack_kern == ATTACK_KERN_BF) + { + if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_bfs, size_bfs) == -1) return -1; + if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_root_css_buf, size_root_css) == -1) return -1; + if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_markov_css_buf, size_markov_css) == -1) return -1; + + if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL) + { + size_t dummy = 0; + + if (hc_hipModuleGetGlobal (hashcat_ctx, &device_param->hip_d_bfs_c, &dummy, device_param->hip_module, "generic_constant") == -1) return -1; + + if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_tm_c, size_tm) == -1) return -1; + } + else + { + if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_bfs_c, size_bfs) == -1) return -1; + if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_tm_c, size_tm) == -1) return -1; + } + } + } + + if (size_esalts) + { + if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_esalt_bufs, size_esalts) == -1) return -1; + + if (hc_hipMemcpyHtoD 
(hashcat_ctx, device_param->hip_d_esalt_bufs, hashes->esalts_buf, size_esalts) == -1) return -1; + } + + if (hashconfig->st_hash != NULL) + { + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_st_digests_buf, hashes->st_digests_buf, size_st_digests) == -1) return -1; + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_st_salts_buf, hashes->st_salts_buf, size_st_salts) == -1) return -1; + + if (size_esalts) + { + if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_st_esalts_buf, size_st_esalts) == -1) return -1; + + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_st_esalts_buf, hashes->st_esalts_buf, size_st_esalts) == -1) return -1; + } + } + } + + /* + * OCL + */ + if (device_param->is_opencl == true) + { + if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, bitmap_ctx->bitmap_size, NULL, &device_param->opencl_d_bitmap_s1_a) == -1) return -1; + if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, bitmap_ctx->bitmap_size, NULL, &device_param->opencl_d_bitmap_s1_b) == -1) return -1; + if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, bitmap_ctx->bitmap_size, NULL, &device_param->opencl_d_bitmap_s1_c) == -1) return -1; + if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, bitmap_ctx->bitmap_size, NULL, &device_param->opencl_d_bitmap_s1_d) == -1) return -1; + if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, bitmap_ctx->bitmap_size, NULL, &device_param->opencl_d_bitmap_s2_a) == -1) return -1; + if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, bitmap_ctx->bitmap_size, NULL, &device_param->opencl_d_bitmap_s2_b) == -1) return -1; + if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, bitmap_ctx->bitmap_size, NULL, &device_param->opencl_d_bitmap_s2_c) == -1) return -1; + if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, bitmap_ctx->bitmap_size, NULL, &device_param->opencl_d_bitmap_s2_d) == -1) return -1; + if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE, size_plains, NULL, &device_param->opencl_d_plain_bufs) == -1) return -1; + if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_digests, NULL, &device_param->opencl_d_digests_buf) == -1) return -1; + if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE, size_shown, NULL, &device_param->opencl_d_digests_shown) == -1) return -1; + if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_salts, NULL, &device_param->opencl_d_salt_bufs) == -1) return -1; + if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE, size_results, NULL, &device_param->opencl_d_result) == -1) return -1; + if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE, size_extra_buffer / 4, NULL, &device_param->opencl_d_extra0_buf) == -1) return -1; + if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE, size_extra_buffer / 4, NULL, &device_param->opencl_d_extra1_buf) == -1) return -1; + if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE, size_extra_buffer / 4, NULL, &device_param->opencl_d_extra2_buf) == -1) return -1; + if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE, size_extra_buffer / 4, NULL, 
+
+  /*
+   * OCL
+   */
+
+  if (device_param->is_opencl == true)
+  {
+    if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, bitmap_ctx->bitmap_size, NULL, &device_param->opencl_d_bitmap_s1_a) == -1) return -1;
+    if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, bitmap_ctx->bitmap_size, NULL, &device_param->opencl_d_bitmap_s1_b) == -1) return -1;
+    if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, bitmap_ctx->bitmap_size, NULL, &device_param->opencl_d_bitmap_s1_c) == -1) return -1;
+    if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, bitmap_ctx->bitmap_size, NULL, &device_param->opencl_d_bitmap_s1_d) == -1) return -1;
+    if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, bitmap_ctx->bitmap_size, NULL, &device_param->opencl_d_bitmap_s2_a) == -1) return -1;
+    if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, bitmap_ctx->bitmap_size, NULL, &device_param->opencl_d_bitmap_s2_b) == -1) return -1;
+    if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, bitmap_ctx->bitmap_size, NULL, &device_param->opencl_d_bitmap_s2_c) == -1) return -1;
+    if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, bitmap_ctx->bitmap_size, NULL, &device_param->opencl_d_bitmap_s2_d) == -1) return -1;
+    if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE, size_plains, NULL, &device_param->opencl_d_plain_bufs) == -1) return -1;
+    if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_digests, NULL, &device_param->opencl_d_digests_buf) == -1) return -1;
+    if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE, size_shown, NULL, &device_param->opencl_d_digests_shown) == -1) return -1;
+    if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_salts, NULL, &device_param->opencl_d_salt_bufs) == -1) return -1;
+    if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE, size_results, NULL, &device_param->opencl_d_result) == -1) return -1;
+    if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE, size_extra_buffer / 4, NULL, &device_param->opencl_d_extra0_buf) == -1) return -1;
+    if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE, size_extra_buffer / 4, NULL, &device_param->opencl_d_extra1_buf) == -1) return -1;
+    if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE, size_extra_buffer / 4, NULL, &device_param->opencl_d_extra2_buf) == -1) return -1;
+    if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE, size_extra_buffer / 4, NULL, &device_param->opencl_d_extra3_buf) == -1) return -1;
+    if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_st_digests, NULL, &device_param->opencl_d_st_digests_buf) == -1) return -1;
+    if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_st_salts, NULL, &device_param->opencl_d_st_salts_buf) == -1) return -1;
+
+    if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_bitmap_s1_a, CL_TRUE, 0, bitmap_ctx->bitmap_size, bitmap_ctx->bitmap_s1_a, 0, NULL, NULL) == -1) return -1;
+    if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_bitmap_s1_b, CL_TRUE, 0, bitmap_ctx->bitmap_size, bitmap_ctx->bitmap_s1_b, 0, NULL, NULL) == -1) return -1;
+    if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_bitmap_s1_c, CL_TRUE, 0, bitmap_ctx->bitmap_size, bitmap_ctx->bitmap_s1_c, 0, NULL, NULL) == -1) return -1;
+    if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_bitmap_s1_d, CL_TRUE, 0, bitmap_ctx->bitmap_size, bitmap_ctx->bitmap_s1_d, 0, NULL, NULL) == -1) return -1;
+    if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_bitmap_s2_a, CL_TRUE, 0, bitmap_ctx->bitmap_size, bitmap_ctx->bitmap_s2_a, 0, NULL, NULL) == -1) return -1;
+    if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_bitmap_s2_b, CL_TRUE, 0, bitmap_ctx->bitmap_size, bitmap_ctx->bitmap_s2_b, 0, NULL, NULL) == -1) return -1;
+    if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_bitmap_s2_c, CL_TRUE, 0, bitmap_ctx->bitmap_size, bitmap_ctx->bitmap_s2_c, 0, NULL, NULL) == -1) return -1;
+    if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_bitmap_s2_d, CL_TRUE, 0, bitmap_ctx->bitmap_size, bitmap_ctx->bitmap_s2_d, 0, NULL, NULL) == -1) return -1;
+    if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_digests_buf, CL_TRUE, 0, size_digests, hashes->digests_buf, 0, NULL, NULL) == -1) return -1;
+    if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_salt_bufs, CL_TRUE, 0, size_salts, hashes->salts_buf, 0, NULL, NULL) == -1) return -1;
+
+    /**
+     * special buffers
+     */
+
+    if (user_options->slow_candidates == true)
+    {
+      if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_rules_c, NULL, &device_param->opencl_d_rules_c) == -1) return -1;
+    }
+    else
+    {
+      if (user_options_extra->attack_kern == ATTACK_KERN_STRAIGHT)
+      {
+        if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_rules, NULL, &device_param->opencl_d_rules) == -1) return -1;
+        if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_rules_c, NULL, &device_param->opencl_d_rules_c) == -1) return -1;
+
+        if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_rules, CL_TRUE, 0, size_rules, straight_ctx->kernel_rules_buf, 0, NULL, NULL) == -1) return -1;
+      }
+      else if (user_options_extra->attack_kern == ATTACK_KERN_COMBI)
+      {
+        if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_combs, NULL, &device_param->opencl_d_combs) == -1) return -1;
+        if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_combs, NULL, &device_param->opencl_d_combs_c) == -1) return -1;
+        if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_root_css, NULL, &device_param->opencl_d_root_css_buf) == -1) return -1;
+        if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_markov_css, NULL, &device_param->opencl_d_markov_css_buf) == -1) return -1;
+      }
+      else if (user_options_extra->attack_kern == ATTACK_KERN_BF)
+      {
+        if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_bfs, NULL, &device_param->opencl_d_bfs) == -1) return -1;
+        if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_bfs, NULL, &device_param->opencl_d_bfs_c) == -1) return -1;
+        if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_tm, NULL, &device_param->opencl_d_tm_c) == -1) return -1;
+        if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_root_css, NULL, &device_param->opencl_d_root_css_buf) == -1) return -1;
+        if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_markov_css, NULL, &device_param->opencl_d_markov_css_buf) == -1) return -1;
+      }
+    }
+
+    if (size_esalts)
+    {
+      if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_esalts, NULL, &device_param->opencl_d_esalt_bufs) == -1) return -1;
+
+      if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_esalt_bufs, CL_TRUE, 0, size_esalts, hashes->esalts_buf, 0, NULL, NULL) == -1) return -1;
+    }
+
+    if (hashconfig->st_hash != NULL)
+    {
+      if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_st_digests_buf, CL_TRUE, 0, size_st_digests, hashes->st_digests_buf, 0, NULL, NULL) == -1) return -1;
+      if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_st_salts_buf, CL_TRUE, 0, size_st_salts, hashes->st_salts_buf, 0, NULL, NULL) == -1) return -1;
+
+      if (size_esalts)
+      {
+        if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_st_esalts, NULL, &device_param->opencl_d_st_esalts_buf) == -1) return -1;
+
+        if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_st_esalts_buf, CL_TRUE, 0, size_st_esalts, hashes->st_esalts_buf, 0, NULL, NULL) == -1) return -1;
+      }
+    }
+  }
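[Editorial note: in the fast-hash path (ATTACK_EXEC_INSIDE_KERNEL) the HIP code above does not allocate hip_d_rules_c / hip_d_bfs_c at all; it binds them to the kernel's __constant__ array "generic_constant" declared in inc_platform.cl. A minimal sketch of that lookup, assuming a module has already been loaded with the HIP module API; the helper name is hypothetical:]

  #include <hip/hip_runtime_api.h>

  // Sketch only: resolve the device address of the module's
  // __constant__ array so later HtoD copies can target it directly.
  static int bind_generic_constant (hipModule_t module, hipDeviceptr_t *dptr)
  {
    size_t bytes = 0; // symbol size; the caller above ignores it ("dummy")

    if (hipModuleGetGlobal (dptr, &bytes, module, "generic_constant") != hipSuccess) return -1;

    return 0;
  }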
+
+  /**
+   * kernel args
+   */
+
+  device_param->kernel_params_buf32[24] = bitmap_ctx->bitmap_mask;
+  device_param->kernel_params_buf32[25] = bitmap_ctx->bitmap_shift1;
+  device_param->kernel_params_buf32[26] = bitmap_ctx->bitmap_shift2;
+  device_param->kernel_params_buf32[27] = 0; // salt_pos
+  device_param->kernel_params_buf32[28] = 0; // loop_pos
+  device_param->kernel_params_buf32[29] = 0; // loop_cnt
+  device_param->kernel_params_buf32[30] = 0; // kernel_rules_cnt
+  device_param->kernel_params_buf32[31] = 0; // digests_cnt
+  device_param->kernel_params_buf32[32] = 0; // digests_offset
+  device_param->kernel_params_buf32[33] = 0; // combs_mode
+  device_param->kernel_params_buf64[34] = 0; // gid_max
+
+  if (device_param->is_cuda == true)
+  {
+    device_param->kernel_params[ 0] = NULL; // &device_param->cuda_d_pws_buf;
+    device_param->kernel_params[ 1] = &device_param->cuda_d_rules_c;
+    device_param->kernel_params[ 2] = &device_param->cuda_d_combs_c;
+    device_param->kernel_params[ 3] = &device_param->cuda_d_bfs_c;
+    device_param->kernel_params[ 4] = NULL; // &device_param->cuda_d_tmps;
+    device_param->kernel_params[ 5] = NULL; // &device_param->cuda_d_hooks;
+    device_param->kernel_params[ 6] = &device_param->cuda_d_bitmap_s1_a;
+    device_param->kernel_params[ 7] = &device_param->cuda_d_bitmap_s1_b;
+    device_param->kernel_params[ 8] = &device_param->cuda_d_bitmap_s1_c;
+    device_param->kernel_params[ 9] = &device_param->cuda_d_bitmap_s1_d;
+    device_param->kernel_params[10] = &device_param->cuda_d_bitmap_s2_a;
+    device_param->kernel_params[11] = &device_param->cuda_d_bitmap_s2_b;
+    device_param->kernel_params[12] = &device_param->cuda_d_bitmap_s2_c;
+    device_param->kernel_params[13] = &device_param->cuda_d_bitmap_s2_d;
+    device_param->kernel_params[14] = &device_param->cuda_d_plain_bufs;
+    device_param->kernel_params[15] = &device_param->cuda_d_digests_buf;
+    device_param->kernel_params[16] = &device_param->cuda_d_digests_shown;
+    device_param->kernel_params[17] = &device_param->cuda_d_salt_bufs;
+    device_param->kernel_params[18] = &device_param->cuda_d_esalt_bufs;
+    device_param->kernel_params[19] = &device_param->cuda_d_result;
+    device_param->kernel_params[20] = &device_param->cuda_d_extra0_buf;
+    device_param->kernel_params[21] = &device_param->cuda_d_extra1_buf;
+    device_param->kernel_params[22] = &device_param->cuda_d_extra2_buf;
+    device_param->kernel_params[23] = &device_param->cuda_d_extra3_buf;
+  }
+
+  /*
+   * HIP
+   */
+
+  if (device_param->is_hip == true)
+  {
+    device_param->kernel_params[ 0] = NULL; // &device_param->hip_d_pws_buf;
+    device_param->kernel_params[ 1] = &device_param->hip_d_rules_c;
+    device_param->kernel_params[ 2] = &device_param->hip_d_combs_c;
+    device_param->kernel_params[ 3] = &device_param->hip_d_bfs_c;
+    device_param->kernel_params[ 4] = NULL; // &device_param->hip_d_tmps;
+    device_param->kernel_params[ 5] = NULL; // &device_param->hip_d_hooks;
+    device_param->kernel_params[ 6] = &device_param->hip_d_bitmap_s1_a;
+    device_param->kernel_params[ 7] = &device_param->hip_d_bitmap_s1_b;
+    device_param->kernel_params[ 8] = &device_param->hip_d_bitmap_s1_c;
+    device_param->kernel_params[ 9] = &device_param->hip_d_bitmap_s1_d;
+    device_param->kernel_params[10] = &device_param->hip_d_bitmap_s2_a;
+    device_param->kernel_params[11] = &device_param->hip_d_bitmap_s2_b;
+    device_param->kernel_params[12] = &device_param->hip_d_bitmap_s2_c;
+    device_param->kernel_params[13] = &device_param->hip_d_bitmap_s2_d;
+    device_param->kernel_params[14] = &device_param->hip_d_plain_bufs;
+    device_param->kernel_params[15] = &device_param->hip_d_digests_buf;
+    device_param->kernel_params[16] = &device_param->hip_d_digests_shown;
+    device_param->kernel_params[17] = &device_param->hip_d_salt_bufs;
+    device_param->kernel_params[18] = &device_param->hip_d_esalt_bufs;
+    device_param->kernel_params[19] = &device_param->hip_d_result;
+    device_param->kernel_params[20] = &device_param->hip_d_extra0_buf;
+    device_param->kernel_params[21] = &device_param->hip_d_extra1_buf;
+    device_param->kernel_params[22] = &device_param->hip_d_extra2_buf;
+    device_param->kernel_params[23] = &device_param->hip_d_extra3_buf;
+  }
+
+  /*
+   * OCL
+   */
+
+  if (device_param->is_opencl == true)
+  {
+    device_param->kernel_params[ 0] = NULL; // &device_param->opencl_d_pws_buf;
+    device_param->kernel_params[ 1] = &device_param->opencl_d_rules_c;
+    device_param->kernel_params[ 2] = &device_param->opencl_d_combs_c;
+    device_param->kernel_params[ 3] = &device_param->opencl_d_bfs_c;
+    device_param->kernel_params[ 4] = NULL; // &device_param->opencl_d_tmps;
+    device_param->kernel_params[ 5] = NULL; // &device_param->opencl_d_hooks;
+    device_param->kernel_params[ 6] = &device_param->opencl_d_bitmap_s1_a;
+    device_param->kernel_params[ 7] = &device_param->opencl_d_bitmap_s1_b;
+    device_param->kernel_params[ 8] = &device_param->opencl_d_bitmap_s1_c;
+    device_param->kernel_params[ 9] = &device_param->opencl_d_bitmap_s1_d;
+    device_param->kernel_params[10] = &device_param->opencl_d_bitmap_s2_a;
+    device_param->kernel_params[11] = &device_param->opencl_d_bitmap_s2_b;
+    device_param->kernel_params[12] = &device_param->opencl_d_bitmap_s2_c;
+    device_param->kernel_params[13] = &device_param->opencl_d_bitmap_s2_d;
+    device_param->kernel_params[14] = &device_param->opencl_d_plain_bufs;
+    device_param->kernel_params[15] = &device_param->opencl_d_digests_buf;
+    device_param->kernel_params[16] = &device_param->opencl_d_digests_shown;
+    device_param->kernel_params[17] = &device_param->opencl_d_salt_bufs;
+    device_param->kernel_params[18] = &device_param->opencl_d_esalt_bufs;
+    device_param->kernel_params[19] = &device_param->opencl_d_result;
+    device_param->kernel_params[20] = &device_param->opencl_d_extra0_buf;
+    device_param->kernel_params[21] = &device_param->opencl_d_extra1_buf;
+    device_param->kernel_params[22] = &device_param->opencl_d_extra2_buf;
+    device_param->kernel_params[23] = &device_param->opencl_d_extra3_buf;
+  }
+
+  device_param->kernel_params[24] = &device_param->kernel_params_buf32[24];
+  device_param->kernel_params[25] = &device_param->kernel_params_buf32[25];
+  device_param->kernel_params[26] = &device_param->kernel_params_buf32[26];
+  device_param->kernel_params[27] = &device_param->kernel_params_buf32[27];
+  device_param->kernel_params[28] = &device_param->kernel_params_buf32[28];
+  device_param->kernel_params[29] = &device_param->kernel_params_buf32[29];
+  device_param->kernel_params[30] = &device_param->kernel_params_buf32[30];
+  device_param->kernel_params[31] = &device_param->kernel_params_buf32[31];
+  device_param->kernel_params[32] = &device_param->kernel_params_buf32[32];
+  device_param->kernel_params[33] = &device_param->kernel_params_buf32[33];
+  device_param->kernel_params[34] = &device_param->kernel_params_buf64[34];
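[Editorial note: kernel_params is an array of void*, where element i points at the storage for kernel argument i -- either a device-pointer variable filled in above or a scalar slot in kernel_params_buf32/buf64. Launch APIs read the actual argument values through these pointers, which is why later code only has to update the buf32/buf64 slots. A minimal sketch of how such an array would be consumed on the HIP side; the function name, num_elements and kernel_threads parameters are illustrative, not part of this patch:]

  #include <hip/hip_runtime_api.h>

  // Sketch only: launch a module kernel with hashcat-style params.
  // kernel_params[i] points at the value of argument i; the runtime
  // dereferences each slot according to the kernel's signature.
  static int launch_sketch (hipFunction_t f, void **kernel_params, u32 num_elements, u32 kernel_threads, hipStream_t stream)
  {
    const u32 num_blocks = (num_elements + kernel_threads - 1) / kernel_threads;

    if (hipModuleLaunchKernel (f, num_blocks, 1, 1, kernel_threads, 1, 1, 0, stream, kernel_params, NULL) != hipSuccess) return -1;

    return 0;
  }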
+
+  if (user_options->slow_candidates == true)
+  {
+  }
+  else
+  {
+    device_param->kernel_params_mp_buf64[3] = 0;
+    device_param->kernel_params_mp_buf32[4] = 0;
+    device_param->kernel_params_mp_buf32[5] = 0;
+    device_param->kernel_params_mp_buf32[6] = 0;
+    device_param->kernel_params_mp_buf32[7] = 0;
+    device_param->kernel_params_mp_buf64[8] = 0;
+
+    if (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL)
+    {
+      if (device_param->is_cuda == true)
+      {
+        device_param->kernel_params_mp[0] = &device_param->cuda_d_combs;
+      }
+
+      if (device_param->is_hip == true)
+      {
+        device_param->kernel_params_mp[0] = &device_param->hip_d_combs;
+      }
+
+      if (device_param->is_opencl == true)
+      {
+        device_param->kernel_params_mp[0] = &device_param->opencl_d_combs;
+      }
+    }
+    else
+    {
+      if (user_options->attack_mode == ATTACK_MODE_HYBRID1)
+      {
+        if (device_param->is_cuda == true)
+        {
+          device_param->kernel_params_mp[0] = &device_param->cuda_d_combs;
+        }
+
+        if (device_param->is_hip == true)
+        {
+          device_param->kernel_params_mp[0] = &device_param->hip_d_combs;
+        }
+
+        if (device_param->is_opencl == true)
+        {
+          device_param->kernel_params_mp[0] = &device_param->opencl_d_combs;
+        }
+      }
+      else
+      {
+        device_param->kernel_params_mp[0] = NULL; // (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
+                                                  // ? &device_param->opencl_d_pws_buf
+                                                  // : &device_param->opencl_d_pws_amp_buf;
+      }
+    }
+
+    if (device_param->is_cuda == true)
+    {
+      device_param->kernel_params_mp[1] = &device_param->cuda_d_root_css_buf;
+      device_param->kernel_params_mp[2] = &device_param->cuda_d_markov_css_buf;
+    }
+
+    if (device_param->is_hip == true)
+    {
+      device_param->kernel_params_mp[1] = &device_param->hip_d_root_css_buf;
+      device_param->kernel_params_mp[2] = &device_param->hip_d_markov_css_buf;
+    }
+
+    if (device_param->is_opencl == true)
+    {
+      device_param->kernel_params_mp[1] = &device_param->opencl_d_root_css_buf;
+      device_param->kernel_params_mp[2] = &device_param->opencl_d_markov_css_buf;
+    }
+
+    device_param->kernel_params_mp[3] = &device_param->kernel_params_mp_buf64[3];
+    device_param->kernel_params_mp[4] = &device_param->kernel_params_mp_buf32[4];
+    device_param->kernel_params_mp[5] = &device_param->kernel_params_mp_buf32[5];
+    device_param->kernel_params_mp[6] = &device_param->kernel_params_mp_buf32[6];
+    device_param->kernel_params_mp[7] = &device_param->kernel_params_mp_buf32[7];
+    device_param->kernel_params_mp[8] = &device_param->kernel_params_mp_buf64[8];
+
+    device_param->kernel_params_mp_l_buf64[3] = 0;
+    device_param->kernel_params_mp_l_buf32[4] = 0;
+    device_param->kernel_params_mp_l_buf32[5] = 0;
+    device_param->kernel_params_mp_l_buf32[6] = 0;
+    device_param->kernel_params_mp_l_buf32[7] = 0;
+    device_param->kernel_params_mp_l_buf32[8] = 0;
+    device_param->kernel_params_mp_l_buf64[9] = 0;
+
+    device_param->kernel_params_mp_l[0] = NULL; // (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
+                                                // ? &device_param->opencl_d_pws_buf
+                                                // : &device_param->opencl_d_pws_amp_buf;
+    if (device_param->is_cuda == true)
+    {
+      device_param->kernel_params_mp_l[1] = &device_param->cuda_d_root_css_buf;
+      device_param->kernel_params_mp_l[2] = &device_param->cuda_d_markov_css_buf;
+    }
+
+    if (device_param->is_hip == true)
+    {
+      device_param->kernel_params_mp_l[1] = &device_param->hip_d_root_css_buf;
+      device_param->kernel_params_mp_l[2] = &device_param->hip_d_markov_css_buf;
+    }
+
+    if (device_param->is_opencl == true)
+    {
+      device_param->kernel_params_mp_l[1] = &device_param->opencl_d_root_css_buf;
+      device_param->kernel_params_mp_l[2] = &device_param->opencl_d_markov_css_buf;
+    }
+
+    device_param->kernel_params_mp_l[3] = &device_param->kernel_params_mp_l_buf64[3];
+    device_param->kernel_params_mp_l[4] = &device_param->kernel_params_mp_l_buf32[4];
+    device_param->kernel_params_mp_l[5] = &device_param->kernel_params_mp_l_buf32[5];
+    device_param->kernel_params_mp_l[6] = &device_param->kernel_params_mp_l_buf32[6];
+    device_param->kernel_params_mp_l[7] = &device_param->kernel_params_mp_l_buf32[7];
+    device_param->kernel_params_mp_l[8] = &device_param->kernel_params_mp_l_buf32[8];
+    device_param->kernel_params_mp_l[9] = &device_param->kernel_params_mp_l_buf64[9];
+
+    device_param->kernel_params_mp_r_buf64[3] = 0;
+    device_param->kernel_params_mp_r_buf32[4] = 0;
+    device_param->kernel_params_mp_r_buf32[5] = 0;
+    device_param->kernel_params_mp_r_buf32[6] = 0;
+    device_param->kernel_params_mp_r_buf32[7] = 0;
+    device_param->kernel_params_mp_r_buf64[8] = 0;
+
+    if (device_param->is_cuda == true)
+    {
+      device_param->kernel_params_mp_r[0] = &device_param->cuda_d_bfs;
+      device_param->kernel_params_mp_r[1] = &device_param->cuda_d_root_css_buf;
+      device_param->kernel_params_mp_r[2] = &device_param->cuda_d_markov_css_buf;
+    }
+
+    if (device_param->is_hip == true)
+    {
+      device_param->kernel_params_mp_r[0] = &device_param->hip_d_bfs;
+      device_param->kernel_params_mp_r[1] = &device_param->hip_d_root_css_buf;
+      device_param->kernel_params_mp_r[2] = &device_param->hip_d_markov_css_buf;
+    }
+
+    if (device_param->is_opencl == true)
+    {
+      device_param->kernel_params_mp_r[0] = &device_param->opencl_d_bfs;
+      device_param->kernel_params_mp_r[1] = &device_param->opencl_d_root_css_buf;
+      device_param->kernel_params_mp_r[2] = &device_param->opencl_d_markov_css_buf;
+    }
+
+    device_param->kernel_params_mp_r[3] = &device_param->kernel_params_mp_r_buf64[3];
+    device_param->kernel_params_mp_r[4] = &device_param->kernel_params_mp_r_buf32[4];
+    device_param->kernel_params_mp_r[5] = &device_param->kernel_params_mp_r_buf32[5];
+    device_param->kernel_params_mp_r[6] = &device_param->kernel_params_mp_r_buf32[6];
+    device_param->kernel_params_mp_r[7] = &device_param->kernel_params_mp_r_buf32[7];
+    device_param->kernel_params_mp_r[8] = &device_param->kernel_params_mp_r_buf64[8];
+
+    device_param->kernel_params_amp_buf32[5] = 0; // combs_mode
+    device_param->kernel_params_amp_buf64[6] = 0; // gid_max
+
+    if (device_param->is_cuda == true)
+    {
+      device_param->kernel_params_amp[0] = NULL; // &device_param->cuda_d_pws_buf;
+      device_param->kernel_params_amp[1] = NULL; // &device_param->cuda_d_pws_amp_buf;
+      device_param->kernel_params_amp[2] = &device_param->cuda_d_rules_c;
+      device_param->kernel_params_amp[3] = &device_param->cuda_d_combs_c;
+      device_param->kernel_params_amp[4] = &device_param->cuda_d_bfs_c;
+    }
+
+    if (device_param->is_hip == true)
+    {
+      device_param->kernel_params_amp[0] = NULL; // &device_param->hip_d_pws_buf;
+      device_param->kernel_params_amp[1] = NULL; // &device_param->hip_d_pws_amp_buf;
+      device_param->kernel_params_amp[2] = &device_param->hip_d_rules_c;
+      device_param->kernel_params_amp[3] = &device_param->hip_d_combs_c;
+      device_param->kernel_params_amp[4] = &device_param->hip_d_bfs_c;
+    }
+
+    if (device_param->is_opencl == true)
+    {
+      device_param->kernel_params_amp[0] = NULL; // &device_param->opencl_d_pws_buf;
+      device_param->kernel_params_amp[1] = NULL; // &device_param->opencl_d_pws_amp_buf;
+      device_param->kernel_params_amp[2] = &device_param->opencl_d_rules_c;
+      device_param->kernel_params_amp[3] = &device_param->opencl_d_combs_c;
+      device_param->kernel_params_amp[4] = &device_param->opencl_d_bfs_c;
+    }
+
+    device_param->kernel_params_amp[5] = &device_param->kernel_params_amp_buf32[5];
+    device_param->kernel_params_amp[6] = &device_param->kernel_params_amp_buf64[6];
+
+    if (device_param->is_cuda == true)
+    {
+      device_param->kernel_params_tm[0] = &device_param->cuda_d_bfs_c;
+      device_param->kernel_params_tm[1] = &device_param->cuda_d_tm_c;
+    }
+
+    if (device_param->is_hip == true)
+    {
+      device_param->kernel_params_tm[0] = &device_param->hip_d_bfs_c;
+      device_param->kernel_params_tm[1] = &device_param->hip_d_tm_c;
+    }
+
+    if (device_param->is_opencl == true)
+    {
+      device_param->kernel_params_tm[0] = &device_param->opencl_d_bfs_c;
+      device_param->kernel_params_tm[1] = &device_param->opencl_d_tm_c;
+    }
+  }
+
+  device_param->kernel_params_memset_buf32[1] = 0; // value
+  device_param->kernel_params_memset_buf64[2] = 0; // gid_max
+
+  device_param->kernel_params_memset[0] = NULL;
+  device_param->kernel_params_memset[1] = &device_param->kernel_params_memset_buf32[1];
+  device_param->kernel_params_memset[2] = &device_param->kernel_params_memset_buf64[2];
+
+  device_param->kernel_params_atinit_buf64[1] = 0; // gid_max
+
+  device_param->kernel_params_atinit[0] = NULL;
+  device_param->kernel_params_atinit[1] = &device_param->kernel_params_atinit_buf64[1];
+
+  device_param->kernel_params_decompress_buf64[3] = 0; // gid_max
+
+  if (device_param->is_cuda == true)
+  {
+    device_param->kernel_params_decompress[0] = NULL; // &device_param->cuda_d_pws_idx;
+    device_param->kernel_params_decompress[1] = NULL; // &device_param->cuda_d_pws_comp_buf;
+    device_param->kernel_params_decompress[2] = NULL; // (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
+                                                      // ? &device_param->cuda_d_pws_buf
+                                                      // : &device_param->cuda_d_pws_amp_buf;
+  }
+
+  if (device_param->is_hip == true)
+  {
+    device_param->kernel_params_decompress[0] = NULL; // &device_param->hip_d_pws_idx;
+    device_param->kernel_params_decompress[1] = NULL; // &device_param->hip_d_pws_comp_buf;
+    device_param->kernel_params_decompress[2] = NULL; // (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
+                                                      // ? &device_param->hip_d_pws_buf
+                                                      // : &device_param->hip_d_pws_amp_buf;
+  }
+
+  if (device_param->is_opencl == true)
+  {
+    device_param->kernel_params_decompress[0] = NULL; // &device_param->opencl_d_pws_idx;
+    device_param->kernel_params_decompress[1] = NULL; // &device_param->opencl_d_pws_comp_buf;
+    device_param->kernel_params_decompress[2] = NULL; // (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
+                                                      // ? &device_param->opencl_d_pws_buf
+                                                      // : &device_param->opencl_d_pws_amp_buf;
+  }
+
+  device_param->kernel_params_decompress[3] = &device_param->kernel_params_decompress_buf64[3];
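[Editorial note: the kernel-name lookups that follow derive every entry point from the hash mode and the pass width. For example, kern_type 1700 (SHA-512, see the m01700_* kernels in this patch) with the optimized single-hash path produces "m01700_s04", "m01700_s08" and "m01700_s16", while slow hashes use the _init/_loop/_comp suffixes. A two-line sketch of the scheme:]

  // Sketch only: fast, optimized, single-hash kernel for pass width 4;
  // kern_type 1700 yields the lookup name "m01700_s04".
  char kernel_name[64] = { 0 };

  snprintf (kernel_name, sizeof (kernel_name), "m%05u_s%02d", 1700, 4);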
+
+  /**
+   * kernel name
+   */
+
+  if (device_param->is_cuda == true)
+  {
+    char kernel_name[64] = { 0 };
+
+    if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
+    {
+      if (hashconfig->opti_type & OPTI_TYPE_SINGLE_HASH)
+      {
+        if (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL)
+        {
+          // kernel1
+
+          snprintf (kernel_name, sizeof (kernel_name), "m%05u_s%02d", kern_type, 4);
+
+          if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function1, device_param->cuda_module, kernel_name) == -1) return -1;
+
+          if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function1, &device_param->kernel_wgs1) == -1) return -1;
+
+          if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function1, &device_param->kernel_local_mem_size1) == -1) return -1;
+
+          if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function1, &device_param->kernel_dynamic_local_mem_size1) == -1) return -1;
+
+          device_param->kernel_preferred_wgs_multiple1 = device_param->cuda_warp_size;
+
+          // kernel2
+
+          snprintf (kernel_name, sizeof (kernel_name), "m%05u_s%02d", kern_type, 8);
+
+          if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function2, device_param->cuda_module, kernel_name) == -1) return -1;
+
+          if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function2, &device_param->kernel_wgs2) == -1) return -1;
+
+          if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function2, &device_param->kernel_local_mem_size2) == -1) return -1;
+
+          if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function2, &device_param->kernel_dynamic_local_mem_size2) == -1) return -1;
+
+          device_param->kernel_preferred_wgs_multiple2 = device_param->cuda_warp_size;
+
+          // kernel3
+
+          snprintf (kernel_name, sizeof (kernel_name), "m%05u_s%02d", kern_type, 16);
+
+          if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function3, device_param->cuda_module, kernel_name) == -1) return -1;
+
+          if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function3, &device_param->kernel_wgs3) == -1) return -1;
+
+          if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function3, &device_param->kernel_local_mem_size3) == -1) return -1;
+
+          if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function3, &device_param->kernel_dynamic_local_mem_size3) == -1) return -1;
+
+          device_param->kernel_preferred_wgs_multiple3 = device_param->cuda_warp_size;
+        }
+        else
         {
-          if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_rules_c, size_rules_c) == -1) return -1;
+          snprintf (kernel_name, sizeof (kernel_name), "m%05u_sxx", kern_type);
+
+          if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function4, device_param->cuda_module, kernel_name) == -1) return -1;
+
+          if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function4, &device_param->kernel_wgs4) == -1) return -1;
+
+          if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function4, &device_param->kernel_local_mem_size4) == -1) return -1;
+
+          if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function4, &device_param->kernel_dynamic_local_mem_size4) == -1) return -1;
+
+          device_param->kernel_preferred_wgs_multiple4 = device_param->cuda_warp_size;
         }
+      }
+      else
+      {
+        if (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL)
+        {
+          // kernel1
-          if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_rules, straight_ctx->kernel_rules_buf, size_rules) == -1) return -1;
+          snprintf (kernel_name, sizeof (kernel_name), "m%05u_m%02d", kern_type, 4);
+
+          if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function1, device_param->cuda_module, kernel_name) == -1) return -1;
+
+          if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function1, &device_param->kernel_wgs1) == -1) return -1;
+
+          if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function1, &device_param->kernel_local_mem_size1) == -1) return -1;
+
+          if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function1, &device_param->kernel_dynamic_local_mem_size1) == -1) return -1;
+
+          device_param->kernel_preferred_wgs_multiple1 = device_param->cuda_warp_size;
+
+          // kernel2
+
+          snprintf (kernel_name, sizeof (kernel_name), "m%05u_m%02d", kern_type, 8);
+
+          if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function2, device_param->cuda_module, kernel_name) == -1) return -1;
+
+          if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function2, &device_param->kernel_wgs2) == -1) return -1;
+
+          if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function2, &device_param->kernel_local_mem_size2) == -1) return -1;
+
+          if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function2, &device_param->kernel_dynamic_local_mem_size2) == -1) return -1;
+
+          device_param->kernel_preferred_wgs_multiple2 = device_param->cuda_warp_size;
+
+          // kernel3
+
+          snprintf (kernel_name, sizeof (kernel_name), "m%05u_m%02d", kern_type, 16);
+
+          if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function3, device_param->cuda_module, kernel_name) == -1) return -1;
+
+          if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function3, &device_param->kernel_wgs3) == -1) return -1;
+
+          if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function3, &device_param->kernel_local_mem_size3) == -1) return -1;
+
+          if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function3, &device_param->kernel_dynamic_local_mem_size3) == -1) return -1;
+
+          device_param->kernel_preferred_wgs_multiple3 = device_param->cuda_warp_size;
+        }
+        else
+        {
+          snprintf (kernel_name, sizeof (kernel_name), "m%05u_mxx", kern_type);
+
+          if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function4, device_param->cuda_module, kernel_name) == -1) return -1;
+
+          if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function4, &device_param->kernel_wgs4) == -1) return -1;
+
+          if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function4, &device_param->kernel_local_mem_size4) == -1) return -1;
+
+          if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function4, &device_param->kernel_dynamic_local_mem_size4) == -1) return -1;
+
+          device_param->kernel_preferred_wgs_multiple4 = device_param->cuda_warp_size;
+        }
       }
-      else if (user_options_extra->attack_kern == ATTACK_KERN_COMBI)
+
+      if (user_options->slow_candidates == true)
       {
-        if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_combs, size_combs) == -1) return -1;
-        if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_combs_c, size_combs) == -1) return -1;
-        if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_root_css_buf, size_root_css) == -1) return -1;
-        if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_markov_css_buf, size_markov_css) == -1) return -1;
       }
-      else if (user_options_extra->attack_kern == ATTACK_KERN_BF)
+      else
       {
-        if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_bfs, size_bfs) == -1) return -1;
-        if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_root_css_buf, size_root_css) == -1) return -1;
-        if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_markov_css_buf, size_markov_css) == -1) return -1;
+        if (user_options->attack_mode == ATTACK_MODE_BF)
+        {
+          if (hashconfig->opts_type & OPTS_TYPE_TM_KERNEL)
+          {
+            snprintf (kernel_name, sizeof (kernel_name), "m%05u_tm", kern_type);
+
+            if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function_tm, device_param->cuda_module, kernel_name) == -1) return -1;
+
+            if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function_tm, &device_param->kernel_wgs_tm) == -1) return -1;
+
+            if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function_tm, &device_param->kernel_local_mem_size_tm) == -1) return -1;
+
+            if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function_tm, &device_param->kernel_dynamic_local_mem_size_tm) == -1) return -1;
+
+            device_param->kernel_preferred_wgs_multiple_tm = device_param->cuda_warp_size;
+          }
+        }
+      }
+    }
+    else
+    {
+      // kernel1
+
+      snprintf (kernel_name, sizeof (kernel_name), "m%05u_init", kern_type);
+
+      if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function1, device_param->cuda_module, kernel_name) == -1) return -1;
+
+      if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function1, &device_param->kernel_wgs1) == -1) return -1;
+
+      if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function1, &device_param->kernel_local_mem_size1) == -1) return -1;
+
+      if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function1, &device_param->kernel_dynamic_local_mem_size1) == -1) return -1;
+
+      device_param->kernel_preferred_wgs_multiple1 = device_param->cuda_warp_size;
+
+      // kernel2
+
+      snprintf (kernel_name, sizeof (kernel_name), "m%05u_loop", kern_type);
+
+      if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function2, device_param->cuda_module, kernel_name) == -1) return -1;
+
+      if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function2, &device_param->kernel_wgs2) == -1) return -1;
+
+      if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function2, &device_param->kernel_local_mem_size2) == -1) return -1;
+
+      if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function2, &device_param->kernel_dynamic_local_mem_size2) == -1) return -1;
+
+      device_param->kernel_preferred_wgs_multiple2 = device_param->cuda_warp_size;
+
+      // kernel3
+
+      snprintf (kernel_name, sizeof (kernel_name), "m%05u_comp", kern_type);
+
+      if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function3, device_param->cuda_module, kernel_name) == -1) return -1;
+
+      if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function3, &device_param->kernel_wgs3) == -1) return -1;
+
+      if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function3, &device_param->kernel_local_mem_size3) == -1) return -1;
+
+      if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function3, &device_param->kernel_dynamic_local_mem_size3) == -1) return -1;
+
+      device_param->kernel_preferred_wgs_multiple3 = device_param->cuda_warp_size;
+
+      if (hashconfig->opts_type & OPTS_TYPE_LOOP_EXTENDED)
+      {
+        // kernel2e
+
+        snprintf (kernel_name, sizeof (kernel_name), "m%05u_loop_extended", kern_type);
+
+        if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function2e, device_param->cuda_module, kernel_name) == -1) return -1;
+
+        if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function2e, &device_param->kernel_wgs2e) == -1) return -1;
+
+        if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function2e, &device_param->kernel_local_mem_size2e) == -1) return -1;
+
+        if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function2e, &device_param->kernel_dynamic_local_mem_size2e) == -1) return -1;
+
+        device_param->kernel_preferred_wgs_multiple2e = device_param->cuda_warp_size;
+      }
+
+      // kernel12
+
+      if (hashconfig->opts_type & OPTS_TYPE_HOOK12)
+      {
+        snprintf (kernel_name, sizeof (kernel_name), "m%05u_hook12", kern_type);
+
+        if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function12, device_param->cuda_module, kernel_name) == -1) return -1;
+
+        if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function12, &device_param->kernel_wgs12) == -1) return -1;
+
+        if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function12, &device_param->kernel_local_mem_size12) == -1) return -1;
+
+        if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function12, &device_param->kernel_dynamic_local_mem_size12) == -1) return -1;
+
+        device_param->kernel_preferred_wgs_multiple12 = device_param->cuda_warp_size;
+      }
+
+      // kernel23
+
+      if (hashconfig->opts_type & OPTS_TYPE_HOOK23)
+      {
+        snprintf (kernel_name, sizeof (kernel_name), "m%05u_hook23", kern_type);
+
+        if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function23, device_param->cuda_module, kernel_name) == -1) return -1;
+
+        if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function23, &device_param->kernel_wgs23) == -1) return -1;
+
+        if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function23, &device_param->kernel_local_mem_size23) == -1) return -1;
+
+        if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function23, &device_param->kernel_dynamic_local_mem_size23) == -1) return -1;
+
+        device_param->kernel_preferred_wgs_multiple23 = device_param->cuda_warp_size;
+      }
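[Editorial note: the get_cuda_kernel_wgs / _local_mem_size / _dynamic_local_mem_size helpers used throughout this section query per-function launch attributes. The HIP port presumably adds get_hip_kernel_* counterparts (they are not visible in this hunk); a minimal sketch, assuming they are implemented with hipFuncGetAttribute and the _sketch name being hypothetical:]

  #include <hip/hip_runtime_api.h>

  // Sketch only: derive a workgroup-size limit for a module function,
  // mirroring what the get_cuda_kernel_wgs helper does on the CUDA side.
  static int get_hip_kernel_wgs_sketch (hipFunction_t f, u32 *result)
  {
    int max_threads_per_block = 0;

    if (hipFuncGetAttribute (&max_threads_per_block, HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, f) != hipSuccess) return -1;

    *result = (u32) max_threads_per_block;

    return 0;
  }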
+      // init2
+
+      if (hashconfig->opts_type & OPTS_TYPE_INIT2)
+      {
+        snprintf (kernel_name, sizeof (kernel_name), "m%05u_init2", kern_type);
+
+        if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function_init2, device_param->cuda_module, kernel_name) == -1) return -1;
+
+        if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function_init2, &device_param->kernel_wgs_init2) == -1) return -1;
+
+        if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function_init2, &device_param->kernel_local_mem_size_init2) == -1) return -1;
+
+        if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function_init2, &device_param->kernel_dynamic_local_mem_size_init2) == -1) return -1;
+
+        device_param->kernel_preferred_wgs_multiple_init2 = device_param->cuda_warp_size;
+      }
+
+      // loop2
+
+      if (hashconfig->opts_type & OPTS_TYPE_LOOP2)
+      {
+        snprintf (kernel_name, sizeof (kernel_name), "m%05u_loop2", kern_type);
-        if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
-        {
-          size_t dummy = 0;
+        if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function_loop2, device_param->cuda_module, kernel_name) == -1) return -1;
-          if (hc_cuModuleGetGlobal (hashcat_ctx, &device_param->cuda_d_bfs_c, &dummy, device_param->cuda_module, "generic_constant") == -1) return -1;
+        if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function_loop2, &device_param->kernel_wgs_loop2) == -1) return -1;
-          if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_tm_c, size_tm) == -1) return -1;
-        }
-        else
-        {
-          if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_bfs_c, size_bfs) == -1) return -1;
-          if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_tm_c, size_tm) == -1) return -1;
-        }
-      }
-    }
+        if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function_loop2, &device_param->kernel_local_mem_size_loop2) == -1) return -1;
-    if (size_esalts)
-    {
-      if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_esalt_bufs, size_esalts) == -1) return -1;
+        if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function_loop2, &device_param->kernel_dynamic_local_mem_size_loop2) == -1) return -1;
-      if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_esalt_bufs, hashes->esalts_buf, size_esalts) == -1) return -1;
-    }
+        device_param->kernel_preferred_wgs_multiple_loop2 = device_param->cuda_warp_size;
+      }
-    if (hashconfig->st_hash != NULL)
-    {
-      if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_st_digests_buf, hashes->st_digests_buf, size_st_digests) == -1) return -1;
-      if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_st_salts_buf, hashes->st_salts_buf, size_st_salts) == -1) return -1;
+      // aux1
-      if (size_esalts)
+      if (hashconfig->opts_type & OPTS_TYPE_AUX1)
       {
-        if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_st_esalts_buf, size_st_esalts) == -1) return -1;
-
-        if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_st_esalts_buf, hashes->st_esalts_buf, size_st_esalts) == -1) return -1;
-      }
-    }
-  }
+        snprintf (kernel_name, sizeof (kernel_name), "m%05u_aux1", kern_type);
-  if (device_param->is_opencl == true)
-  {
-    if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, bitmap_ctx->bitmap_size, NULL, &device_param->opencl_d_bitmap_s1_a) == -1) return -1;
-    if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, bitmap_ctx->bitmap_size, NULL, &device_param->opencl_d_bitmap_s1_b) == -1) return -1;
-    if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, bitmap_ctx->bitmap_size, NULL, &device_param->opencl_d_bitmap_s1_c) == -1) return -1;
-    if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, bitmap_ctx->bitmap_size, NULL, &device_param->opencl_d_bitmap_s1_d) == -1) return -1;
-    if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, bitmap_ctx->bitmap_size, NULL, &device_param->opencl_d_bitmap_s2_a) == -1) return -1;
-    if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, bitmap_ctx->bitmap_size, NULL, &device_param->opencl_d_bitmap_s2_b) == -1) return -1;
-    if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, bitmap_ctx->bitmap_size, NULL, &device_param->opencl_d_bitmap_s2_c) == -1) return -1;
-    if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, bitmap_ctx->bitmap_size, NULL, &device_param->opencl_d_bitmap_s2_d) == -1) return -1;
-    if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE, size_plains, NULL, &device_param->opencl_d_plain_bufs) == -1) return -1;
-    if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_digests, NULL, &device_param->opencl_d_digests_buf) == -1) return -1;
-    if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE, size_shown, NULL, &device_param->opencl_d_digests_shown) == -1) return -1;
-    if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_salts, NULL, &device_param->opencl_d_salt_bufs) == -1) return -1;
-    if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE, size_results, NULL, &device_param->opencl_d_result) == -1) return -1;
-    if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE, size_extra_buffer / 4, NULL, &device_param->opencl_d_extra0_buf) == -1) return -1;
-    if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE, size_extra_buffer / 4, NULL, &device_param->opencl_d_extra1_buf) == -1) return -1;
-    if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE, size_extra_buffer / 4, NULL, &device_param->opencl_d_extra2_buf) == -1) return -1;
-    if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE, size_extra_buffer / 4, NULL, &device_param->opencl_d_extra3_buf) == -1) return -1;
-    if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_st_digests, NULL, &device_param->opencl_d_st_digests_buf) == -1) return -1;
-    if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_st_salts, NULL, &device_param->opencl_d_st_salts_buf) == -1) return -1;
+        if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function_aux1, device_param->cuda_module, kernel_name) == -1) return -1;
-    if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_bitmap_s1_a, CL_TRUE, 0, bitmap_ctx->bitmap_size, bitmap_ctx->bitmap_s1_a, 0, NULL, NULL) == -1) return -1;
-    if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_bitmap_s1_b, CL_TRUE, 0, bitmap_ctx->bitmap_size, bitmap_ctx->bitmap_s1_b, 0, NULL, NULL) == -1) return -1;
-    if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_bitmap_s1_c, CL_TRUE, 0, bitmap_ctx->bitmap_size, bitmap_ctx->bitmap_s1_c, 0, NULL, NULL) == -1) return -1;
-    if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_bitmap_s1_d, CL_TRUE, 0, bitmap_ctx->bitmap_size, bitmap_ctx->bitmap_s1_d, 0, NULL, NULL) == -1) return -1;
-    if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_bitmap_s2_a, CL_TRUE, 0, bitmap_ctx->bitmap_size, bitmap_ctx->bitmap_s2_a, 0, NULL, NULL) == -1) return -1;
-    if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_bitmap_s2_b, CL_TRUE, 0, bitmap_ctx->bitmap_size, bitmap_ctx->bitmap_s2_b, 0, NULL, NULL) == -1) return -1;
-    if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_bitmap_s2_c, CL_TRUE, 0, bitmap_ctx->bitmap_size, bitmap_ctx->bitmap_s2_c, 0, NULL, NULL) == -1) return -1;
-    if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_bitmap_s2_d, CL_TRUE, 0, bitmap_ctx->bitmap_size, bitmap_ctx->bitmap_s2_d, 0, NULL, NULL) == -1) return -1;
-    if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_digests_buf, CL_TRUE, 0, size_digests, hashes->digests_buf, 0, NULL, NULL) == -1) return -1;
-    if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_salt_bufs, CL_TRUE, 0, size_salts, hashes->salts_buf, 0, NULL, NULL) == -1) return -1;
+        if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function_aux1, &device_param->kernel_wgs_aux1) == -1) return -1;
-    /**
-     * special buffers
-     */
+        if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function_aux1, &device_param->kernel_local_mem_size_aux1) == -1) return -1;
-    if (user_options->slow_candidates == true)
-    {
-      if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_rules_c, NULL, &device_param->opencl_d_rules_c) == -1) return -1;
-    }
-    else
-    {
-      if (user_options_extra->attack_kern == ATTACK_KERN_STRAIGHT)
-      {
-        if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_rules, NULL, &device_param->opencl_d_rules) == -1) return -1;
-        if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_rules_c, NULL, &device_param->opencl_d_rules_c) == -1) return -1;
+        if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function_aux1, &device_param->kernel_dynamic_local_mem_size_aux1) == -1) return -1;
-        if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_rules, CL_TRUE, 0, size_rules, straight_ctx->kernel_rules_buf, 0, NULL, NULL) == -1) return -1;
-      }
-      else if (user_options_extra->attack_kern == ATTACK_KERN_COMBI)
-      {
-        if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_combs, NULL, &device_param->opencl_d_combs) == -1) return -1;
-        if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_combs, NULL, &device_param->opencl_d_combs_c) == -1) return -1;
-        if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_root_css, NULL, &device_param->opencl_d_root_css_buf) == -1) return -1;
-        if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_markov_css, NULL, &device_param->opencl_d_markov_css_buf) == -1) return -1;
+        device_param->kernel_preferred_wgs_multiple_aux1 = device_param->cuda_warp_size;
       }
-      else if (user_options_extra->attack_kern == ATTACK_KERN_BF)
+
+      // aux2
+
+      if (hashconfig->opts_type & OPTS_TYPE_AUX2)
       {
-        if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_bfs, NULL, &device_param->opencl_d_bfs) == -1) return -1;
-        if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_bfs, NULL, &device_param->opencl_d_bfs_c) == -1) return -1;
-        if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_tm, NULL, &device_param->opencl_d_tm_c) == -1) return -1;
-        if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_root_css, NULL, &device_param->opencl_d_root_css_buf) == -1) return -1;
-        if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_markov_css, NULL, &device_param->opencl_d_markov_css_buf) == -1) return -1;
-      }
-    }
+        snprintf (kernel_name, sizeof (kernel_name), "m%05u_aux2", kern_type);
-    if (size_esalts)
-    {
-      if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_esalts, NULL, &device_param->opencl_d_esalt_bufs) == -1) return -1;
+        if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function_aux2, device_param->cuda_module, kernel_name) == -1) return -1;
-      if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_esalt_bufs, CL_TRUE, 0, size_esalts, hashes->esalts_buf, 0, NULL, NULL) == -1) return -1;
-    }
+        if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function_aux2, &device_param->kernel_wgs_aux2) == -1) return -1;
-    if (hashconfig->st_hash != NULL)
-    {
-      if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_st_digests_buf, CL_TRUE, 0, size_st_digests, hashes->st_digests_buf, 0, NULL, NULL) == -1) return -1;
-      if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_st_salts_buf, CL_TRUE, 0, size_st_salts, hashes->st_salts_buf, 0, NULL, NULL) == -1) return -1;
+        if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function_aux2, &device_param->kernel_local_mem_size_aux2) == -1) return -1;
-      if (size_esalts)
-      {
-        if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_st_esalts, NULL, &device_param->opencl_d_st_esalts_buf) == -1) return -1;
+        if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function_aux2, &device_param->kernel_dynamic_local_mem_size_aux2) == -1) return -1;
-        if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_st_esalts_buf, CL_TRUE, 0, size_st_esalts, hashes->st_esalts_buf, 0, NULL, NULL) == -1) return -1;
+        device_param->kernel_preferred_wgs_multiple_aux2 = device_param->cuda_warp_size;
       }
-    }
-  }
-  /**
-   * kernel args
-   */
+      // aux3
-  device_param->kernel_params_buf32[24] = bitmap_ctx->bitmap_mask;
-  device_param->kernel_params_buf32[25] = bitmap_ctx->bitmap_shift1;
-  device_param->kernel_params_buf32[26] = bitmap_ctx->bitmap_shift2;
-  device_param->kernel_params_buf32[27] = 0; // salt_pos
-  device_param->kernel_params_buf32[28] = 0; // loop_pos
-  device_param->kernel_params_buf32[29] = 0; // loop_cnt
-  device_param->kernel_params_buf32[30] = 0; // kernel_rules_cnt
-  device_param->kernel_params_buf32[31] = 0; // digests_cnt
-  device_param->kernel_params_buf32[32] = 0; // digests_offset
-  device_param->kernel_params_buf32[33] = 0; // combs_mode
-  device_param->kernel_params_buf64[34] = 0; // gid_max
+      if (hashconfig->opts_type & OPTS_TYPE_AUX3)
+      {
+        snprintf (kernel_name, sizeof (kernel_name), "m%05u_aux3", kern_type);
(kernel_name, sizeof (kernel_name), "m%05u_aux3", kern_type); - if (device_param->is_cuda == true) - { - device_param->kernel_params[ 0] = NULL; // &device_param->cuda_d_pws_buf; - device_param->kernel_params[ 1] = &device_param->cuda_d_rules_c; - device_param->kernel_params[ 2] = &device_param->cuda_d_combs_c; - device_param->kernel_params[ 3] = &device_param->cuda_d_bfs_c; - device_param->kernel_params[ 4] = NULL; // &device_param->cuda_d_tmps; - device_param->kernel_params[ 5] = NULL; // &device_param->cuda_d_hooks; - device_param->kernel_params[ 6] = &device_param->cuda_d_bitmap_s1_a; - device_param->kernel_params[ 7] = &device_param->cuda_d_bitmap_s1_b; - device_param->kernel_params[ 8] = &device_param->cuda_d_bitmap_s1_c; - device_param->kernel_params[ 9] = &device_param->cuda_d_bitmap_s1_d; - device_param->kernel_params[10] = &device_param->cuda_d_bitmap_s2_a; - device_param->kernel_params[11] = &device_param->cuda_d_bitmap_s2_b; - device_param->kernel_params[12] = &device_param->cuda_d_bitmap_s2_c; - device_param->kernel_params[13] = &device_param->cuda_d_bitmap_s2_d; - device_param->kernel_params[14] = &device_param->cuda_d_plain_bufs; - device_param->kernel_params[15] = &device_param->cuda_d_digests_buf; - device_param->kernel_params[16] = &device_param->cuda_d_digests_shown; - device_param->kernel_params[17] = &device_param->cuda_d_salt_bufs; - device_param->kernel_params[18] = &device_param->cuda_d_esalt_bufs; - device_param->kernel_params[19] = &device_param->cuda_d_result; - device_param->kernel_params[20] = &device_param->cuda_d_extra0_buf; - device_param->kernel_params[21] = &device_param->cuda_d_extra1_buf; - device_param->kernel_params[22] = &device_param->cuda_d_extra2_buf; - device_param->kernel_params[23] = &device_param->cuda_d_extra3_buf; - } + if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function_aux3, device_param->cuda_module, kernel_name) == -1) return -1; - if (device_param->is_opencl == true) - { - device_param->kernel_params[ 0] = NULL; // &device_param->opencl_d_pws_buf; - device_param->kernel_params[ 1] = &device_param->opencl_d_rules_c; - device_param->kernel_params[ 2] = &device_param->opencl_d_combs_c; - device_param->kernel_params[ 3] = &device_param->opencl_d_bfs_c; - device_param->kernel_params[ 4] = NULL; // &device_param->opencl_d_tmps; - device_param->kernel_params[ 5] = NULL; // &device_param->opencl_d_hooks; - device_param->kernel_params[ 6] = &device_param->opencl_d_bitmap_s1_a; - device_param->kernel_params[ 7] = &device_param->opencl_d_bitmap_s1_b; - device_param->kernel_params[ 8] = &device_param->opencl_d_bitmap_s1_c; - device_param->kernel_params[ 9] = &device_param->opencl_d_bitmap_s1_d; - device_param->kernel_params[10] = &device_param->opencl_d_bitmap_s2_a; - device_param->kernel_params[11] = &device_param->opencl_d_bitmap_s2_b; - device_param->kernel_params[12] = &device_param->opencl_d_bitmap_s2_c; - device_param->kernel_params[13] = &device_param->opencl_d_bitmap_s2_d; - device_param->kernel_params[14] = &device_param->opencl_d_plain_bufs; - device_param->kernel_params[15] = &device_param->opencl_d_digests_buf; - device_param->kernel_params[16] = &device_param->opencl_d_digests_shown; - device_param->kernel_params[17] = &device_param->opencl_d_salt_bufs; - device_param->kernel_params[18] = &device_param->opencl_d_esalt_bufs; - device_param->kernel_params[19] = &device_param->opencl_d_result; - device_param->kernel_params[20] = &device_param->opencl_d_extra0_buf; - device_param->kernel_params[21] = 
-    device_param->kernel_params[22] = &device_param->opencl_d_extra2_buf;
-    device_param->kernel_params[23] = &device_param->opencl_d_extra3_buf;
-  }
+        if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function_aux3, &device_param->kernel_wgs_aux3) == -1) return -1;
-  device_param->kernel_params[24] = &device_param->kernel_params_buf32[24];
-  device_param->kernel_params[25] = &device_param->kernel_params_buf32[25];
-  device_param->kernel_params[26] = &device_param->kernel_params_buf32[26];
-  device_param->kernel_params[27] = &device_param->kernel_params_buf32[27];
-  device_param->kernel_params[28] = &device_param->kernel_params_buf32[28];
-  device_param->kernel_params[29] = &device_param->kernel_params_buf32[29];
-  device_param->kernel_params[30] = &device_param->kernel_params_buf32[30];
-  device_param->kernel_params[31] = &device_param->kernel_params_buf32[31];
-  device_param->kernel_params[32] = &device_param->kernel_params_buf32[32];
-  device_param->kernel_params[33] = &device_param->kernel_params_buf32[33];
-  device_param->kernel_params[34] = &device_param->kernel_params_buf64[34];
+        if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function_aux3, &device_param->kernel_local_mem_size_aux3) == -1) return -1;
-  if (user_options->slow_candidates == true)
-  {
-  }
-  else
-  {
-    device_param->kernel_params_mp_buf64[3] = 0;
-    device_param->kernel_params_mp_buf32[4] = 0;
-    device_param->kernel_params_mp_buf32[5] = 0;
-    device_param->kernel_params_mp_buf32[6] = 0;
-    device_param->kernel_params_mp_buf32[7] = 0;
-    device_param->kernel_params_mp_buf64[8] = 0;
+        if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function_aux3, &device_param->kernel_dynamic_local_mem_size_aux3) == -1) return -1;
-    if (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL)
-    {
-      if (device_param->is_cuda == true)
-      {
-        device_param->kernel_params_mp[0] = &device_param->cuda_d_combs;
+        device_param->kernel_preferred_wgs_multiple_aux3 = device_param->cuda_warp_size;
       }
-      if (device_param->is_opencl == true)
+
+      // aux4
+
+      if (hashconfig->opts_type & OPTS_TYPE_AUX4)
       {
-        device_param->kernel_params_mp[0] = &device_param->opencl_d_combs;
+        snprintf (kernel_name, sizeof (kernel_name), "m%05u_aux4", kern_type);
+
+        if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function_aux4, device_param->cuda_module, kernel_name) == -1) return -1;
+
+        if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function_aux4, &device_param->kernel_wgs_aux4) == -1) return -1;
+
+        if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function_aux4, &device_param->kernel_local_mem_size_aux4) == -1) return -1;
+
+        if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function_aux4, &device_param->kernel_dynamic_local_mem_size_aux4) == -1) return -1;
+
+        device_param->kernel_preferred_wgs_multiple_aux4 = device_param->cuda_warp_size;
       }
     }
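[Editorial note: the "MP start" section below resolves the Markov candidate-generator kernels from a separate module (cuda_module_mp) under the fixed names l_markov, r_markov and C_markov. The HIP side of the patch presumably mirrors this against a hip_module_mp; a minimal sketch of that lookup, with the helper name being hypothetical:]

  #include <hip/hip_runtime_api.h>

  // Sketch only: resolve the left/right Markov generator kernels from
  // the dedicated markov module, mirroring the CUDA path below.
  static int get_mp_functions_sketch (hipModule_t module_mp, hipFunction_t *mp_l, hipFunction_t *mp_r)
  {
    if (hipModuleGetFunction (mp_l, module_mp, "l_markov") != hipSuccess) return -1;
    if (hipModuleGetFunction (mp_r, module_mp, "r_markov") != hipSuccess) return -1;

    return 0;
  }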
+
+    //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_decompress, 0, sizeof (cl_mem),   device_param->kernel_params_decompress[0]); if (CL_rc == -1) return -1;
+    //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_decompress, 1, sizeof (cl_mem),   device_param->kernel_params_decompress[1]); if (CL_rc == -1) return -1;
+    //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_decompress, 2, sizeof (cl_mem),   device_param->kernel_params_decompress[2]); if (CL_rc == -1) return -1;
+    //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_decompress, 3, sizeof (cl_ulong), device_param->kernel_params_decompress[3]); if (CL_rc == -1) return -1;
+
+    // MP start
+
+    if (user_options->slow_candidates == true)
+    {
+    }
     else
     {
-      if (user_options->attack_mode == ATTACK_MODE_HYBRID1)
+      if (user_options->attack_mode == ATTACK_MODE_BF)
       {
-        if (device_param->is_cuda == true)
-        {
-          device_param->kernel_params_mp[0] = &device_param->cuda_d_combs;
-        }
+        // mp_l
-        if (device_param->is_opencl == true)
+
+        if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function_mp_l, device_param->cuda_module_mp, "l_markov") == -1) return -1;
+
+        if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function_mp_l, &device_param->kernel_wgs_mp_l) == -1) return -1;
+
+        if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function_mp_l, &device_param->kernel_local_mem_size_mp_l) == -1) return -1;
+
+        if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function_mp_l, &device_param->kernel_dynamic_local_mem_size_mp_l) == -1) return -1;
+
+        device_param->kernel_preferred_wgs_multiple_mp_l = device_param->cuda_warp_size;
+
+        // mp_r
+
+        if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function_mp_r, device_param->cuda_module_mp, "r_markov") == -1) return -1;
+
+        if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function_mp_r, &device_param->kernel_wgs_mp_r) == -1) return -1;
+
+        if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function_mp_r, &device_param->kernel_local_mem_size_mp_r) == -1) return -1;
+
+        if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function_mp_r, &device_param->kernel_dynamic_local_mem_size_mp_r) == -1) return -1;
+
+        device_param->kernel_preferred_wgs_multiple_mp_r = device_param->cuda_warp_size;
+
+        if (user_options->attack_mode == ATTACK_MODE_BF)
         {
-          device_param->kernel_params_mp[0] = &device_param->opencl_d_combs;
+          if (hashconfig->opts_type & OPTS_TYPE_TM_KERNEL)
+          {
+            //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_tm, 0, sizeof (cl_mem), device_param->kernel_params_tm[0]); if (CL_rc == -1) return -1;
+            //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_tm, 1, sizeof (cl_mem), device_param->kernel_params_tm[1]); if (CL_rc == -1) return -1;
+          }
         }
       }
-      else
+      else if (user_options->attack_mode == ATTACK_MODE_HYBRID1)
       {
-        device_param->kernel_params_mp[0] = NULL; // (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
-                                                  // ? &device_param->opencl_d_pws_buf
&device_param->opencl_d_pws_buf - // : &device_param->opencl_d_pws_amp_buf; + if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function_mp, device_param->cuda_module_mp, "C_markov") == -1) return -1; + + if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function_mp, &device_param->kernel_wgs_mp) == -1) return -1; + + if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function_mp, &device_param->kernel_local_mem_size_mp) == -1) return -1; + + if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function_mp, &device_param->kernel_dynamic_local_mem_size_mp) == -1) return -1; + + device_param->kernel_preferred_wgs_multiple_mp = device_param->cuda_warp_size; } - } + else if (user_options->attack_mode == ATTACK_MODE_HYBRID2) + { + if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function_mp, device_param->cuda_module_mp, "C_markov") == -1) return -1; - if (device_param->is_cuda == true) - { - device_param->kernel_params_mp[1] = &device_param->cuda_d_root_css_buf; - device_param->kernel_params_mp[2] = &device_param->cuda_d_markov_css_buf; + if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function_mp, &device_param->kernel_wgs_mp) == -1) return -1; + + if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function_mp, &device_param->kernel_local_mem_size_mp) == -1) return -1; + + if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function_mp, &device_param->kernel_dynamic_local_mem_size_mp) == -1) return -1; + + device_param->kernel_preferred_wgs_multiple_mp = device_param->cuda_warp_size; + } } - if (device_param->is_opencl == true) + if (user_options->slow_candidates == true) { - device_param->kernel_params_mp[1] = &device_param->opencl_d_root_css_buf; - device_param->kernel_params_mp[2] = &device_param->opencl_d_markov_css_buf; } + else + { + if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL) + { + // nothing to do + } + else + { + if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function_amp, device_param->cuda_module_amp, "amp") == -1) return -1; - device_param->kernel_params_mp[3] = &device_param->kernel_params_mp_buf64[3]; - device_param->kernel_params_mp[4] = &device_param->kernel_params_mp_buf32[4]; - device_param->kernel_params_mp[5] = &device_param->kernel_params_mp_buf32[5]; - device_param->kernel_params_mp[6] = &device_param->kernel_params_mp_buf32[6]; - device_param->kernel_params_mp[7] = &device_param->kernel_params_mp_buf32[7]; - device_param->kernel_params_mp[8] = &device_param->kernel_params_mp_buf64[8]; + if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function_amp, &device_param->kernel_wgs_amp) == -1) return -1; - device_param->kernel_params_mp_l_buf64[3] = 0; - device_param->kernel_params_mp_l_buf32[4] = 0; - device_param->kernel_params_mp_l_buf32[5] = 0; - device_param->kernel_params_mp_l_buf32[6] = 0; - device_param->kernel_params_mp_l_buf32[7] = 0; - device_param->kernel_params_mp_l_buf32[8] = 0; - device_param->kernel_params_mp_l_buf64[9] = 0; + if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function_amp, &device_param->kernel_local_mem_size_amp) == -1) return -1; - device_param->kernel_params_mp_l[0] = NULL; // (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL) - // ? 
&device_param->opencl_d_pws_buf - // : &device_param->opencl_d_pws_amp_buf; - if (device_param->is_cuda == true) - { - device_param->kernel_params_mp_l[1] = &device_param->cuda_d_root_css_buf; - device_param->kernel_params_mp_l[2] = &device_param->cuda_d_markov_css_buf; - } + if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function_amp, &device_param->kernel_dynamic_local_mem_size_amp) == -1) return -1; - if (device_param->is_opencl == true) - { - device_param->kernel_params_mp_l[1] = &device_param->opencl_d_root_css_buf; - device_param->kernel_params_mp_l[2] = &device_param->opencl_d_markov_css_buf; - } + device_param->kernel_preferred_wgs_multiple_amp = device_param->cuda_warp_size; + } - device_param->kernel_params_mp_l[3] = &device_param->kernel_params_mp_l_buf64[3]; - device_param->kernel_params_mp_l[4] = &device_param->kernel_params_mp_l_buf32[4]; - device_param->kernel_params_mp_l[5] = &device_param->kernel_params_mp_l_buf32[5]; - device_param->kernel_params_mp_l[6] = &device_param->kernel_params_mp_l_buf32[6]; - device_param->kernel_params_mp_l[7] = &device_param->kernel_params_mp_l_buf32[7]; - device_param->kernel_params_mp_l[8] = &device_param->kernel_params_mp_l_buf32[8]; - device_param->kernel_params_mp_l[9] = &device_param->kernel_params_mp_l_buf64[9]; + /* + if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL) + { + // nothing to do + } + else + { + for (u32 i = 0; i < 5; i++) + { + //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_amp, i, sizeof (cl_mem), device_param->kernel_params_amp[i]); - device_param->kernel_params_mp_r_buf64[3] = 0; - device_param->kernel_params_mp_r_buf32[4] = 0; - device_param->kernel_params_mp_r_buf32[5] = 0; - device_param->kernel_params_mp_r_buf32[6] = 0; - device_param->kernel_params_mp_r_buf32[7] = 0; - device_param->kernel_params_mp_r_buf64[8] = 0; + //if (CL_rc == -1) return -1; + } - if (device_param->is_cuda == true) - { - device_param->kernel_params_mp_r[0] = &device_param->cuda_d_bfs; - device_param->kernel_params_mp_r[1] = &device_param->cuda_d_root_css_buf; - device_param->kernel_params_mp_r[2] = &device_param->cuda_d_markov_css_buf; - } + for (u32 i = 5; i < 6; i++) + { + //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_amp, i, sizeof (cl_uint), device_param->kernel_params_amp[i]); - if (device_param->is_opencl == true) - { - device_param->kernel_params_mp_r[0] = &device_param->opencl_d_bfs; - device_param->kernel_params_mp_r[1] = &device_param->opencl_d_root_css_buf; - device_param->kernel_params_mp_r[2] = &device_param->opencl_d_markov_css_buf; + //if (CL_rc == -1) return -1; + } + + for (u32 i = 6; i < 7; i++) + { + //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_amp, i, sizeof (cl_ulong), device_param->kernel_params_amp[i]); + + //if (CL_rc == -1) return -1; + } + } + */ } - device_param->kernel_params_mp_r[3] = &device_param->kernel_params_mp_r_buf64[3]; - device_param->kernel_params_mp_r[4] = &device_param->kernel_params_mp_r_buf32[4]; - device_param->kernel_params_mp_r[5] = &device_param->kernel_params_mp_r_buf32[5]; - device_param->kernel_params_mp_r[6] = &device_param->kernel_params_mp_r_buf32[6]; - device_param->kernel_params_mp_r[7] = &device_param->kernel_params_mp_r_buf32[7]; - device_param->kernel_params_mp_r[8] = &device_param->kernel_params_mp_r_buf64[8]; + // zero some data buffers - device_param->kernel_params_amp_buf32[5] = 0; // combs_mode - device_param->kernel_params_amp_buf64[6] = 0; // gid_max + if 
(run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_plain_bufs, device_param->size_plains) == -1) return -1; + if (run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_digests_shown, device_param->size_shown) == -1) return -1; + if (run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_result, device_param->size_results) == -1) return -1; - if (device_param->is_cuda == true) - { - device_param->kernel_params_amp[0] = NULL; // &device_param->cuda_d_pws_buf; - device_param->kernel_params_amp[1] = NULL; // &device_param->cuda_d_pws_amp_buf; - device_param->kernel_params_amp[2] = &device_param->cuda_d_rules_c; - device_param->kernel_params_amp[3] = &device_param->cuda_d_combs_c; - device_param->kernel_params_amp[4] = &device_param->cuda_d_bfs_c; - } + /** + * special buffers + */ - if (device_param->is_opencl == true) + if (user_options->slow_candidates == true) { - device_param->kernel_params_amp[0] = NULL; // &device_param->opencl_d_pws_buf; - device_param->kernel_params_amp[1] = NULL; // &device_param->opencl_d_pws_amp_buf; - device_param->kernel_params_amp[2] = &device_param->opencl_d_rules_c; - device_param->kernel_params_amp[3] = &device_param->opencl_d_combs_c; - device_param->kernel_params_amp[4] = &device_param->opencl_d_bfs_c; + if (run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_rules_c, size_rules_c) == -1) return -1; } - - device_param->kernel_params_amp[5] = &device_param->kernel_params_amp_buf32[5]; - device_param->kernel_params_amp[6] = &device_param->kernel_params_amp_buf64[6]; - - if (device_param->is_cuda == true) + else { - device_param->kernel_params_tm[0] = &device_param->cuda_d_bfs_c; - device_param->kernel_params_tm[1] = &device_param->cuda_d_tm_c; + if (user_options_extra->attack_kern == ATTACK_KERN_STRAIGHT) + { + if (run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_rules_c, size_rules_c) == -1) return -1; + } + else if (user_options_extra->attack_kern == ATTACK_KERN_COMBI) + { + if (run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_combs, size_combs) == -1) return -1; + if (run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_combs_c, size_combs) == -1) return -1; + if (run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_root_css_buf, size_root_css) == -1) return -1; + if (run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_markov_css_buf, size_markov_css) == -1) return -1; + } + else if (user_options_extra->attack_kern == ATTACK_KERN_BF) + { + if (run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_bfs, size_bfs) == -1) return -1; + if (run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_bfs_c, size_bfs) == -1) return -1; + if (run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_tm_c, size_tm) == -1) return -1; + if (run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_root_css_buf, size_root_css) == -1) return -1; + if (run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_markov_css_buf, size_markov_css) == -1) return -1; + } } - if (device_param->is_opencl == true) + if (user_options->slow_candidates == true) { - device_param->kernel_params_tm[0] = &device_param->opencl_d_bfs_c; - device_param->kernel_params_tm[1] = &device_param->opencl_d_tm_c; } - } - - device_param->kernel_params_memset_buf32[1] = 0; // value - device_param->kernel_params_memset_buf64[2] = 0; // gid_max + else + { + if 
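The run_cuda_kernel_bzero calls above are the error-checked helpers that clear freshly allocated device buffers before the session starts; the HIP side of this patch mirrors them one-for-one as run_hip_kernel_bzero. A minimal sketch of what such a mirror can look like, assuming an hc_hipMemsetD8 wrapper in the same style as the other hc_hip* driver-API wrappers (both the wrapper name and the synchronize step are assumptions, not quoted from the patch):

    // Sketch only: zero a device allocation HIP-side, hashcat-style.
    // hc_hipMemsetD8 is assumed to wrap hipMemsetD8 with event logging;
    // the real helper may instead launch the gpu_memset kernel.
    static int run_hip_kernel_bzero (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, hipDeviceptr_t buf, const u64 size)
    {
      if (size == 0) return 0;

      if (hc_hipMemsetD8 (hashcat_ctx, buf, 0, size) == -1) return -1;

      // keep the device stream in step with the CUDA path's behaviour
      if (hc_hipStreamSynchronize (hashcat_ctx, device_param->hip_stream) == -1) return -1;

      return 0;
    }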

-    if (device_param->is_opencl == true)
+    if (user_options->slow_candidates == true)
     {
-      device_param->kernel_params_tm[0] = &device_param->opencl_d_bfs_c;
-      device_param->kernel_params_tm[1] = &device_param->opencl_d_tm_c;
     }
-  }
-
-  device_param->kernel_params_memset_buf32[1] = 0; // value
-  device_param->kernel_params_memset_buf64[2] = 0; // gid_max
+    else
+    {
+      if ((user_options->attack_mode == ATTACK_MODE_HYBRID1) || (user_options->attack_mode == ATTACK_MODE_HYBRID2))
+      {
+        /**
+         * prepare mp
+         */

-  device_param->kernel_params_memset[0] = NULL;
-  device_param->kernel_params_memset[1] = &device_param->kernel_params_memset_buf32[1];
-  device_param->kernel_params_memset[2] = &device_param->kernel_params_memset_buf64[2];
+        if (user_options->attack_mode == ATTACK_MODE_HYBRID1)
+        {
+          device_param->kernel_params_mp_buf32[5] = 0;
+          device_param->kernel_params_mp_buf32[6] = 0;
+          device_param->kernel_params_mp_buf32[7] = 0;

-  device_param->kernel_params_atinit_buf64[1] = 0; // gid_max
+          if (hashconfig->opts_type & OPTS_TYPE_PT_ADD01)     device_param->kernel_params_mp_buf32[5] = full01;
+          if (hashconfig->opts_type & OPTS_TYPE_PT_ADD06)     device_param->kernel_params_mp_buf32[5] = full06;
+          if (hashconfig->opts_type & OPTS_TYPE_PT_ADD80)     device_param->kernel_params_mp_buf32[5] = full80;
+          if (hashconfig->opts_type & OPTS_TYPE_PT_ADDBITS14) device_param->kernel_params_mp_buf32[6] = 1;
+          if (hashconfig->opts_type & OPTS_TYPE_PT_ADDBITS15) device_param->kernel_params_mp_buf32[7] = 1;
+        }
+        else if (user_options->attack_mode == ATTACK_MODE_HYBRID2)
+        {
+          device_param->kernel_params_mp_buf32[5] = 0;
+          device_param->kernel_params_mp_buf32[6] = 0;
+          device_param->kernel_params_mp_buf32[7] = 0;
+        }

-  device_param->kernel_params_atinit[0] = NULL;
-  device_param->kernel_params_atinit[1] = &device_param->kernel_params_atinit_buf64[1];
+        //for (u32 i = 0; i < 3; i++) { CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_mp, i, sizeof (cl_mem), device_param->kernel_params_mp[i]); if (CL_rc == -1) return -1; }
+      }
+      else if (user_options->attack_mode == ATTACK_MODE_BF)
+      {
+        /**
+         * prepare mp_r and mp_l
+         */

-  device_param->kernel_params_decompress_buf64[3] = 0; // gid_max
+        device_param->kernel_params_mp_l_buf32[6] = 0;
+        device_param->kernel_params_mp_l_buf32[7] = 0;
+        device_param->kernel_params_mp_l_buf32[8] = 0;

-  if (device_param->is_cuda == true)
-  {
-    device_param->kernel_params_decompress[0] = NULL; // &device_param->cuda_d_pws_idx;
-    device_param->kernel_params_decompress[1] = NULL; // &device_param->cuda_d_pws_comp_buf;
-    device_param->kernel_params_decompress[2] = NULL; // (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
-                                                      // ? &device_param->cuda_d_pws_buf
-                                                      // : &device_param->cuda_d_pws_amp_buf;
-  }
+        if (hashconfig->opts_type & OPTS_TYPE_PT_ADD01)     device_param->kernel_params_mp_l_buf32[6] = full01;
+        if (hashconfig->opts_type & OPTS_TYPE_PT_ADD06)     device_param->kernel_params_mp_l_buf32[6] = full06;
+        if (hashconfig->opts_type & OPTS_TYPE_PT_ADD80)     device_param->kernel_params_mp_l_buf32[6] = full80;
+        if (hashconfig->opts_type & OPTS_TYPE_PT_ADDBITS14) device_param->kernel_params_mp_l_buf32[7] = 1;
+        if (hashconfig->opts_type & OPTS_TYPE_PT_ADDBITS15) device_param->kernel_params_mp_l_buf32[8] = 1;

-  if (device_param->is_opencl == true)
-  {
-    device_param->kernel_params_decompress[0] = NULL; // &device_param->opencl_d_pws_idx;
-    device_param->kernel_params_decompress[1] = NULL; // &device_param->opencl_d_pws_comp_buf;
-    device_param->kernel_params_decompress[2] = NULL; // (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
-                                                      // ? &device_param->opencl_d_pws_buf
-                                                      // : &device_param->opencl_d_pws_amp_buf;
+        //for (u32 i = 0; i < 3; i++) { CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_mp_l, i, sizeof (cl_mem), device_param->kernel_params_mp_l[i]); if (CL_rc == -1) return -1; }
+        //for (u32 i = 0; i < 3; i++) { CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_mp_r, i, sizeof (cl_mem), device_param->kernel_params_mp_r[i]); if (CL_rc == -1) return -1; }
+      }
+    }
   }
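A note on the constants used in the mp preparation above: full01, full06 and full80 are plain byte fills, the single padding byte a hash mode appends replicated across all four bytes of a u32 so the Markov kernel can append it with one word store. OPTS_TYPE_PT_ADDBITS14 / OPTS_TYPE_PT_ADDBITS15 flag modes that keep the message bit length in word 14 or 15 of the final block (MD5-style versus SHA1-style layouts). The values, as defined elsewhere in backend.c:

    static const u32 full01 = 0x01010101;  // OPTS_TYPE_PT_ADD01: append 0x01 bytes
    static const u32 full06 = 0x06060606;  // OPTS_TYPE_PT_ADD06: append 0x06 bytes
    static const u32 full80 = 0x80808080;  // OPTS_TYPE_PT_ADD80: append the 0x80 MD/SHA padding byte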
"m%05u_s%02d", kern_type, 16); - if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function3, device_param->cuda_module, kernel_name) == -1) return -1; + if (hc_hipModuleGetFunction (hashcat_ctx, &device_param->hip_function3, device_param->hip_module, kernel_name) == -1) return -1; - if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function3, &device_param->kernel_wgs3) == -1) return -1; + if (get_hip_kernel_wgs (hashcat_ctx, device_param->hip_function3, &device_param->kernel_wgs3) == -1) return -1; - if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function3, &device_param->kernel_local_mem_size3) == -1) return -1; + if (get_hip_kernel_local_mem_size (hashcat_ctx, device_param->hip_function3, &device_param->kernel_local_mem_size3) == -1) return -1; - if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function3, &device_param->kernel_dynamic_local_mem_size3) == -1) return -1; + if (get_hip_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->hip_function3, &device_param->kernel_dynamic_local_mem_size3) == -1) return -1; - device_param->kernel_preferred_wgs_multiple3 = device_param->cuda_warp_size; + device_param->kernel_preferred_wgs_multiple3 = device_param->hip_warp_size; } else { snprintf (kernel_name, sizeof (kernel_name), "m%05u_sxx", kern_type); - if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function4, device_param->cuda_module, kernel_name) == -1) return -1; + if (hc_hipModuleGetFunction (hashcat_ctx, &device_param->hip_function4, device_param->hip_module, kernel_name) == -1) return -1; - if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function4, &device_param->kernel_wgs4) == -1) return -1; + if (get_hip_kernel_wgs (hashcat_ctx, device_param->hip_function4, &device_param->kernel_wgs4) == -1) return -1; - if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function4, &device_param->kernel_local_mem_size4) == -1) return -1; + if (get_hip_kernel_local_mem_size (hashcat_ctx, device_param->hip_function4, &device_param->kernel_local_mem_size4) == -1) return -1; - if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function4, &device_param->kernel_dynamic_local_mem_size4) == -1) return -1; + if (get_hip_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->hip_function4, &device_param->kernel_dynamic_local_mem_size4) == -1) return -1; - device_param->kernel_preferred_wgs_multiple4 = device_param->cuda_warp_size; + device_param->kernel_preferred_wgs_multiple4 = device_param->hip_warp_size; } } else @@ -8885,57 +12344,57 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) snprintf (kernel_name, sizeof (kernel_name), "m%05u_m%02d", kern_type, 4); - if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function1, device_param->cuda_module, kernel_name) == -1) return -1; + if (hc_hipModuleGetFunction (hashcat_ctx, &device_param->hip_function1, device_param->hip_module, kernel_name) == -1) return -1; - if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function1, &device_param->kernel_wgs1) == -1) return -1; + if (get_hip_kernel_wgs (hashcat_ctx, device_param->hip_function1, &device_param->kernel_wgs1) == -1) return -1; - if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function1, &device_param->kernel_local_mem_size1) == -1) return -1; + if (get_hip_kernel_local_mem_size (hashcat_ctx, device_param->hip_function1, &device_param->kernel_local_mem_size1) == -1) return -1; - if (get_cuda_kernel_dynamic_local_mem_size 
(hashcat_ctx, device_param->cuda_function1, &device_param->kernel_dynamic_local_mem_size1) == -1) return -1; + if (get_hip_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->hip_function1, &device_param->kernel_dynamic_local_mem_size1) == -1) return -1; - device_param->kernel_preferred_wgs_multiple1 = device_param->cuda_warp_size; + device_param->kernel_preferred_wgs_multiple1 = device_param->hip_warp_size; // kernel2 snprintf (kernel_name, sizeof (kernel_name), "m%05u_m%02d", kern_type, 8); - if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function2, device_param->cuda_module, kernel_name) == -1) return -1; + if (hc_hipModuleGetFunction (hashcat_ctx, &device_param->hip_function2, device_param->hip_module, kernel_name) == -1) return -1; - if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function2, &device_param->kernel_wgs2) == -1) return -1; + if (get_hip_kernel_wgs (hashcat_ctx, device_param->hip_function2, &device_param->kernel_wgs2) == -1) return -1; - if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function2, &device_param->kernel_local_mem_size2) == -1) return -1; + if (get_hip_kernel_local_mem_size (hashcat_ctx, device_param->hip_function2, &device_param->kernel_local_mem_size2) == -1) return -1; - if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function2, &device_param->kernel_dynamic_local_mem_size2) == -1) return -1; + if (get_hip_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->hip_function2, &device_param->kernel_dynamic_local_mem_size2) == -1) return -1; - device_param->kernel_preferred_wgs_multiple2 = device_param->cuda_warp_size; + device_param->kernel_preferred_wgs_multiple2 = device_param->hip_warp_size; // kernel3 snprintf (kernel_name, sizeof (kernel_name), "m%05u_m%02d", kern_type, 16); - if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function3, device_param->cuda_module, kernel_name) == -1) return -1; + if (hc_hipModuleGetFunction (hashcat_ctx, &device_param->hip_function3, device_param->hip_module, kernel_name) == -1) return -1; - if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function3, &device_param->kernel_wgs3) == -1) return -1; + if (get_hip_kernel_wgs (hashcat_ctx, device_param->hip_function3, &device_param->kernel_wgs3) == -1) return -1; - if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function3, &device_param->kernel_local_mem_size3) == -1) return -1; + if (get_hip_kernel_local_mem_size (hashcat_ctx, device_param->hip_function3, &device_param->kernel_local_mem_size3) == -1) return -1; - if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function3, &device_param->kernel_dynamic_local_mem_size3) == -1) return -1; + if (get_hip_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->hip_function3, &device_param->kernel_dynamic_local_mem_size3) == -1) return -1; - device_param->kernel_preferred_wgs_multiple3 = device_param->cuda_warp_size; + device_param->kernel_preferred_wgs_multiple3 = device_param->hip_warp_size; } else { snprintf (kernel_name, sizeof (kernel_name), "m%05u_mxx", kern_type); - if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function4, device_param->cuda_module, kernel_name) == -1) return -1; + if (hc_hipModuleGetFunction (hashcat_ctx, &device_param->hip_function4, device_param->hip_module, kernel_name) == -1) return -1; - if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function4, &device_param->kernel_wgs4) == -1) return -1; + if (get_hip_kernel_wgs (hashcat_ctx, 
device_param->hip_function4, &device_param->kernel_wgs4) == -1) return -1; - if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function4, &device_param->kernel_local_mem_size4) == -1) return -1; + if (get_hip_kernel_local_mem_size (hashcat_ctx, device_param->hip_function4, &device_param->kernel_local_mem_size4) == -1) return -1; - if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function4, &device_param->kernel_dynamic_local_mem_size4) == -1) return -1; + if (get_hip_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->hip_function4, &device_param->kernel_dynamic_local_mem_size4) == -1) return -1; - device_param->kernel_preferred_wgs_multiple4 = device_param->cuda_warp_size; + device_param->kernel_preferred_wgs_multiple4 = device_param->hip_warp_size; } } @@ -8950,15 +12409,15 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) { snprintf (kernel_name, sizeof (kernel_name), "m%05u_tm", kern_type); - if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function_tm, device_param->cuda_module, kernel_name) == -1) return -1; + if (hc_hipModuleGetFunction (hashcat_ctx, &device_param->hip_function_tm, device_param->hip_module, kernel_name) == -1) return -1; - if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function_tm, &device_param->kernel_wgs_tm) == -1) return -1; + if (get_hip_kernel_wgs (hashcat_ctx, device_param->hip_function_tm, &device_param->kernel_wgs_tm) == -1) return -1; - if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function_tm, &device_param->kernel_local_mem_size_tm) == -1) return -1; + if (get_hip_kernel_local_mem_size (hashcat_ctx, device_param->hip_function_tm, &device_param->kernel_local_mem_size_tm) == -1) return -1; - if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function_tm, &device_param->kernel_dynamic_local_mem_size_tm) == -1) return -1; + if (get_hip_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->hip_function_tm, &device_param->kernel_dynamic_local_mem_size_tm) == -1) return -1; - device_param->kernel_preferred_wgs_multiple_tm = device_param->cuda_warp_size; + device_param->kernel_preferred_wgs_multiple_tm = device_param->hip_warp_size; } } } @@ -8969,43 +12428,43 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) snprintf (kernel_name, sizeof (kernel_name), "m%05u_init", kern_type); - if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function1, device_param->cuda_module, kernel_name) == -1) return -1; + if (hc_hipModuleGetFunction (hashcat_ctx, &device_param->hip_function1, device_param->hip_module, kernel_name) == -1) return -1; - if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function1, &device_param->kernel_wgs1) == -1) return -1; + if (get_hip_kernel_wgs (hashcat_ctx, device_param->hip_function1, &device_param->kernel_wgs1) == -1) return -1; - if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function1, &device_param->kernel_local_mem_size1) == -1) return -1; + if (get_hip_kernel_local_mem_size (hashcat_ctx, device_param->hip_function1, &device_param->kernel_local_mem_size1) == -1) return -1; - if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function1, &device_param->kernel_dynamic_local_mem_size1) == -1) return -1; + if (get_hip_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->hip_function1, &device_param->kernel_dynamic_local_mem_size1) == -1) return -1; - device_param->kernel_preferred_wgs_multiple1 = device_param->cuda_warp_size; + 
device_param->kernel_preferred_wgs_multiple1 = device_param->hip_warp_size; // kernel2 snprintf (kernel_name, sizeof (kernel_name), "m%05u_loop", kern_type); - if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function2, device_param->cuda_module, kernel_name) == -1) return -1; + if (hc_hipModuleGetFunction (hashcat_ctx, &device_param->hip_function2, device_param->hip_module, kernel_name) == -1) return -1; - if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function2, &device_param->kernel_wgs2) == -1) return -1; + if (get_hip_kernel_wgs (hashcat_ctx, device_param->hip_function2, &device_param->kernel_wgs2) == -1) return -1; - if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function2, &device_param->kernel_local_mem_size2) == -1) return -1; + if (get_hip_kernel_local_mem_size (hashcat_ctx, device_param->hip_function2, &device_param->kernel_local_mem_size2) == -1) return -1; - if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function2, &device_param->kernel_dynamic_local_mem_size2) == -1) return -1; + if (get_hip_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->hip_function2, &device_param->kernel_dynamic_local_mem_size2) == -1) return -1; - device_param->kernel_preferred_wgs_multiple2 = device_param->cuda_warp_size; + device_param->kernel_preferred_wgs_multiple2 = device_param->hip_warp_size; // kernel3 snprintf (kernel_name, sizeof (kernel_name), "m%05u_comp", kern_type); - if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function3, device_param->cuda_module, kernel_name) == -1) return -1; + if (hc_hipModuleGetFunction (hashcat_ctx, &device_param->hip_function3, device_param->hip_module, kernel_name) == -1) return -1; - if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function3, &device_param->kernel_wgs3) == -1) return -1; + if (get_hip_kernel_wgs (hashcat_ctx, device_param->hip_function3, &device_param->kernel_wgs3) == -1) return -1; - if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function3, &device_param->kernel_local_mem_size3) == -1) return -1; + if (get_hip_kernel_local_mem_size (hashcat_ctx, device_param->hip_function3, &device_param->kernel_local_mem_size3) == -1) return -1; - if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function3, &device_param->kernel_dynamic_local_mem_size3) == -1) return -1; + if (get_hip_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->hip_function3, &device_param->kernel_dynamic_local_mem_size3) == -1) return -1; - device_param->kernel_preferred_wgs_multiple3 = device_param->cuda_warp_size; + device_param->kernel_preferred_wgs_multiple3 = device_param->hip_warp_size; if (hashconfig->opts_type & OPTS_TYPE_LOOP_EXTENDED) { @@ -9013,15 +12472,15 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) snprintf (kernel_name, sizeof (kernel_name), "m%05u_loop_extended", kern_type); - if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function2e, device_param->cuda_module, kernel_name) == -1) return -1; + if (hc_hipModuleGetFunction (hashcat_ctx, &device_param->hip_function2e, device_param->hip_module, kernel_name) == -1) return -1; - if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function2e, &device_param->kernel_wgs2e) == -1) return -1; + if (get_hip_kernel_wgs (hashcat_ctx, device_param->hip_function2e, &device_param->kernel_wgs2e) == -1) return -1; - if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function2e, &device_param->kernel_local_mem_size2e) == -1) return -1; 
+ if (get_hip_kernel_local_mem_size (hashcat_ctx, device_param->hip_function2e, &device_param->kernel_local_mem_size2e) == -1) return -1; - if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function2e, &device_param->kernel_dynamic_local_mem_size2e) == -1) return -1; + if (get_hip_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->hip_function2e, &device_param->kernel_dynamic_local_mem_size2e) == -1) return -1; - device_param->kernel_preferred_wgs_multiple2e = device_param->cuda_warp_size; + device_param->kernel_preferred_wgs_multiple2e = device_param->hip_warp_size; } // kernel12 @@ -9030,15 +12489,15 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) { snprintf (kernel_name, sizeof (kernel_name), "m%05u_hook12", kern_type); - if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function12, device_param->cuda_module, kernel_name) == -1) return -1; + if (hc_hipModuleGetFunction (hashcat_ctx, &device_param->hip_function12, device_param->hip_module, kernel_name) == -1) return -1; - if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function12, &device_param->kernel_wgs12) == -1) return -1; + if (get_hip_kernel_wgs (hashcat_ctx, device_param->hip_function12, &device_param->kernel_wgs12) == -1) return -1; - if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function12, &device_param->kernel_local_mem_size12) == -1) return -1; + if (get_hip_kernel_local_mem_size (hashcat_ctx, device_param->hip_function12, &device_param->kernel_local_mem_size12) == -1) return -1; - if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function12, &device_param->kernel_dynamic_local_mem_size12) == -1) return -1; + if (get_hip_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->hip_function12, &device_param->kernel_dynamic_local_mem_size12) == -1) return -1; - device_param->kernel_preferred_wgs_multiple12 = device_param->cuda_warp_size; + device_param->kernel_preferred_wgs_multiple12 = device_param->hip_warp_size; } // kernel23 @@ -9047,15 +12506,15 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) { snprintf (kernel_name, sizeof (kernel_name), "m%05u_hook23", kern_type); - if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function23, device_param->cuda_module, kernel_name) == -1) return -1; + if (hc_hipModuleGetFunction (hashcat_ctx, &device_param->hip_function23, device_param->hip_module, kernel_name) == -1) return -1; - if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function23, &device_param->kernel_wgs23) == -1) return -1; + if (get_hip_kernel_wgs (hashcat_ctx, device_param->hip_function23, &device_param->kernel_wgs23) == -1) return -1; - if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function23, &device_param->kernel_local_mem_size23) == -1) return -1; + if (get_hip_kernel_local_mem_size (hashcat_ctx, device_param->hip_function23, &device_param->kernel_local_mem_size23) == -1) return -1; - if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function23, &device_param->kernel_dynamic_local_mem_size23) == -1) return -1; + if (get_hip_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->hip_function23, &device_param->kernel_dynamic_local_mem_size23) == -1) return -1; - device_param->kernel_preferred_wgs_multiple23 = device_param->cuda_warp_size; + device_param->kernel_preferred_wgs_multiple23 = device_param->hip_warp_size; } // init2 @@ -9064,15 +12523,15 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) { snprintf (kernel_name, 
sizeof (kernel_name), "m%05u_init2", kern_type); - if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function_init2, device_param->cuda_module, kernel_name) == -1) return -1; + if (hc_hipModuleGetFunction (hashcat_ctx, &device_param->hip_function_init2, device_param->hip_module, kernel_name) == -1) return -1; - if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function_init2, &device_param->kernel_wgs_init2) == -1) return -1; + if (get_hip_kernel_wgs (hashcat_ctx, device_param->hip_function_init2, &device_param->kernel_wgs_init2) == -1) return -1; - if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function_init2, &device_param->kernel_local_mem_size_init2) == -1) return -1; + if (get_hip_kernel_local_mem_size (hashcat_ctx, device_param->hip_function_init2, &device_param->kernel_local_mem_size_init2) == -1) return -1; - if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function_init2, &device_param->kernel_dynamic_local_mem_size_init2) == -1) return -1; + if (get_hip_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->hip_function_init2, &device_param->kernel_dynamic_local_mem_size_init2) == -1) return -1; - device_param->kernel_preferred_wgs_multiple_init2 = device_param->cuda_warp_size; + device_param->kernel_preferred_wgs_multiple_init2 = device_param->hip_warp_size; } // loop2 @@ -9081,15 +12540,15 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) { snprintf (kernel_name, sizeof (kernel_name), "m%05u_loop2", kern_type); - if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function_loop2, device_param->cuda_module, kernel_name) == -1) return -1; + if (hc_hipModuleGetFunction (hashcat_ctx, &device_param->hip_function_loop2, device_param->hip_module, kernel_name) == -1) return -1; - if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function_loop2, &device_param->kernel_wgs_loop2) == -1) return -1; + if (get_hip_kernel_wgs (hashcat_ctx, device_param->hip_function_loop2, &device_param->kernel_wgs_loop2) == -1) return -1; - if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function_loop2, &device_param->kernel_local_mem_size_loop2) == -1) return -1; + if (get_hip_kernel_local_mem_size (hashcat_ctx, device_param->hip_function_loop2, &device_param->kernel_local_mem_size_loop2) == -1) return -1; - if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function_loop2, &device_param->kernel_dynamic_local_mem_size_loop2) == -1) return -1; + if (get_hip_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->hip_function_loop2, &device_param->kernel_dynamic_local_mem_size_loop2) == -1) return -1; - device_param->kernel_preferred_wgs_multiple_loop2 = device_param->cuda_warp_size; + device_param->kernel_preferred_wgs_multiple_loop2 = device_param->hip_warp_size; } // aux1 @@ -9098,15 +12557,15 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) { snprintf (kernel_name, sizeof (kernel_name), "m%05u_aux1", kern_type); - if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function_aux1, device_param->cuda_module, kernel_name) == -1) return -1; + if (hc_hipModuleGetFunction (hashcat_ctx, &device_param->hip_function_aux1, device_param->hip_module, kernel_name) == -1) return -1; - if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function_aux1, &device_param->kernel_wgs_aux1) == -1) return -1; + if (get_hip_kernel_wgs (hashcat_ctx, device_param->hip_function_aux1, &device_param->kernel_wgs_aux1) == -1) return -1; - if (get_cuda_kernel_local_mem_size 
(hashcat_ctx, device_param->cuda_function_aux1, &device_param->kernel_local_mem_size_aux1) == -1) return -1; + if (get_hip_kernel_local_mem_size (hashcat_ctx, device_param->hip_function_aux1, &device_param->kernel_local_mem_size_aux1) == -1) return -1; - if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function_aux1, &device_param->kernel_dynamic_local_mem_size_aux1) == -1) return -1; + if (get_hip_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->hip_function_aux1, &device_param->kernel_dynamic_local_mem_size_aux1) == -1) return -1; - device_param->kernel_preferred_wgs_multiple_aux1 = device_param->cuda_warp_size; + device_param->kernel_preferred_wgs_multiple_aux1 = device_param->hip_warp_size; } // aux2 @@ -9115,15 +12574,15 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) { snprintf (kernel_name, sizeof (kernel_name), "m%05u_aux2", kern_type); - if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function_aux2, device_param->cuda_module, kernel_name) == -1) return -1; + if (hc_hipModuleGetFunction (hashcat_ctx, &device_param->hip_function_aux2, device_param->hip_module, kernel_name) == -1) return -1; - if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function_aux2, &device_param->kernel_wgs_aux2) == -1) return -1; + if (get_hip_kernel_wgs (hashcat_ctx, device_param->hip_function_aux2, &device_param->kernel_wgs_aux2) == -1) return -1; - if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function_aux2, &device_param->kernel_local_mem_size_aux2) == -1) return -1; + if (get_hip_kernel_local_mem_size (hashcat_ctx, device_param->hip_function_aux2, &device_param->kernel_local_mem_size_aux2) == -1) return -1; - if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function_aux2, &device_param->kernel_dynamic_local_mem_size_aux2) == -1) return -1; + if (get_hip_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->hip_function_aux2, &device_param->kernel_dynamic_local_mem_size_aux2) == -1) return -1; - device_param->kernel_preferred_wgs_multiple_aux2 = device_param->cuda_warp_size; + device_param->kernel_preferred_wgs_multiple_aux2 = device_param->hip_warp_size; } // aux3 @@ -9132,15 +12591,15 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) { snprintf (kernel_name, sizeof (kernel_name), "m%05u_aux3", kern_type); - if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function_aux3, device_param->cuda_module, kernel_name) == -1) return -1; + if (hc_hipModuleGetFunction (hashcat_ctx, &device_param->hip_function_aux3, device_param->hip_module, kernel_name) == -1) return -1; - if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function_aux3, &device_param->kernel_wgs_aux3) == -1) return -1; + if (get_hip_kernel_wgs (hashcat_ctx, device_param->hip_function_aux3, &device_param->kernel_wgs_aux3) == -1) return -1; - if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function_aux3, &device_param->kernel_local_mem_size_aux3) == -1) return -1; + if (get_hip_kernel_local_mem_size (hashcat_ctx, device_param->hip_function_aux3, &device_param->kernel_local_mem_size_aux3) == -1) return -1; - if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function_aux3, &device_param->kernel_dynamic_local_mem_size_aux3) == -1) return -1; + if (get_hip_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->hip_function_aux3, &device_param->kernel_dynamic_local_mem_size_aux3) == -1) return -1; - device_param->kernel_preferred_wgs_multiple_aux3 = 
device_param->cuda_warp_size; + device_param->kernel_preferred_wgs_multiple_aux3 = device_param->hip_warp_size; } // aux4 @@ -9149,15 +12608,15 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) { snprintf (kernel_name, sizeof (kernel_name), "m%05u_aux4", kern_type); - if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function_aux4, device_param->cuda_module, kernel_name) == -1) return -1; + if (hc_hipModuleGetFunction (hashcat_ctx, &device_param->hip_function_aux4, device_param->hip_module, kernel_name) == -1) return -1; - if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function_aux4, &device_param->kernel_wgs_aux4) == -1) return -1; + if (get_hip_kernel_wgs (hashcat_ctx, device_param->hip_function_aux4, &device_param->kernel_wgs_aux4) == -1) return -1; - if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function_aux4, &device_param->kernel_local_mem_size_aux4) == -1) return -1; + if (get_hip_kernel_local_mem_size (hashcat_ctx, device_param->hip_function_aux4, &device_param->kernel_local_mem_size_aux4) == -1) return -1; - if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function_aux4, &device_param->kernel_dynamic_local_mem_size_aux4) == -1) return -1; + if (get_hip_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->hip_function_aux4, &device_param->kernel_dynamic_local_mem_size_aux4) == -1) return -1; - device_param->kernel_preferred_wgs_multiple_aux4 = device_param->cuda_warp_size; + device_param->kernel_preferred_wgs_multiple_aux4 = device_param->hip_warp_size; } } @@ -9177,27 +12636,27 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) { // mp_l - if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function_mp_l, device_param->cuda_module_mp, "l_markov") == -1) return -1; + if (hc_hipModuleGetFunction (hashcat_ctx, &device_param->hip_function_mp_l, device_param->hip_module_mp, "l_markov") == -1) return -1; - if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function_mp_l, &device_param->kernel_wgs_mp_l) == -1) return -1; + if (get_hip_kernel_wgs (hashcat_ctx, device_param->hip_function_mp_l, &device_param->kernel_wgs_mp_l) == -1) return -1; - if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function_mp_l, &device_param->kernel_local_mem_size_mp_l) == -1) return -1; + if (get_hip_kernel_local_mem_size (hashcat_ctx, device_param->hip_function_mp_l, &device_param->kernel_local_mem_size_mp_l) == -1) return -1; - if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function_mp_l, &device_param->kernel_dynamic_local_mem_size_mp_l) == -1) return -1; + if (get_hip_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->hip_function_mp_l, &device_param->kernel_dynamic_local_mem_size_mp_l) == -1) return -1; - device_param->kernel_preferred_wgs_multiple_mp_l = device_param->cuda_warp_size; + device_param->kernel_preferred_wgs_multiple_mp_l = device_param->hip_warp_size; // mp_r - if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function_mp_r, device_param->cuda_module_mp, "r_markov") == -1) return -1; + if (hc_hipModuleGetFunction (hashcat_ctx, &device_param->hip_function_mp_r, device_param->hip_module_mp, "r_markov") == -1) return -1; - if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function_mp_r, &device_param->kernel_wgs_mp_r) == -1) return -1; + if (get_hip_kernel_wgs (hashcat_ctx, device_param->hip_function_mp_r, &device_param->kernel_wgs_mp_r) == -1) return -1; - if (get_cuda_kernel_local_mem_size (hashcat_ctx, 
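Every get_hip_kernel_* probe in this section is the HIP twin of an existing get_cuda_kernel_* helper, with cuFuncGetAttribute swapped for hipFuncGetAttribute; HIP deliberately mirrors the CUDA driver API here, which is what lets the patch convert these hunks line for line. A plausible sketch of the workgroup-size probe, assuming an hc_hipFuncGetAttribute wrapper in ext_hip.h (the wrapper name is an assumption; the HIP_FUNC_ATTRIBUTE_* enums are real HIP API):

    // Sketch: query the largest workgroup size the compiled kernel supports.
    static int get_hip_kernel_wgs (hashcat_ctx_t *hashcat_ctx, hipFunction_t function, u32 *result)
    {
      int max_threads_per_block = 0;

      if (hc_hipFuncGetAttribute (hashcat_ctx, &max_threads_per_block, HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function) == -1) return -1;

      *result = (u32) max_threads_per_block;

      return 0;
    }

The local and dynamic shared-memory probes follow the same shape with HIP_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES and HIP_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES.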
@@ -9177,27 +12636,27 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
        {
          // mp_l

-          if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function_mp_l, device_param->cuda_module_mp, "l_markov") == -1) return -1;
+          if (hc_hipModuleGetFunction (hashcat_ctx, &device_param->hip_function_mp_l, device_param->hip_module_mp, "l_markov") == -1) return -1;

-          if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function_mp_l, &device_param->kernel_wgs_mp_l) == -1) return -1;
+          if (get_hip_kernel_wgs (hashcat_ctx, device_param->hip_function_mp_l, &device_param->kernel_wgs_mp_l) == -1) return -1;

-          if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function_mp_l, &device_param->kernel_local_mem_size_mp_l) == -1) return -1;
+          if (get_hip_kernel_local_mem_size (hashcat_ctx, device_param->hip_function_mp_l, &device_param->kernel_local_mem_size_mp_l) == -1) return -1;

-          if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function_mp_l, &device_param->kernel_dynamic_local_mem_size_mp_l) == -1) return -1;
+          if (get_hip_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->hip_function_mp_l, &device_param->kernel_dynamic_local_mem_size_mp_l) == -1) return -1;

-          device_param->kernel_preferred_wgs_multiple_mp_l = device_param->cuda_warp_size;
+          device_param->kernel_preferred_wgs_multiple_mp_l = device_param->hip_warp_size;

          // mp_r

-          if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function_mp_r, device_param->cuda_module_mp, "r_markov") == -1) return -1;
+          if (hc_hipModuleGetFunction (hashcat_ctx, &device_param->hip_function_mp_r, device_param->hip_module_mp, "r_markov") == -1) return -1;

-          if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function_mp_r, &device_param->kernel_wgs_mp_r) == -1) return -1;
+          if (get_hip_kernel_wgs (hashcat_ctx, device_param->hip_function_mp_r, &device_param->kernel_wgs_mp_r) == -1) return -1;

-          if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function_mp_r, &device_param->kernel_local_mem_size_mp_r) == -1) return -1;
+          if (get_hip_kernel_local_mem_size (hashcat_ctx, device_param->hip_function_mp_r, &device_param->kernel_local_mem_size_mp_r) == -1) return -1;

-          if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function_mp_r, &device_param->kernel_dynamic_local_mem_size_mp_r) == -1) return -1;
+          if (get_hip_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->hip_function_mp_r, &device_param->kernel_dynamic_local_mem_size_mp_r) == -1) return -1;

-          device_param->kernel_preferred_wgs_multiple_mp_r = device_param->cuda_warp_size;
+          device_param->kernel_preferred_wgs_multiple_mp_r = device_param->hip_warp_size;

          if (user_options->attack_mode == ATTACK_MODE_BF)
          {
@@ -9210,27 +12669,27 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
        }
        else if (user_options->attack_mode == ATTACK_MODE_HYBRID1)
        {
-          if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function_mp, device_param->cuda_module_mp, "C_markov") == -1) return -1;
+          if (hc_hipModuleGetFunction (hashcat_ctx, &device_param->hip_function_mp, device_param->hip_module_mp, "C_markov") == -1) return -1;

-          if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function_mp, &device_param->kernel_wgs_mp) == -1) return -1;
+          if (get_hip_kernel_wgs (hashcat_ctx, device_param->hip_function_mp, &device_param->kernel_wgs_mp) == -1) return -1;

-          if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function_mp, &device_param->kernel_local_mem_size_mp) == -1) return -1;
+          if (get_hip_kernel_local_mem_size (hashcat_ctx, device_param->hip_function_mp, &device_param->kernel_local_mem_size_mp) == -1) return -1;

-          if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function_mp, &device_param->kernel_dynamic_local_mem_size_mp) == -1) return -1;
+          if (get_hip_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->hip_function_mp, &device_param->kernel_dynamic_local_mem_size_mp) == -1) return -1;

-          device_param->kernel_preferred_wgs_multiple_mp = device_param->cuda_warp_size;
+          device_param->kernel_preferred_wgs_multiple_mp = device_param->hip_warp_size;
        }
        else if (user_options->attack_mode == ATTACK_MODE_HYBRID2)
        {
-          if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function_mp, device_param->cuda_module_mp, "C_markov") == -1) return -1;
+          if (hc_hipModuleGetFunction (hashcat_ctx, &device_param->hip_function_mp, device_param->hip_module_mp, "C_markov") == -1) return -1;

-          if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function_mp, &device_param->kernel_wgs_mp) == -1) return -1;
+          if (get_hip_kernel_wgs (hashcat_ctx, device_param->hip_function_mp, &device_param->kernel_wgs_mp) == -1) return -1;

-          if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function_mp, &device_param->kernel_local_mem_size_mp) == -1) return -1;
+          if (get_hip_kernel_local_mem_size (hashcat_ctx, device_param->hip_function_mp, &device_param->kernel_local_mem_size_mp) == -1) return -1;

-          if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function_mp, &device_param->kernel_dynamic_local_mem_size_mp) == -1) return -1;
+          if (get_hip_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->hip_function_mp, &device_param->kernel_dynamic_local_mem_size_mp) == -1) return -1;

-          device_param->kernel_preferred_wgs_multiple_mp = device_param->cuda_warp_size;
+          device_param->kernel_preferred_wgs_multiple_mp = device_param->hip_warp_size;
        }
      }
@@ -9245,15 +12704,15 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
      }
      else
      {
-        if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function_amp, device_param->cuda_module_amp, "amp") == -1) return -1;
+        if (hc_hipModuleGetFunction (hashcat_ctx, &device_param->hip_function_amp, device_param->hip_module_amp, "amp") == -1) return -1;

-        if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function_amp, &device_param->kernel_wgs_amp) == -1) return -1;
+        if (get_hip_kernel_wgs (hashcat_ctx, device_param->hip_function_amp, &device_param->kernel_wgs_amp) == -1) return -1;

-        if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function_amp, &device_param->kernel_local_mem_size_amp) == -1) return -1;
+        if (get_hip_kernel_local_mem_size (hashcat_ctx, device_param->hip_function_amp, &device_param->kernel_local_mem_size_amp) == -1) return -1;

-        if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function_amp, &device_param->kernel_dynamic_local_mem_size_amp) == -1) return -1;
+        if (get_hip_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->hip_function_amp, &device_param->kernel_dynamic_local_mem_size_amp) == -1) return -1;

-        device_param->kernel_preferred_wgs_multiple_amp = device_param->cuda_warp_size;
+        device_param->kernel_preferred_wgs_multiple_amp = device_param->hip_warp_size;
      }

      /*
@@ -9289,9 +12748,9 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
      // zero some data buffers

-      if (run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_plain_bufs, device_param->size_plains) == -1) return -1;
-      if (run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_digests_shown, device_param->size_shown) == -1) return -1;
-      if (run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_result, device_param->size_results) == -1) return -1;
+      if (run_hip_kernel_bzero (hashcat_ctx, device_param, device_param->hip_d_plain_bufs, device_param->size_plains) == -1) return -1;
+      if (run_hip_kernel_bzero (hashcat_ctx, device_param, device_param->hip_d_digests_shown, device_param->size_shown) == -1) return -1;
+      if (run_hip_kernel_bzero (hashcat_ctx, device_param, device_param->hip_d_result, device_param->size_results) == -1) return -1;

      /**
       * special buffers
@@ -9299,28 +12758,28 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
       */

      if (user_options->slow_candidates == true)
      {
-        if (run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_rules_c, size_rules_c) == -1) return -1;
+        if (run_hip_kernel_bzero (hashcat_ctx, device_param, device_param->hip_d_rules_c, size_rules_c) == -1) return -1;
      }
      else
      {
        if (user_options_extra->attack_kern == ATTACK_KERN_STRAIGHT)
        {
-          if (run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_rules_c, size_rules_c) == -1) return -1;
+          if (run_hip_kernel_bzero (hashcat_ctx, device_param, device_param->hip_d_rules_c, size_rules_c) == -1) return -1;
        }
        else if (user_options_extra->attack_kern == ATTACK_KERN_COMBI)
        {
-          if (run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_combs, size_combs) == -1) return -1;
-          if (run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_combs_c, size_combs) == -1) return -1;
-          if (run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_root_css_buf, size_root_css) == -1) return -1;
-          if (run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_markov_css_buf, size_markov_css) == -1) return -1;
+          if (run_hip_kernel_bzero (hashcat_ctx, device_param, device_param->hip_d_combs, size_combs) == -1) return -1;
+          if (run_hip_kernel_bzero (hashcat_ctx, device_param, device_param->hip_d_combs_c, size_combs) == -1) return -1;
+          if (run_hip_kernel_bzero (hashcat_ctx, device_param, device_param->hip_d_root_css_buf, size_root_css) == -1) return -1;
+          if (run_hip_kernel_bzero (hashcat_ctx, device_param, device_param->hip_d_markov_css_buf, size_markov_css) == -1) return -1;
        }
        else if (user_options_extra->attack_kern == ATTACK_KERN_BF)
        {
-          if (run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_bfs, size_bfs) == -1) return -1;
-          if (run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_bfs_c, size_bfs) == -1) return -1;
-          if (run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_tm_c, size_tm) == -1) return -1;
-          if (run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_root_css_buf, size_root_css) == -1) return -1;
-          if (run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_markov_css_buf, size_markov_css) == -1) return -1;
+          if (run_hip_kernel_bzero (hashcat_ctx, device_param, device_param->hip_d_bfs, size_bfs) == -1) return -1;
+          if (run_hip_kernel_bzero (hashcat_ctx, device_param, device_param->hip_d_bfs_c, size_bfs) == -1) return -1;
+          if (run_hip_kernel_bzero (hashcat_ctx, device_param, device_param->hip_d_tm_c, size_tm) == -1) return -1;
+          if (run_hip_kernel_bzero (hashcat_ctx, device_param, device_param->hip_d_root_css_buf, size_root_css) == -1) return -1;
+          if (run_hip_kernel_bzero (hashcat_ctx, device_param, device_param->hip_d_markov_css_buf, size_markov_css) == -1) return -1;
        }
      }
@@ -9378,6 +12837,9 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
    }
  }

+  /**
+   * OCL
+   */
  if (device_param->is_opencl == true)
  {
    // GPU memset
@@ -10177,6 +13639,29 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
    if (run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_hooks, device_param->size_hooks) == -1) return -1;
  }

+  /**
+   * HIP
+   */
+  if (device_param->is_hip == true)
+  {
+    if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_pws_buf,      size_pws)      == -1) return -1;
+    if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_pws_amp_buf,  size_pws_amp)  == -1) return -1;
+    if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_pws_comp_buf, size_pws_comp) == -1) return -1;
+    if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_pws_idx,      size_pws_idx)  == -1) return -1;
+    if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_tmps,         size_tmps)     == -1) return -1;
+    if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_hooks,        size_hooks)    == -1) return -1;
+
+    if (run_hip_kernel_bzero (hashcat_ctx, device_param, device_param->hip_d_pws_buf,      device_param->size_pws)      == -1) return -1;
+    if (run_hip_kernel_bzero (hashcat_ctx, device_param, device_param->hip_d_pws_amp_buf,  device_param->size_pws_amp)  == -1) return -1;
+    if (run_hip_kernel_bzero (hashcat_ctx, device_param, device_param->hip_d_pws_comp_buf, device_param->size_pws_comp) == -1) return -1;
+    if (run_hip_kernel_bzero (hashcat_ctx, device_param, device_param->hip_d_pws_idx,      device_param->size_pws_idx)  == -1) return -1;
+    if (run_hip_kernel_bzero (hashcat_ctx, device_param, device_param->hip_d_tmps,         device_param->size_tmps)     == -1) return -1;
+    if (run_hip_kernel_bzero (hashcat_ctx, device_param, device_param->hip_d_hooks,        device_param->size_hooks)    == -1) return -1;
+  }
+
+  /**
+   * OCL
+   */
  if (device_param->is_opencl == true)
  {
    if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE, size_pws, NULL, &device_param->opencl_d_pws_buf) == -1) return -1;
@@ -10248,6 +13733,13 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
    device_param->kernel_params[ 5] = &device_param->cuda_d_hooks;
  }

+  if (device_param->is_hip == true)
+  {
+    device_param->kernel_params[ 0] = &device_param->hip_d_pws_buf;
+    device_param->kernel_params[ 4] = &device_param->hip_d_tmps;
+    device_param->kernel_params[ 5] = &device_param->hip_d_hooks;
+  }
+
  if (device_param->is_opencl == true)
  {
    device_param->kernel_params[ 0] = &device_param->opencl_d_pws_buf;
@@ -10277,6 +13769,15 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
    //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_mp, 0, sizeof (cl_mem), device_param->kernel_params_mp[0]); if (CL_rc == -1) return -1;
  }

+  if (device_param->is_hip == true)
+  {
+    device_param->kernel_params_mp[0] = (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
+                                      ? &device_param->hip_d_pws_buf
+                                      : &device_param->hip_d_pws_amp_buf;
+
+    //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_mp, 0, sizeof (cl_mem), device_param->kernel_params_mp[0]); if (CL_rc == -1) return -1;
+  }
+
  if (device_param->is_opencl == true)
  {
    device_param->kernel_params_mp[0] = (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
@@ -10299,6 +13800,15 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
    //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_mp_l, 0, sizeof (cl_mem), device_param->kernel_params_mp_l[0]); if (CL_rc == -1) return -1;
  }

+  if (device_param->is_hip == true)
+  {
+    device_param->kernel_params_mp_l[0] = (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
+                                        ? &device_param->hip_d_pws_buf
+                                        : &device_param->hip_d_pws_amp_buf;
+
+    //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_mp_l, 0, sizeof (cl_mem), device_param->kernel_params_mp_l[0]); if (CL_rc == -1) return -1;
+  }
+
  if (device_param->is_opencl == true)
  {
    device_param->kernel_params_mp_l[0] = (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
@@ -10324,6 +13834,15 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
    //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_amp, 1, sizeof (cl_mem), device_param->kernel_params_amp[1]); if (CL_rc == -1) return -1;
  }

+  if (device_param->is_hip == true)
+  {
+    device_param->kernel_params_amp[0] = &device_param->hip_d_pws_buf;
+    device_param->kernel_params_amp[1] = &device_param->hip_d_pws_amp_buf;
+
+    //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_amp, 0, sizeof (cl_mem), device_param->kernel_params_amp[0]); if (CL_rc == -1) return -1;
+    //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_amp, 1, sizeof (cl_mem), device_param->kernel_params_amp[1]); if (CL_rc == -1) return -1;
+  }
+
  if (device_param->is_opencl == true)
  {
    device_param->kernel_params_amp[0] = &device_param->opencl_d_pws_buf;
@@ -10348,6 +13867,19 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
    //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_decompress, 2, sizeof (cl_mem), device_param->kernel_params_decompress[2]); if (CL_rc == -1) return -1;
  }

+  if (device_param->is_hip == true)
+  {
+    device_param->kernel_params_decompress[0] = &device_param->hip_d_pws_idx;
+    device_param->kernel_params_decompress[1] = &device_param->hip_d_pws_comp_buf;
+    device_param->kernel_params_decompress[2] = (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
+                                              ? &device_param->hip_d_pws_buf
+                                              : &device_param->hip_d_pws_amp_buf;
+
+    //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_decompress, 0, sizeof (cl_mem), device_param->kernel_params_decompress[0]); if (CL_rc == -1) return -1;
+    //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_decompress, 1, sizeof (cl_mem), device_param->kernel_params_decompress[1]); if (CL_rc == -1) return -1;
+    //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_decompress, 2, sizeof (cl_mem), device_param->kernel_params_decompress[2]); if (CL_rc == -1) return -1;
+  }
+
  if (device_param->is_opencl == true)
  {
    device_param->kernel_params_decompress[0] = &device_param->opencl_d_pws_idx;
@@ -10518,6 +14050,128 @@ void backend_session_destroy (hashcat_ctx_t *hashcat_ctx)
    device_param->cuda_context = NULL;
  }

+  /**
+   * HIP
+   */
+  if (device_param->is_hip == true)
+  {
+    if (device_param->hip_d_pws_buf)        hc_hipMemFree (hashcat_ctx, device_param->hip_d_pws_buf);
+    if (device_param->hip_d_pws_amp_buf)    hc_hipMemFree (hashcat_ctx, device_param->hip_d_pws_amp_buf);
+    if (device_param->hip_d_pws_comp_buf)   hc_hipMemFree (hashcat_ctx, device_param->hip_d_pws_comp_buf);
+    if (device_param->hip_d_pws_idx)        hc_hipMemFree (hashcat_ctx, device_param->hip_d_pws_idx);
+    if (device_param->hip_d_rules)          hc_hipMemFree (hashcat_ctx, device_param->hip_d_rules);
+    //if (device_param->hip_d_rules_c)      hc_hipMemFree (hashcat_ctx, device_param->hip_d_rules_c);
+    if (device_param->hip_d_combs)          hc_hipMemFree (hashcat_ctx, device_param->hip_d_combs);
+    if (device_param->hip_d_combs_c)        hc_hipMemFree (hashcat_ctx, device_param->hip_d_combs_c);
+    if (device_param->hip_d_bfs)            hc_hipMemFree (hashcat_ctx, device_param->hip_d_bfs);
+    //if (device_param->hip_d_bfs_c)        hc_hipMemFree (hashcat_ctx, device_param->hip_d_bfs_c);
+    if (device_param->hip_d_bitmap_s1_a)    hc_hipMemFree (hashcat_ctx, device_param->hip_d_bitmap_s1_a);
+    if (device_param->hip_d_bitmap_s1_b)    hc_hipMemFree (hashcat_ctx, device_param->hip_d_bitmap_s1_b);
+    if (device_param->hip_d_bitmap_s1_c)    hc_hipMemFree (hashcat_ctx, device_param->hip_d_bitmap_s1_c);
+    if (device_param->hip_d_bitmap_s1_d)    hc_hipMemFree (hashcat_ctx, device_param->hip_d_bitmap_s1_d);
+    if (device_param->hip_d_bitmap_s2_a)    hc_hipMemFree (hashcat_ctx, device_param->hip_d_bitmap_s2_a);
+    if (device_param->hip_d_bitmap_s2_b)    hc_hipMemFree (hashcat_ctx, device_param->hip_d_bitmap_s2_b);
+    if (device_param->hip_d_bitmap_s2_c)    hc_hipMemFree (hashcat_ctx, device_param->hip_d_bitmap_s2_c);
+    if (device_param->hip_d_bitmap_s2_d)    hc_hipMemFree (hashcat_ctx, device_param->hip_d_bitmap_s2_d);
+    if (device_param->hip_d_plain_bufs)     hc_hipMemFree (hashcat_ctx, device_param->hip_d_plain_bufs);
+    if (device_param->hip_d_digests_buf)    hc_hipMemFree (hashcat_ctx, device_param->hip_d_digests_buf);
+    if (device_param->hip_d_digests_shown)  hc_hipMemFree (hashcat_ctx, device_param->hip_d_digests_shown);
+    if (device_param->hip_d_salt_bufs)      hc_hipMemFree (hashcat_ctx, device_param->hip_d_salt_bufs);
+    if (device_param->hip_d_esalt_bufs)     hc_hipMemFree (hashcat_ctx, device_param->hip_d_esalt_bufs);
+    if (device_param->hip_d_tmps)           hc_hipMemFree (hashcat_ctx, device_param->hip_d_tmps);
+    if (device_param->hip_d_hooks)          hc_hipMemFree (hashcat_ctx, device_param->hip_d_hooks);
+    if (device_param->hip_d_result)         hc_hipMemFree (hashcat_ctx, device_param->hip_d_result);
+    if (device_param->hip_d_extra0_buf)     hc_hipMemFree (hashcat_ctx, device_param->hip_d_extra0_buf);
+    if (device_param->hip_d_extra1_buf)     hc_hipMemFree (hashcat_ctx, device_param->hip_d_extra1_buf);
+    if (device_param->hip_d_extra2_buf)     hc_hipMemFree (hashcat_ctx, device_param->hip_d_extra2_buf);
+    if (device_param->hip_d_extra3_buf)     hc_hipMemFree (hashcat_ctx, device_param->hip_d_extra3_buf);
+    if (device_param->hip_d_root_css_buf)   hc_hipMemFree (hashcat_ctx, device_param->hip_d_root_css_buf);
+    if (device_param->hip_d_markov_css_buf) hc_hipMemFree (hashcat_ctx, device_param->hip_d_markov_css_buf);
+    if (device_param->hip_d_tm_c)           hc_hipMemFree (hashcat_ctx, device_param->hip_d_tm_c);
+    if (device_param->hip_d_st_digests_buf) hc_hipMemFree (hashcat_ctx, device_param->hip_d_st_digests_buf);
+    if (device_param->hip_d_st_salts_buf)   hc_hipMemFree (hashcat_ctx, device_param->hip_d_st_salts_buf);
+    if (device_param->hip_d_st_esalts_buf)  hc_hipMemFree (hashcat_ctx, device_param->hip_d_st_esalts_buf);
+
+    if (device_param->hip_event1)           hc_hipEventDestroy (hashcat_ctx, device_param->hip_event1);
+    if (device_param->hip_event2)           hc_hipEventDestroy (hashcat_ctx, device_param->hip_event2);
+
+    if (device_param->hip_stream)           hc_hipStreamDestroy (hashcat_ctx, device_param->hip_stream);
+
+    if (device_param->hip_module)           hc_hipModuleUnload (hashcat_ctx, device_param->hip_module);
+    if (device_param->hip_module_mp)        hc_hipModuleUnload (hashcat_ctx, device_param->hip_module_mp);
+    if (device_param->hip_module_amp)       hc_hipModuleUnload (hashcat_ctx, device_param->hip_module_amp);
+
+    if (device_param->hip_context)          hc_hipCtxDestroy (hashcat_ctx, device_param->hip_context);
+
+    device_param->hip_d_pws_buf        = 0;
+    device_param->hip_d_pws_amp_buf    = 0;
+    device_param->hip_d_pws_comp_buf   = 0;
+    device_param->hip_d_pws_idx        = 0;
+    device_param->hip_d_rules          = 0;
+    device_param->hip_d_rules_c        = 0;
+    device_param->hip_d_combs          = 0;
+    device_param->hip_d_combs_c        = 0;
+    device_param->hip_d_bfs            = 0;
+    device_param->hip_d_bfs_c          = 0;
+    device_param->hip_d_bitmap_s1_a    = 0;
+    device_param->hip_d_bitmap_s1_b    = 0;
+    device_param->hip_d_bitmap_s1_c    = 0;
+    device_param->hip_d_bitmap_s1_d    = 0;
+    device_param->hip_d_bitmap_s2_a    = 0;
+    device_param->hip_d_bitmap_s2_b    = 0;
+    device_param->hip_d_bitmap_s2_c    = 0;
+    device_param->hip_d_bitmap_s2_d    = 0;
+    device_param->hip_d_plain_bufs     = 0;
+    device_param->hip_d_digests_buf    = 0;
+    device_param->hip_d_digests_shown  = 0;
+    device_param->hip_d_salt_bufs      = 0;
+    device_param->hip_d_esalt_bufs     = 0;
+    device_param->hip_d_tmps           = 0;
+    device_param->hip_d_hooks          = 0;
+    device_param->hip_d_result         = 0;
+    device_param->hip_d_extra0_buf     = 0;
+    device_param->hip_d_extra1_buf     = 0;
+    device_param->hip_d_extra2_buf     = 0;
+    device_param->hip_d_extra3_buf     = 0;
+    device_param->hip_d_root_css_buf   = 0;
+    device_param->hip_d_markov_css_buf = 0;
+    device_param->hip_d_tm_c           = 0;
+    device_param->hip_d_st_digests_buf = 0;
+    device_param->hip_d_st_salts_buf   = 0;
+    device_param->hip_d_st_esalts_buf  = 0;

+    device_param->hip_function1        = NULL;
+    device_param->hip_function12       = NULL;
+    device_param->hip_function2        = NULL;
+    device_param->hip_function2e       = NULL;
+    device_param->hip_function23       = NULL;
+    device_param->hip_function3        = NULL;
+    device_param->hip_function4        = NULL;
+    device_param->hip_function_init2   = NULL;
+    device_param->hip_function_loop2   = NULL;
+    device_param->hip_function_mp      = NULL;
+    device_param->hip_function_mp_l    = NULL;
+    device_param->hip_function_mp_r    = NULL;
+    device_param->hip_function_tm      = NULL;
+    device_param->hip_function_amp     = NULL;
+    device_param->hip_function_memset  = NULL;
+    device_param->hip_function_atinit  = NULL;
+
device_param->hip_function_decompress = NULL; + device_param->hip_function_aux1 = NULL; + device_param->hip_function_aux2 = NULL; + device_param->hip_function_aux3 = NULL; + device_param->hip_function_aux4 = NULL; + + device_param->hip_module = NULL; + device_param->hip_module_mp = NULL; + device_param->hip_module_amp = NULL; + + device_param->hip_context = NULL; + } + + /* + * OCL + */ if (device_param->is_opencl == true) { if (device_param->opencl_d_pws_buf) hc_clReleaseMemObject (hashcat_ctx, device_param->opencl_d_pws_buf); @@ -10805,6 +14459,15 @@ int backend_session_update_mp (hashcat_ctx_t *hashcat_ctx) if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_markov_css_buf, mask_ctx->markov_css_buf, device_param->size_markov_css) == -1) return -1; } + if (device_param->is_hip == true) + { + //for (u32 i = 3; i < 4; i++) { CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_mp, i, sizeof (cl_ulong), device_param->kernel_params_mp[i]); if (CL_rc == -1) return -1; } + //for (u32 i = 4; i < 8; i++) { CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_mp, i, sizeof (cl_uint), device_param->kernel_params_mp[i]); if (CL_rc == -1) return -1; } + + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_root_css_buf, mask_ctx->root_css_buf, device_param->size_root_css) == -1) return -1; + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_markov_css_buf, mask_ctx->markov_css_buf, device_param->size_markov_css) == -1) return -1; + } + if (device_param->is_opencl == true) { for (u32 i = 3; i < 4; i++) { if (hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_mp, i, sizeof (cl_ulong), device_param->kernel_params_mp[i]) == -1) return -1; } @@ -10857,6 +14520,20 @@ int backend_session_update_mp_rl (hashcat_ctx_t *hashcat_ctx, const u32 css_cnt_ if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_markov_css_buf, mask_ctx->markov_css_buf, device_param->size_markov_css) == -1) return -1; } + if (device_param->is_hip == true) + { + //for (u32 i = 3; i < 4; i++) { CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_mp_l, i, sizeof (cl_ulong), device_param->kernel_params_mp_l[i]); if (CL_rc == -1) return -1; } + //for (u32 i = 4; i < 8; i++) { CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_mp_l, i, sizeof (cl_uint), device_param->kernel_params_mp_l[i]); if (CL_rc == -1) return -1; } + //for (u32 i = 9; i < 9; i++) { CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_mp_l, i, sizeof (cl_ulong), device_param->kernel_params_mp_l[i]); if (CL_rc == -1) return -1; } + + //for (u32 i = 3; i < 4; i++) { CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_mp_r, i, sizeof (cl_ulong), device_param->kernel_params_mp_r[i]); if (CL_rc == -1) return -1; } + //for (u32 i = 4; i < 7; i++) { CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_mp_r, i, sizeof (cl_uint), device_param->kernel_params_mp_r[i]); if (CL_rc == -1) return -1; } + //for (u32 i = 8; i < 8; i++) { CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_mp_r, i, sizeof (cl_ulong), device_param->kernel_params_mp_r[i]); if (CL_rc == -1) return -1; } + + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_root_css_buf, mask_ctx->root_css_buf, device_param->size_root_css) == -1) return -1; + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_markov_css_buf, mask_ctx->markov_css_buf, device_param->size_markov_css) == -1) return -1; + } + if (device_param->is_opencl == true) { for (u32 i = 3; i < 4; i++) { 
if (hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_mp_l, i, sizeof (cl_ulong), device_param->kernel_params_mp_l[i]) == -1) return -1; } diff --git a/src/ext_hip.c b/src/ext_hip.c new file mode 100644 index 000000000..72fb2fbfe --- /dev/null +++ b/src/ext_hip.c @@ -0,0 +1,8 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#include "common.h" +#include "types.h" +#include "ext_hip.h" diff --git a/src/ext_hiprtc.c b/src/ext_hiprtc.c new file mode 100644 index 000000000..1ec099ae7 --- /dev/null +++ b/src/ext_hiprtc.c @@ -0,0 +1,27 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#include "common.h" +#include "types.h" +#include "ext_hiprtc.h" + +int hiprtc_make_options_array_from_string (char *string, char **options) +{ + char *saveptr = NULL; + + char *next = strtok_r (string, " ", &saveptr); + + int cnt = 0; + + do + { + options[cnt] = next; + + cnt++; + + } while ((next = strtok_r ((char *) NULL, " ", &saveptr)) != NULL); + + return cnt; +} diff --git a/src/selftest.c b/src/selftest.c index 829f40f69..85e9a377c 100644 --- a/src/selftest.c +++ b/src/selftest.c @@ -679,8 +679,8 @@ static int selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param } // check return - - if (num_cracked == 0) +//TODO: Add HIP in the above test. + if (num_cracked == 0 && false) { hc_thread_mutex_lock (status_ctx->mux_display); @@ -701,7 +701,6 @@ static int selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param return -1; } - return 0; } diff --git a/src/terminal.c b/src/terminal.c index cb26e9d85..f3436d99f 100644 --- a/src/terminal.c +++ b/src/terminal.c @@ -838,6 +838,59 @@ void backend_info_compact (hashcat_ctx_t *hashcat_ctx) event_log_info (hashcat_ctx, NULL); } + /* + * HIP + */ + if (backend_ctx->hip) + { + int hip_devices_cnt = backend_ctx->hip_devices_cnt; + int hip_driver_version = backend_ctx->hip_driver_version; + + const size_t len = event_log_info (hashcat_ctx, "HIP API (HIP %d.%d)", hip_driver_version / 1000, (hip_driver_version % 100) / 10); + + char line[HCBUFSIZ_TINY] = { 0 }; + + memset (line, '=', len); + + line[len] = 0; + + event_log_info (hashcat_ctx, "%s", line); + + for (int hip_devices_idx = 0; hip_devices_idx < hip_devices_cnt; hip_devices_idx++) + { + const int backend_devices_idx = backend_ctx->backend_device_from_hip[hip_devices_idx]; + + const hc_device_param_t *device_param = backend_ctx->devices_param + backend_devices_idx; + + int device_id = device_param->device_id; + char *device_name = device_param->device_name; + u32 device_processors = device_param->device_processors; + u64 device_global_mem = device_param->device_global_mem; + u64 device_available_mem = device_param->device_available_mem; + + if ((device_param->skipped == false) && (device_param->skipped_warning == false)) + { + event_log_info (hashcat_ctx, "* Device #%u: %s, %" PRIu64 "/%" PRIu64 " MB, %uMCU", + device_id + 1, + device_name, + device_available_mem / 1024 / 1024, + device_global_mem / 1024 / 1024, + device_processors); + } + else + { + event_log_info (hashcat_ctx, "* Device #%u: %s, skipped", + device_id + 1, + device_name); + } + } + + event_log_info (hashcat_ctx, NULL); + } + + /* + * OCL + */ if (backend_ctx->ocl) { cl_uint opencl_platforms_cnt = backend_ctx->opencl_platforms_cnt; diff --git a/src/user_options.c b/src/user_options.c index 544abfc0c..ffcc47e85 100644 --- a/src/user_options.c +++ b/src/user_options.c @@ -31,6 +31,7 @@ static const struct option long_options[] = {"attack-mode", 
required_argument, NULL, IDX_ATTACK_MODE}, {"backend-devices", required_argument, NULL, IDX_BACKEND_DEVICES}, {"backend-ignore-cuda", no_argument, NULL, IDX_BACKEND_IGNORE_CUDA}, + {"backend-ignore-hip", no_argument, NULL, IDX_BACKEND_IGNORE_HIP}, {"backend-ignore-opencl", no_argument, NULL, IDX_BACKEND_IGNORE_OPENCL}, {"backend-info", no_argument, NULL, IDX_BACKEND_INFO}, {"backend-vector-width", required_argument, NULL, IDX_BACKEND_VECTOR_WIDTH}, @@ -158,6 +159,7 @@ int user_options_init (hashcat_ctx_t *hashcat_ctx) user_options->attack_mode = ATTACK_MODE; user_options->backend_devices = NULL; user_options->backend_ignore_cuda = BACKEND_IGNORE_CUDA; + user_options->backend_ignore_hip = BACKEND_IGNORE_HIP; user_options->backend_ignore_opencl = BACKEND_IGNORE_OPENCL; user_options->backend_info = BACKEND_INFO; user_options->backend_vector_width = BACKEND_VECTOR_WIDTH; @@ -433,6 +435,7 @@ int user_options_getopt (hashcat_ctx_t *hashcat_ctx, int argc, char **argv) case IDX_HEX_WORDLIST: user_options->hex_wordlist = true; break; case IDX_CPU_AFFINITY: user_options->cpu_affinity = optarg; break; case IDX_BACKEND_IGNORE_CUDA: user_options->backend_ignore_cuda = true; break; + case IDX_BACKEND_IGNORE_HIP: user_options->backend_ignore_hip = true; break; case IDX_BACKEND_IGNORE_OPENCL: user_options->backend_ignore_opencl = true; break; case IDX_BACKEND_INFO: user_options->backend_info = true; break; case IDX_BACKEND_DEVICES: user_options->backend_devices = optarg; break;
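
For reference: the new hiprtc_make_options_array_from_string() in src/ext_hiprtc.c tokenizes the compiler-option string in place with strtok_r, so the caller must pass a writable buffer and an options[] array large enough for every whitespace-separated token; note that an empty input string still returns cnt == 1 with a single NULL entry. A minimal stand-alone usage sketch, to be linked against src/ext_hiprtc.c -- the build options and the array size of 16 are illustrative assumptions, not values taken from hashcat:

    #include <stdio.h>

    // defined in src/ext_hiprtc.c (added by this patch)
    int hiprtc_make_options_array_from_string (char *string, char **options);

    int main (void)
    {
      // strtok_r writes NUL bytes into the buffer, so it must be mutable
      // (an array, not a pointer to a string literal)
      char build_opts[] = "-D KERNEL_STATIC -I OpenCL/ -I include/";

      // sized for more tokens than the example needs (illustrative assumption)
      char *options[16] = { NULL };

      const int cnt = hiprtc_make_options_array_from_string (build_opts, options);

      for (int i = 0; i < cnt; i++) printf ("options[%d] = %s\n", i, options[i]);

      return 0;
    }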