diff --git a/docs/changes.txt b/docs/changes.txt
index de2d26bcd..c2af93956 100644
--- a/docs/changes.txt
+++ b/docs/changes.txt
@@ -129,6 +129,7 @@
 - Alias Devices: Prevents hashcat, when started with x86_64 emulation on Apple Silicon, from showing the Apple M1 OpenCL CPU as an alias for the Apple M1 Metal GPU
 - Apple Driver: Automatically enable GPU support on Apple OpenCL instead of CPU support
 - Apple Driver: Updated requirements to use Apple OpenCL API to macOS 13.0 - use
+- Backend: Added workaround to get rid of memory leaks in the internal runtimes
 - Backend: Updated filename chksum format to prevent invalid cache on Apple Silicon when switching arch
 - Backend: Splitting backend_ctx_devices_init into smaller runtime-specific functions
 - Backend Checks: Describe workaround in error message when detecting more than 64 backend devices
@@ -141,6 +142,7 @@
 - Building: Support building windows binaries on macOS using MinGW
 - Dependencies: Updated OpenCL-Headers to v2024.10.24 (commit 265df85)
 - Documents: Updated BUILD.md and added BUILD_macOS.md (containing instructions for building windows binaries on macOS)
+- HIP Backend: Avoid deprecated functions
 - Modules: Added support for non-zero IVs for -m 6800 (Lastpass). Also added `tools/lastpass2hashcat.py`
 - Modules: Updated module_unstable_warning
 - Open Document Format: Added support for small documents with content length < 1024
diff --git a/include/ext_hip.h b/include/ext_hip.h
index d0f53d173..3c0b8433a 100644
--- a/include/ext_hip.h
+++ b/include/ext_hip.h
@@ -12,20 +12,22 @@

 // start: driver_types.h

-typedef void* hipDeviceptr_t;
+typedef void *hipDeviceptr_t;
+
+typedef enum hipFunction_attribute
+{
+  HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,             // The maximum number of threads per block. Depends on function and device.
+  HIP_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES,                 // The statically allocated shared memory size in bytes per block required by the function.
+  HIP_FUNC_ATTRIBUTE_CONST_SIZE_BYTES,                  // The user-allocated constant memory by the function in bytes.
+  HIP_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES,                  // The local memory usage of each thread by this function in bytes.
+  HIP_FUNC_ATTRIBUTE_NUM_REGS,                          // The number of registers used by each thread of this function.
+  HIP_FUNC_ATTRIBUTE_PTX_VERSION,                       // PTX version
+  HIP_FUNC_ATTRIBUTE_BINARY_VERSION,                    // Binary version
+  HIP_FUNC_ATTRIBUTE_CACHE_MODE_CA,                     // Cache mode
+  HIP_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,     // The maximum dynamic shared memory per block for this function in bytes.
+  HIP_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT,  // The shared memory carveout preference in percent of the maximum shared memory.
+  HIP_FUNC_ATTRIBUTE_MAX
-typedef enum hipFunction_attribute {
-  HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,             ///< The maximum number of threads per block. Depends on function and device.
-  HIP_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES,                 ///< The statically allocated shared memory size in bytes per block required by the function.
-  HIP_FUNC_ATTRIBUTE_CONST_SIZE_BYTES,                  ///< The user-allocated constant memory by the function in bytes.
-  HIP_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES,                  ///< The local memory usage of each thread by this function in bytes.
-  HIP_FUNC_ATTRIBUTE_NUM_REGS,                          ///< The number of registers used by each thread of this function.
- HIP_FUNC_ATTRIBUTE_PTX_VERSION, ///< PTX version - HIP_FUNC_ATTRIBUTE_BINARY_VERSION, ///< Binary version - HIP_FUNC_ATTRIBUTE_CACHE_MODE_CA, ///< Cache mode - HIP_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, ///< The maximum dynamic shared memory per block for this function in bytes. - HIP_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT, ///< The shared memory carveout preference in percent of the maximum shared memory. - HIP_FUNC_ATTRIBUTE_MAX } hipFunction_attribute; // stop: driver_types.h @@ -47,120 +49,102 @@ typedef struct ihipModuleSymbol_t* hipFunction_t; #define __HIP_NODISCARD #endif -typedef enum __HIP_NODISCARD hipError_t { - hipSuccess = 0, ///< Successful completion. - hipErrorInvalidValue = 1, ///< One or more of the parameters passed to the API call is NULL - ///< or not in an acceptable range. - hipErrorOutOfMemory = 2, - // Deprecated - hipErrorMemoryAllocation = 2, ///< Memory allocation error. - hipErrorNotInitialized = 3, - // Deprecated - hipErrorInitializationError = 3, - hipErrorDeinitialized = 4, - hipErrorProfilerDisabled = 5, - hipErrorProfilerNotInitialized = 6, - hipErrorProfilerAlreadyStarted = 7, - hipErrorProfilerAlreadyStopped = 8, - hipErrorInvalidConfiguration = 9, - hipErrorInvalidPitchValue = 12, - hipErrorInvalidSymbol = 13, - hipErrorInvalidDevicePointer = 17, ///< Invalid Device Pointer - hipErrorInvalidMemcpyDirection = 21, ///< Invalid memory copy direction - hipErrorInsufficientDriver = 35, - hipErrorMissingConfiguration = 52, - hipErrorPriorLaunchFailure = 53, - hipErrorInvalidDeviceFunction = 98, - hipErrorNoDevice = 100, ///< Call to hipGetDeviceCount returned 0 devices - hipErrorInvalidDevice = 101, ///< DeviceID must be in range 0...#compute-devices. - hipErrorInvalidImage = 200, - hipErrorInvalidContext = 201, ///< Produced when input context is invalid. - hipErrorContextAlreadyCurrent = 202, - hipErrorMapFailed = 205, - // Deprecated - hipErrorMapBufferObjectFailed = 205, ///< Produced when the IPC memory attach failed from ROCr. - hipErrorUnmapFailed = 206, - hipErrorArrayIsMapped = 207, - hipErrorAlreadyMapped = 208, - hipErrorNoBinaryForGpu = 209, - hipErrorAlreadyAcquired = 210, - hipErrorNotMapped = 211, - hipErrorNotMappedAsArray = 212, - hipErrorNotMappedAsPointer = 213, - hipErrorECCNotCorrectable = 214, - hipErrorUnsupportedLimit = 215, - hipErrorContextAlreadyInUse = 216, - hipErrorPeerAccessUnsupported = 217, - hipErrorInvalidKernelFile = 218, ///< In CUDA DRV, it is CUDA_ERROR_INVALID_PTX - hipErrorInvalidGraphicsContext = 219, - hipErrorInvalidSource = 300, - hipErrorFileNotFound = 301, - hipErrorSharedObjectSymbolNotFound = 302, - hipErrorSharedObjectInitFailed = 303, - hipErrorOperatingSystem = 304, - hipErrorInvalidHandle = 400, - // Deprecated - hipErrorInvalidResourceHandle = 400, ///< Resource handle (hipEvent_t or hipStream_t) invalid. - hipErrorIllegalState = 401, ///< Resource required is not in a valid state to perform operation. - hipErrorNotFound = 500, - hipErrorNotReady = 600, ///< Indicates that asynchronous operations enqueued earlier are not - ///< ready. This is not actually an error, but is used to distinguish - ///< from hipSuccess (which indicates completion). APIs that return - ///< this error include hipEventQuery and hipStreamQuery. - hipErrorIllegalAddress = 700, - hipErrorLaunchOutOfResources = 701, ///< Out of resources error. - hipErrorLaunchTimeOut = 702, - hipErrorPeerAccessAlreadyEnabled = - 704, ///< Peer access was already enabled from the current device. 
- hipErrorPeerAccessNotEnabled = - 705, ///< Peer access was never enabled from the current device. - hipErrorSetOnActiveProcess = 708, - hipErrorContextIsDestroyed = 709, - hipErrorAssert = 710, ///< Produced when the kernel calls assert. - hipErrorHostMemoryAlreadyRegistered = - 712, ///< Produced when trying to lock a page-locked memory. - hipErrorHostMemoryNotRegistered = - 713, ///< Produced when trying to unlock a non-page-locked memory. - hipErrorLaunchFailure = - 719, ///< An exception occurred on the device while executing a kernel. - hipErrorCooperativeLaunchTooLarge = - 720, ///< This error indicates that the number of blocks launched per grid for a kernel - ///< that was launched via cooperative launch APIs exceeds the maximum number of - ///< allowed blocks for the current device - hipErrorNotSupported = 801, ///< Produced when the hip API is not supported/implemented - hipErrorStreamCaptureUnsupported = 900, ///< The operation is not permitted when the stream - ///< is capturing. - hipErrorStreamCaptureInvalidated = 901, ///< The current capture sequence on the stream - ///< has been invalidated due to a previous error. - hipErrorStreamCaptureMerge = 902, ///< The operation would have resulted in a merge of - ///< two independent capture sequences. - hipErrorStreamCaptureUnmatched = 903, ///< The capture was not initiated in this stream. - hipErrorStreamCaptureUnjoined = 904, ///< The capture sequence contains a fork that was not - ///< joined to the primary stream. - hipErrorStreamCaptureIsolation = 905, ///< A dependency would have been created which crosses - ///< the capture sequence boundary. Only implicit - ///< in-stream ordering dependencies are allowed - ///< to cross the boundary - hipErrorStreamCaptureImplicit = 906, ///< The operation would have resulted in a disallowed - ///< implicit dependency on a current capture sequence - ///< from hipStreamLegacy. - hipErrorCapturedEvent = 907, ///< The operation is not permitted on an event which was last - ///< recorded in a capturing stream. - hipErrorStreamCaptureWrongThread = 908, ///< A stream capture sequence not initiated with - ///< the hipStreamCaptureModeRelaxed argument to - ///< hipStreamBeginCapture was passed to - ///< hipStreamEndCapture in a different thread. - hipErrorGraphExecUpdateFailure = 910, ///< This error indicates that the graph update - ///< not performed because it included changes which - ///< violated constraints specific to instantiated graph - ///< update. - hipErrorUnknown = 999, //< Unknown error. - // HSA Runtime Error Codes start here. - hipErrorRuntimeMemory = 1052, ///< HSA runtime memory call returned error. Typically not seen - ///< in production systems. - hipErrorRuntimeOther = 1053, ///< HSA runtime call other than memory returned error. Typically - ///< not seen in production systems. - hipErrorTbd ///< Marker that more error codes are needed. +typedef enum __HIP_NODISCARD hipError_t +{ + hipSuccess = 0, // Successful completion. + hipErrorInvalidValue = 1, // One or more of the parameters passed to the API call is NULL + // or not in an acceptable range. + hipErrorOutOfMemory = 2, + // Deprecated + hipErrorMemoryAllocation = 2, // Memory allocation error. 
+ hipErrorNotInitialized = 3, + // Deprecated + hipErrorInitializationError = 3, + hipErrorDeinitialized = 4, + hipErrorProfilerDisabled = 5, + hipErrorProfilerNotInitialized = 6, + hipErrorProfilerAlreadyStarted = 7, + hipErrorProfilerAlreadyStopped = 8, + hipErrorInvalidConfiguration = 9, + hipErrorInvalidPitchValue = 12, + hipErrorInvalidSymbol = 13, + hipErrorInvalidDevicePointer = 17, // Invalid Device Pointer + hipErrorInvalidMemcpyDirection = 21, // Invalid memory copy direction + hipErrorInsufficientDriver = 35, + hipErrorMissingConfiguration = 52, + hipErrorPriorLaunchFailure = 53, + hipErrorInvalidDeviceFunction = 98, + hipErrorNoDevice = 100, // Call to hipGetDeviceCount returned 0 devices + hipErrorInvalidDevice = 101, // DeviceID must be in range 0...#compute-devices. + hipErrorInvalidImage = 200, + hipErrorInvalidContext = 201, // Produced when input context is invalid. + hipErrorContextAlreadyCurrent = 202, + hipErrorMapFailed = 205, + // Deprecated + hipErrorMapBufferObjectFailed = 205, // Produced when the IPC memory attach failed from ROCr. + hipErrorUnmapFailed = 206, + hipErrorArrayIsMapped = 207, + hipErrorAlreadyMapped = 208, + hipErrorNoBinaryForGpu = 209, + hipErrorAlreadyAcquired = 210, + hipErrorNotMapped = 211, + hipErrorNotMappedAsArray = 212, + hipErrorNotMappedAsPointer = 213, + hipErrorECCNotCorrectable = 214, + hipErrorUnsupportedLimit = 215, + hipErrorContextAlreadyInUse = 216, + hipErrorPeerAccessUnsupported = 217, + hipErrorInvalidKernelFile = 218, // In CUDA DRV, it is CUDA_ERROR_INVALID_PTX + hipErrorInvalidGraphicsContext = 219, + hipErrorInvalidSource = 300, + hipErrorFileNotFound = 301, + hipErrorSharedObjectSymbolNotFound = 302, + hipErrorSharedObjectInitFailed = 303, + hipErrorOperatingSystem = 304, + hipErrorInvalidHandle = 400, + // Deprecated + hipErrorInvalidResourceHandle = 400, // Resource handle (hipEvent_t or hipStream_t) invalid. + hipErrorIllegalState = 401, // Resource required is not in a valid state to perform operation. + hipErrorNotFound = 500, + hipErrorNotReady = 600, // Indicates that asynchronous operations enqueued earlier are not ready. + // This is not actually an error, but is used to distinguish from hipSuccess (which indicates completion). + // APIs that return this error include hipEventQuery and hipStreamQuery. + hipErrorIllegalAddress = 700, + hipErrorLaunchOutOfResources = 701, // Out of resources error. + hipErrorLaunchTimeOut = 702, + hipErrorPeerAccessAlreadyEnabled = 704, // Peer access was already enabled from the current device. + hipErrorPeerAccessNotEnabled = 705, // Peer access was never enabled from the current device. + hipErrorSetOnActiveProcess = 708, + hipErrorContextIsDestroyed = 709, + hipErrorAssert = 710, // Produced when the kernel calls assert. + hipErrorHostMemoryAlreadyRegistered = 712, // Produced when trying to lock a page-locked memory. + hipErrorHostMemoryNotRegistered = 713, // Produced when trying to unlock a non-page-locked memory. + hipErrorLaunchFailure = 719, // An exception occurred on the device while executing a kernel. + hipErrorCooperativeLaunchTooLarge = 720, // This error indicates that the number of blocks launched per grid for a kernel + // that was launched via cooperative launch APIs exceeds the maximum number of + // allowed blocks for the current device + hipErrorNotSupported = 801, // Produced when the hip API is not supported/implemented + hipErrorStreamCaptureUnsupported = 900, // The operation is not permitted when the stream is capturing. 
+ hipErrorStreamCaptureInvalidated = 901, // The current capture sequence on the stream + // has been invalidated due to a previous error. + hipErrorStreamCaptureMerge = 902, // The operation would have resulted in a merge of two independent capture sequences. + hipErrorStreamCaptureUnmatched = 903, // The capture was not initiated in this stream. + hipErrorStreamCaptureUnjoined = 904, // The capture sequence contains a fork that was not joined to the primary stream. + hipErrorStreamCaptureIsolation = 905, // A dependency would have been created which crosses the capture sequence boundary. + // Only implicit in-stream ordering dependencies are allowed to cross the boundary + hipErrorStreamCaptureImplicit = 906, // The operation would have resulted in a disallowed implicit dependency on a current + // capture sequence from hipStreamLegacy. + hipErrorCapturedEvent = 907, // The operation is not permitted on an event which was last recorded in a capturing stream. + hipErrorStreamCaptureWrongThread = 908, // A stream capture sequence not initiated with the hipStreamCaptureModeRelaxed argument to + // hipStreamBeginCapture was passed to hipStreamEndCapture in a different thread. + hipErrorGraphExecUpdateFailure = 910, // This error indicates that the graph update not performed because it included changes which + // violated constraints specific to instantiated graph update. + hipErrorUnknown = 999, // Unknown error. + // HSA Runtime Error Codes start here. + hipErrorRuntimeMemory = 1052, // HSA runtime memory call returned error. Typically not seen in production systems. + hipErrorRuntimeOther = 1053, // HSA runtime call other than memory returned error. Typically not seen in production systems. + hipErrorTbd // Marker that more error codes are needed. + } hipError_t; #undef __HIP_NODISCARD @@ -170,178 +154,178 @@ typedef enum __HIP_NODISCARD hipError_t { * hipDeviceAttribute_t * hipDeviceAttributeUnused number: 5 */ -typedef enum hipDeviceAttribute_t { - hipDeviceAttributeCudaCompatibleBegin = 0, +typedef enum hipDeviceAttribute_t +{ + hipDeviceAttributeCudaCompatibleBegin = 0, - hipDeviceAttributeEccEnabled = hipDeviceAttributeCudaCompatibleBegin, ///< Whether ECC support is enabled. - hipDeviceAttributeAccessPolicyMaxWindowSize, ///< Cuda only. The maximum size of the window policy in bytes. - hipDeviceAttributeAsyncEngineCount, ///< Asynchronous engines number. - hipDeviceAttributeCanMapHostMemory, ///< Whether host memory can be mapped into device address space - hipDeviceAttributeCanUseHostPointerForRegisteredMem,///< Device can access host registered memory - ///< at the same virtual address as the CPU - hipDeviceAttributeClockRate, ///< Peak clock frequency in kilohertz. - hipDeviceAttributeComputeMode, ///< Compute mode that device is currently in. - hipDeviceAttributeComputePreemptionSupported, ///< Device supports Compute Preemption. - hipDeviceAttributeConcurrentKernels, ///< Device can possibly execute multiple kernels concurrently. - hipDeviceAttributeConcurrentManagedAccess, ///< Device can coherently access managed memory concurrently with the CPU - hipDeviceAttributeCooperativeLaunch, ///< Support cooperative launch - hipDeviceAttributeCooperativeMultiDeviceLaunch, ///< Support cooperative launch on multiple devices - hipDeviceAttributeDeviceOverlap, ///< Device can concurrently copy memory and execute a kernel. - ///< Deprecated. Use instead asyncEngineCount. 
- hipDeviceAttributeDirectManagedMemAccessFromHost, ///< Host can directly access managed memory on - ///< the device without migration - hipDeviceAttributeGlobalL1CacheSupported, ///< Device supports caching globals in L1 - hipDeviceAttributeHostNativeAtomicSupported, ///< Link between the device and the host supports native atomic operations - hipDeviceAttributeIntegrated, ///< Device is integrated GPU - hipDeviceAttributeIsMultiGpuBoard, ///< Multiple GPU devices. - hipDeviceAttributeKernelExecTimeout, ///< Run time limit for kernels executed on the device - hipDeviceAttributeL2CacheSize, ///< Size of L2 cache in bytes. 0 if the device doesn't have L2 cache. - hipDeviceAttributeLocalL1CacheSupported, ///< caching locals in L1 is supported - hipDeviceAttributeLuid, ///< 8-byte locally unique identifier in 8 bytes. Undefined on TCC and non-Windows platforms - hipDeviceAttributeLuidDeviceNodeMask, ///< Luid device node mask. Undefined on TCC and non-Windows platforms - hipDeviceAttributeComputeCapabilityMajor, ///< Major compute capability version number. - hipDeviceAttributeManagedMemory, ///< Device supports allocating managed memory on this system - hipDeviceAttributeMaxBlocksPerMultiProcessor, ///< Max block size per multiprocessor - hipDeviceAttributeMaxBlockDimX, ///< Max block size in width. - hipDeviceAttributeMaxBlockDimY, ///< Max block size in height. - hipDeviceAttributeMaxBlockDimZ, ///< Max block size in depth. - hipDeviceAttributeMaxGridDimX, ///< Max grid size in width. - hipDeviceAttributeMaxGridDimY, ///< Max grid size in height. - hipDeviceAttributeMaxGridDimZ, ///< Max grid size in depth. - hipDeviceAttributeMaxSurface1D, ///< Maximum size of 1D surface. - hipDeviceAttributeMaxSurface1DLayered, ///< Cuda only. Maximum dimensions of 1D layered surface. - hipDeviceAttributeMaxSurface2D, ///< Maximum dimension (width, height) of 2D surface. - hipDeviceAttributeMaxSurface2DLayered, ///< Cuda only. Maximum dimensions of 2D layered surface. - hipDeviceAttributeMaxSurface3D, ///< Maximum dimension (width, height, depth) of 3D surface. - hipDeviceAttributeMaxSurfaceCubemap, ///< Cuda only. Maximum dimensions of Cubemap surface. - hipDeviceAttributeMaxSurfaceCubemapLayered, ///< Cuda only. Maximum dimension of Cubemap layered surface. - hipDeviceAttributeMaxTexture1DWidth, ///< Maximum size of 1D texture. - hipDeviceAttributeMaxTexture1DLayered, ///< Maximum dimensions of 1D layered texture. - hipDeviceAttributeMaxTexture1DLinear, ///< Maximum number of elements allocatable in a 1D linear texture. - ///< Use cudaDeviceGetTexture1DLinearMaxWidth() instead on Cuda. - hipDeviceAttributeMaxTexture1DMipmap, ///< Maximum size of 1D mipmapped texture. - hipDeviceAttributeMaxTexture2DWidth, ///< Maximum dimension width of 2D texture. - hipDeviceAttributeMaxTexture2DHeight, ///< Maximum dimension hight of 2D texture. - hipDeviceAttributeMaxTexture2DGather, ///< Maximum dimensions of 2D texture if gather operations performed. - hipDeviceAttributeMaxTexture2DLayered, ///< Maximum dimensions of 2D layered texture. - hipDeviceAttributeMaxTexture2DLinear, ///< Maximum dimensions (width, height, pitch) of 2D textures bound to pitched memory. - hipDeviceAttributeMaxTexture2DMipmap, ///< Maximum dimensions of 2D mipmapped texture. - hipDeviceAttributeMaxTexture3DWidth, ///< Maximum dimension width of 3D texture. - hipDeviceAttributeMaxTexture3DHeight, ///< Maximum dimension height of 3D texture. - hipDeviceAttributeMaxTexture3DDepth, ///< Maximum dimension depth of 3D texture. 
- hipDeviceAttributeMaxTexture3DAlt, ///< Maximum dimensions of alternate 3D texture. - hipDeviceAttributeMaxTextureCubemap, ///< Maximum dimensions of Cubemap texture - hipDeviceAttributeMaxTextureCubemapLayered, ///< Maximum dimensions of Cubemap layered texture. - hipDeviceAttributeMaxThreadsDim, ///< Maximum dimension of a block - hipDeviceAttributeMaxThreadsPerBlock, ///< Maximum number of threads per block. - hipDeviceAttributeMaxThreadsPerMultiProcessor, ///< Maximum resident threads per multiprocessor. - hipDeviceAttributeMaxPitch, ///< Maximum pitch in bytes allowed by memory copies - hipDeviceAttributeMemoryBusWidth, ///< Global memory bus width in bits. - hipDeviceAttributeMemoryClockRate, ///< Peak memory clock frequency in kilohertz. - hipDeviceAttributeComputeCapabilityMinor, ///< Minor compute capability version number. - hipDeviceAttributeMultiGpuBoardGroupID, ///< Unique ID of device group on the same multi-GPU board - hipDeviceAttributeMultiprocessorCount, ///< Number of multiprocessors on the device. - hipDeviceAttributeUnused1, ///< Previously hipDeviceAttributeName - hipDeviceAttributePageableMemoryAccess, ///< Device supports coherently accessing pageable memory - ///< without calling hipHostRegister on it - hipDeviceAttributePageableMemoryAccessUsesHostPageTables, ///< Device accesses pageable memory via the host's page tables - hipDeviceAttributePciBusId, ///< PCI Bus ID. - hipDeviceAttributePciDeviceId, ///< PCI Device ID. - hipDeviceAttributePciDomainID, ///< PCI Domain ID. - hipDeviceAttributePersistingL2CacheMaxSize, ///< Maximum l2 persisting lines capacity in bytes - hipDeviceAttributeMaxRegistersPerBlock, ///< 32-bit registers available to a thread block. This number is shared - ///< by all thread blocks simultaneously resident on a multiprocessor. - hipDeviceAttributeMaxRegistersPerMultiprocessor, ///< 32-bit registers available per block. - hipDeviceAttributeReservedSharedMemPerBlock, ///< Shared memory reserved by CUDA driver per block. - hipDeviceAttributeMaxSharedMemoryPerBlock, ///< Maximum shared memory available per block in bytes. - hipDeviceAttributeSharedMemPerBlockOptin, ///< Maximum shared memory per block usable by special opt in. - hipDeviceAttributeSharedMemPerMultiprocessor, ///< Shared memory available per multiprocessor. - hipDeviceAttributeSingleToDoublePrecisionPerfRatio, ///< Cuda only. Performance ratio of single precision to double precision. - hipDeviceAttributeStreamPrioritiesSupported, ///< Whether to support stream priorities. - hipDeviceAttributeSurfaceAlignment, ///< Alignment requirement for surfaces - hipDeviceAttributeTccDriver, ///< Cuda only. Whether device is a Tesla device using TCC driver - hipDeviceAttributeTextureAlignment, ///< Alignment requirement for textures - hipDeviceAttributeTexturePitchAlignment, ///< Pitch alignment requirement for 2D texture references bound to pitched memory; - hipDeviceAttributeTotalConstantMemory, ///< Constant memory size in bytes. - hipDeviceAttributeTotalGlobalMem, ///< Global memory available on devicice. - hipDeviceAttributeUnifiedAddressing, ///< Cuda only. An unified address space shared with the host. - hipDeviceAttributeUnused2, ///< Previously hipDeviceAttributeUuid - hipDeviceAttributeWarpSize, ///< Warp size in threads. 
- hipDeviceAttributeMemoryPoolsSupported, ///< Device supports HIP Stream Ordered Memory Allocator - hipDeviceAttributeVirtualMemoryManagementSupported, ///< Device supports HIP virtual memory management - hipDeviceAttributeHostRegisterSupported, ///< Can device support host memory registration via hipHostRegister - hipDeviceAttributeMemoryPoolSupportedHandleTypes, ///< Supported handle mask for HIP Stream Ordered Memory Allocator + hipDeviceAttributeEccEnabled = hipDeviceAttributeCudaCompatibleBegin, // Whether ECC support is enabled. + hipDeviceAttributeAccessPolicyMaxWindowSize, // Cuda only. The maximum size of the window policy in bytes. + hipDeviceAttributeAsyncEngineCount, // Asynchronous engines number. + hipDeviceAttributeCanMapHostMemory, // Whether host memory can be mapped into device address space + hipDeviceAttributeCanUseHostPointerForRegisteredMem, // Device can access host registered memory + // at the same virtual address as the CPU + hipDeviceAttributeClockRate, // Peak clock frequency in kilohertz. + hipDeviceAttributeComputeMode, // Compute mode that device is currently in. + hipDeviceAttributeComputePreemptionSupported, // Device supports Compute Preemption. + hipDeviceAttributeConcurrentKernels, // Device can possibly execute multiple kernels concurrently. + hipDeviceAttributeConcurrentManagedAccess, // Device can coherently access managed memory concurrently with the CPU + hipDeviceAttributeCooperativeLaunch, // Support cooperative launch + hipDeviceAttributeCooperativeMultiDeviceLaunch, // Support cooperative launch on multiple devices + hipDeviceAttributeDeviceOverlap, // Device can concurrently copy memory and execute a kernel. + // Deprecated. Use instead asyncEngineCount. + hipDeviceAttributeDirectManagedMemAccessFromHost, // Host can directly access managed memory on + // the device without migration + hipDeviceAttributeGlobalL1CacheSupported, // Device supports caching globals in L1 + hipDeviceAttributeHostNativeAtomicSupported, // Link between the device and the host supports native atomic operations + hipDeviceAttributeIntegrated, // Device is integrated GPU + hipDeviceAttributeIsMultiGpuBoard, // Multiple GPU devices. + hipDeviceAttributeKernelExecTimeout, // Run time limit for kernels executed on the device + hipDeviceAttributeL2CacheSize, // Size of L2 cache in bytes. 0 if the device doesn't have L2 cache. + hipDeviceAttributeLocalL1CacheSupported, // caching locals in L1 is supported + hipDeviceAttributeLuid, // 8-byte locally unique identifier in 8 bytes. Undefined on TCC and non-Windows platforms + hipDeviceAttributeLuidDeviceNodeMask, // Luid device node mask. Undefined on TCC and non-Windows platforms + hipDeviceAttributeComputeCapabilityMajor, // Major compute capability version number. + hipDeviceAttributeManagedMemory, // Device supports allocating managed memory on this system + hipDeviceAttributeMaxBlocksPerMultiProcessor, // Max block size per multiprocessor + hipDeviceAttributeMaxBlockDimX, // Max block size in width. + hipDeviceAttributeMaxBlockDimY, // Max block size in height. + hipDeviceAttributeMaxBlockDimZ, // Max block size in depth. + hipDeviceAttributeMaxGridDimX, // Max grid size in width. + hipDeviceAttributeMaxGridDimY, // Max grid size in height. + hipDeviceAttributeMaxGridDimZ, // Max grid size in depth. + hipDeviceAttributeMaxSurface1D, // Maximum size of 1D surface. + hipDeviceAttributeMaxSurface1DLayered, // Cuda only. Maximum dimensions of 1D layered surface. 
+ hipDeviceAttributeMaxSurface2D, // Maximum dimension (width, height) of 2D surface. + hipDeviceAttributeMaxSurface2DLayered, // Cuda only. Maximum dimensions of 2D layered surface. + hipDeviceAttributeMaxSurface3D, // Maximum dimension (width, height, depth) of 3D surface. + hipDeviceAttributeMaxSurfaceCubemap, // Cuda only. Maximum dimensions of Cubemap surface. + hipDeviceAttributeMaxSurfaceCubemapLayered, // Cuda only. Maximum dimension of Cubemap layered surface. + hipDeviceAttributeMaxTexture1DWidth, // Maximum size of 1D texture. + hipDeviceAttributeMaxTexture1DLayered, // Maximum dimensions of 1D layered texture. + hipDeviceAttributeMaxTexture1DLinear, // Maximum number of elements allocatable in a 1D linear texture. + // Use cudaDeviceGetTexture1DLinearMaxWidth() instead on Cuda. + hipDeviceAttributeMaxTexture1DMipmap, // Maximum size of 1D mipmapped texture. + hipDeviceAttributeMaxTexture2DWidth, // Maximum dimension width of 2D texture. + hipDeviceAttributeMaxTexture2DHeight, // Maximum dimension hight of 2D texture. + hipDeviceAttributeMaxTexture2DGather, // Maximum dimensions of 2D texture if gather operations performed. + hipDeviceAttributeMaxTexture2DLayered, // Maximum dimensions of 2D layered texture. + hipDeviceAttributeMaxTexture2DLinear, // Maximum dimensions (width, height, pitch) of 2D textures bound to pitched memory. + hipDeviceAttributeMaxTexture2DMipmap, // Maximum dimensions of 2D mipmapped texture. + hipDeviceAttributeMaxTexture3DWidth, // Maximum dimension width of 3D texture. + hipDeviceAttributeMaxTexture3DHeight, // Maximum dimension height of 3D texture. + hipDeviceAttributeMaxTexture3DDepth, // Maximum dimension depth of 3D texture. + hipDeviceAttributeMaxTexture3DAlt, // Maximum dimensions of alternate 3D texture. + hipDeviceAttributeMaxTextureCubemap, // Maximum dimensions of Cubemap texture + hipDeviceAttributeMaxTextureCubemapLayered, // Maximum dimensions of Cubemap layered texture. + hipDeviceAttributeMaxThreadsDim, // Maximum dimension of a block + hipDeviceAttributeMaxThreadsPerBlock, // Maximum number of threads per block. + hipDeviceAttributeMaxThreadsPerMultiProcessor, // Maximum resident threads per multiprocessor. + hipDeviceAttributeMaxPitch, // Maximum pitch in bytes allowed by memory copies + hipDeviceAttributeMemoryBusWidth, // Global memory bus width in bits. + hipDeviceAttributeMemoryClockRate, // Peak memory clock frequency in kilohertz. + hipDeviceAttributeComputeCapabilityMinor, // Minor compute capability version number. + hipDeviceAttributeMultiGpuBoardGroupID, // Unique ID of device group on the same multi-GPU board + hipDeviceAttributeMultiprocessorCount, // Number of multiprocessors on the device. + hipDeviceAttributeUnused1, // Previously hipDeviceAttributeName + hipDeviceAttributePageableMemoryAccess, // Device supports coherently accessing pageable memory + // without calling hipHostRegister on it + hipDeviceAttributePageableMemoryAccessUsesHostPageTables, // Device accesses pageable memory via the host's page tables + hipDeviceAttributePciBusId, // PCI Bus ID. + hipDeviceAttributePciDeviceId, // PCI Device ID. + hipDeviceAttributePciDomainID, // PCI Domain ID. + hipDeviceAttributePersistingL2CacheMaxSize, // Maximum l2 persisting lines capacity in bytes + hipDeviceAttributeMaxRegistersPerBlock, // 32-bit registers available to a thread block. This number is shared + // by all thread blocks simultaneously resident on a multiprocessor. + hipDeviceAttributeMaxRegistersPerMultiprocessor, // 32-bit registers available per block. 
+  hipDeviceAttributeReservedSharedMemPerBlock,               // Shared memory reserved by CUDA driver per block.
+  hipDeviceAttributeMaxSharedMemoryPerBlock,                 // Maximum shared memory available per block in bytes.
+  hipDeviceAttributeSharedMemPerBlockOptin,                  // Maximum shared memory per block usable by special opt in.
+  hipDeviceAttributeSharedMemPerMultiprocessor,              // Shared memory available per multiprocessor.
+  hipDeviceAttributeSingleToDoublePrecisionPerfRatio,        // Cuda only. Performance ratio of single precision to double precision.
+  hipDeviceAttributeStreamPrioritiesSupported,               // Whether to support stream priorities.
+  hipDeviceAttributeSurfaceAlignment,                        // Alignment requirement for surfaces
+  hipDeviceAttributeTccDriver,                               // Cuda only. Whether device is a Tesla device using TCC driver
+  hipDeviceAttributeTextureAlignment,                        // Alignment requirement for textures
+  hipDeviceAttributeTexturePitchAlignment,                   // Pitch alignment requirement for 2D texture references bound to pitched memory;
+  hipDeviceAttributeTotalConstantMemory,                     // Constant memory size in bytes.
+  hipDeviceAttributeTotalGlobalMem,                          // Global memory available on device.
+  hipDeviceAttributeUnifiedAddressing,                       // Cuda only. A unified address space shared with the host.
+  hipDeviceAttributeUnused2,                                 // Previously hipDeviceAttributeUuid
+  hipDeviceAttributeWarpSize,                                // Warp size in threads.
+  hipDeviceAttributeMemoryPoolsSupported,                    // Device supports HIP Stream Ordered Memory Allocator
+  hipDeviceAttributeVirtualMemoryManagementSupported,        // Device supports HIP virtual memory management
+  hipDeviceAttributeHostRegisterSupported,                   // Can device support host memory registration via hipHostRegister
+  hipDeviceAttributeMemoryPoolSupportedHandleTypes,          // Supported handle mask for HIP Stream Ordered Memory Allocator
-  hipDeviceAttributeCudaCompatibleEnd = 9999,
-  hipDeviceAttributeAmdSpecificBegin = 10000,
+  hipDeviceAttributeCudaCompatibleEnd = 9999,
+  hipDeviceAttributeAmdSpecificBegin = 10000,
-  hipDeviceAttributeClockInstructionRate = hipDeviceAttributeAmdSpecificBegin,  ///< Frequency in khz of the timer used by the device-side "clock*"
-  hipDeviceAttributeUnused3,                                  ///< Previously hipDeviceAttributeArch
-  hipDeviceAttributeMaxSharedMemoryPerMultiprocessor,         ///< Maximum Shared Memory PerMultiprocessor.
-  hipDeviceAttributeUnused4,                                  ///< Previously hipDeviceAttributeGcnArch
-  hipDeviceAttributeUnused5,                                  ///< Previously hipDeviceAttributeGcnArchName
-  hipDeviceAttributeHdpMemFlushCntl,                          ///< Address of the HDP_MEM_COHERENCY_FLUSH_CNTL register
-  hipDeviceAttributeHdpRegFlushCntl,                          ///< Address of the HDP_REG_COHERENCY_FLUSH_CNTL register
-  hipDeviceAttributeCooperativeMultiDeviceUnmatchedFunc,      ///< Supports cooperative launch on multiple
-                                                              ///< devices with unmatched functions
-  hipDeviceAttributeCooperativeMultiDeviceUnmatchedGridDim,   ///< Supports cooperative launch on multiple
-                                                              ///< devices with unmatched grid dimensions
-  hipDeviceAttributeCooperativeMultiDeviceUnmatchedBlockDim,  ///< Supports cooperative launch on multiple
-                                                              ///< devices with unmatched block dimensions
-  hipDeviceAttributeCooperativeMultiDeviceUnmatchedSharedMem, ///< Supports cooperative launch on multiple
-                                                              ///< devices with unmatched shared memories
-  hipDeviceAttributeIsLargeBar,                               ///< Whether it is LargeBar
-  hipDeviceAttributeAsicRevision,                             ///< Revision of the GPU in this device
-  hipDeviceAttributeCanUseStreamWaitValue,                    ///< '1' if Device supports hipStreamWaitValue32() and
-                                                              ///< hipStreamWaitValue64(), '0' otherwise.
- hipDeviceAttributeImageSupport, ///< '1' if Device supports image, '0' otherwise. - hipDeviceAttributePhysicalMultiProcessorCount, ///< All available physical compute - ///< units for the device - hipDeviceAttributeFineGrainSupport, ///< '1' if Device supports fine grain, '0' otherwise - hipDeviceAttributeWallClockRate, ///< Constant frequency of wall clock in kilohertz. + hipDeviceAttributeClockInstructionRate = hipDeviceAttributeAmdSpecificBegin, // Frequency in khz of the timer used by the device-side "clock*" + hipDeviceAttributeUnused3, // Previously hipDeviceAttributeArch + hipDeviceAttributeMaxSharedMemoryPerMultiprocessor, // Maximum Shared Memory PerMultiprocessor. + hipDeviceAttributeUnused4, // Previously hipDeviceAttributeGcnArch + hipDeviceAttributeUnused5, // Previously hipDeviceAttributeGcnArchName + hipDeviceAttributeHdpMemFlushCntl, // Address of the HDP_MEM_COHERENCY_FLUSH_CNTL register + hipDeviceAttributeHdpRegFlushCntl, // Address of the HDP_REG_COHERENCY_FLUSH_CNTL register + hipDeviceAttributeCooperativeMultiDeviceUnmatchedFunc, // Supports cooperative launch on multiple devices with unmatched functions + hipDeviceAttributeCooperativeMultiDeviceUnmatchedGridDim, // Supports cooperative launch on multiple devices with unmatched grid dimensions + hipDeviceAttributeCooperativeMultiDeviceUnmatchedBlockDim, // Supports cooperative launch on multiple devices with unmatched block dimensions + hipDeviceAttributeCooperativeMultiDeviceUnmatchedSharedMem, // Supports cooperative launch on multiple devices with unmatched shared memories + hipDeviceAttributeIsLargeBar, // Whether it is LargeBar + hipDeviceAttributeAsicRevision, // Revision of the GPU in this device + hipDeviceAttributeCanUseStreamWaitValue, // '1' if Device supports hipStreamWaitValue32() and hipStreamWaitValue64(), '0' otherwise. + hipDeviceAttributeImageSupport, // '1' if Device supports image, '0' otherwise. + hipDeviceAttributePhysicalMultiProcessorCount, // All available physical compute units for the device + hipDeviceAttributeFineGrainSupport, // '1' if Device supports fine grain, '0' otherwise + hipDeviceAttributeWallClockRate, // Constant frequency of wall clock in kilohertz. + + hipDeviceAttributeAmdSpecificEnd = 19999, + hipDeviceAttributeVendorSpecificBegin = 20000, + // Extended attributes for vendors - hipDeviceAttributeAmdSpecificEnd = 19999, - hipDeviceAttributeVendorSpecificBegin = 20000, - // Extended attributes for vendors } hipDeviceAttribute_t; /** * hipDeviceArch_t * */ -typedef struct { - // 32-bit Atomics - unsigned hasGlobalInt32Atomics : 1; ///< 32-bit integer atomics for global memory. - unsigned hasGlobalFloatAtomicExch : 1; ///< 32-bit float atomic exch for global memory. - unsigned hasSharedInt32Atomics : 1; ///< 32-bit integer atomics for shared memory. - unsigned hasSharedFloatAtomicExch : 1; ///< 32-bit float atomic exch for shared memory. - unsigned hasFloatAtomicAdd : 1; ///< 32-bit float atomic add in global and shared memory. +typedef struct +{ + // 32-bit Atomics + unsigned hasGlobalInt32Atomics : 1; // 32-bit integer atomics for global memory. + unsigned hasGlobalFloatAtomicExch : 1; // 32-bit float atomic exch for global memory. + unsigned hasSharedInt32Atomics : 1; // 32-bit integer atomics for shared memory. + unsigned hasSharedFloatAtomicExch : 1; // 32-bit float atomic exch for shared memory. + unsigned hasFloatAtomicAdd : 1; // 32-bit float atomic add in global and shared memory. 
- // 64-bit Atomics - unsigned hasGlobalInt64Atomics : 1; ///< 64-bit integer atomics for global memory. - unsigned hasSharedInt64Atomics : 1; ///< 64-bit integer atomics for shared memory. + // 64-bit Atomics + unsigned hasGlobalInt64Atomics : 1; // 64-bit integer atomics for global memory. + unsigned hasSharedInt64Atomics : 1; // 64-bit integer atomics for shared memory. - // Doubles - unsigned hasDoubles : 1; ///< Double-precision floating point. + // Doubles + unsigned hasDoubles : 1; // Double-precision floating point. - // Warp cross-lane operations - unsigned hasWarpVote : 1; ///< Warp vote instructions (__any, __all). - unsigned hasWarpBallot : 1; ///< Warp ballot instructions (__ballot). - unsigned hasWarpShuffle : 1; ///< Warp shuffle operations. (__shfl_*). - unsigned hasFunnelShift : 1; ///< Funnel two words into one with shift&mask caps. + // Warp cross-lane operations + unsigned hasWarpVote : 1; // Warp vote instructions (__any, __all). + unsigned hasWarpBallot : 1; // Warp ballot instructions (__ballot). + unsigned hasWarpShuffle : 1; // Warp shuffle operations. (__shfl_*). + unsigned hasFunnelShift : 1; // Funnel two words into one with shift&mask caps. - // Sync - unsigned hasThreadFenceSystem : 1; ///< __threadfence_system. - unsigned hasSyncThreadsExt : 1; ///< __syncthreads_count, syncthreads_and, syncthreads_or. + // Sync + unsigned hasThreadFenceSystem : 1; // __threadfence_system. + unsigned hasSyncThreadsExt : 1; // __syncthreads_count, syncthreads_and, syncthreads_or. + + // Misc + unsigned hasSurfaceFuncs : 1; // Surface functions. + unsigned has3dGrid : 1; // Grid and group dims are 3D (rather than 2D). + unsigned hasDynamicParallelism : 1; // Dynamic parallelism. - // Misc - unsigned hasSurfaceFuncs : 1; ///< Surface functions. - unsigned has3dGrid : 1; ///< Grid and group dims are 3D (rather than 2D). - unsigned hasDynamicParallelism : 1; ///< Dynamic parallelism. } hipDeviceArch_t; -typedef struct hipUUID_t { - char bytes[16]; +typedef struct hipUUID_t +{ + char bytes[16]; + } hipUUID; @@ -349,149 +333,127 @@ typedef struct hipUUID_t { * hipDeviceProp * */ -typedef struct hipDeviceProp_t { - char name[256]; ///< Device name. - hipUUID uuid; ///< UUID of a device - char luid[8]; ///< 8-byte unique identifier. Only valid on windows - unsigned int luidDeviceNodeMask; ///< LUID node mask - size_t totalGlobalMem; ///< Size of global memory region (in bytes). - size_t sharedMemPerBlock; ///< Size of shared memory per block (in bytes). - int regsPerBlock; ///< Registers per block. - int warpSize; ///< Warp size. - size_t memPitch; ///< Maximum pitch in bytes allowed by memory copies - ///< pitched memory - int maxThreadsPerBlock; ///< Max work items per work group or workgroup max size. - int maxThreadsDim[3]; ///< Max number of threads in each dimension (XYZ) of a block. - int maxGridSize[3]; ///< Max grid dimensions (XYZ). - int clockRate; ///< Max clock frequency of the multiProcessors in khz. - size_t totalConstMem; ///< Size of shared constant memory region on the device - ///< (in bytes). - int major; ///< Major compute capability. On HCC, this is an approximation and features may - ///< differ from CUDA CC. See the arch feature flags for portable ways to query - ///< feature caps. - int minor; ///< Minor compute capability. On HCC, this is an approximation and features may - ///< differ from CUDA CC. See the arch feature flags for portable ways to query - ///< feature caps. 
- size_t textureAlignment; ///< Alignment requirement for textures - size_t texturePitchAlignment; ///< Pitch alignment requirement for texture references bound to - int deviceOverlap; ///< Deprecated. Use asyncEngineCount instead - int multiProcessorCount; ///< Number of multi-processors (compute units). - int kernelExecTimeoutEnabled; ///< Run time limit for kernels executed on the device - int integrated; ///< APU vs dGPU - int canMapHostMemory; ///< Check whether HIP can map host memory - int computeMode; ///< Compute mode. - int maxTexture1D; ///< Maximum number of elements in 1D images - int maxTexture1DMipmap; ///< Maximum 1D mipmap texture size - int maxTexture1DLinear; ///< Maximum size for 1D textures bound to linear memory - int maxTexture2D[2]; ///< Maximum dimensions (width, height) of 2D images, in image elements - int maxTexture2DMipmap[2]; ///< Maximum number of elements in 2D array mipmap of images - int maxTexture2DLinear[3]; ///< Maximum 2D tex dimensions if tex are bound to pitched memory - int maxTexture2DGather[2]; ///< Maximum 2D tex dimensions if gather has to be performed - int maxTexture3D[3]; ///< Maximum dimensions (width, height, depth) of 3D images, in image - ///< elements - int maxTexture3DAlt[3]; ///< Maximum alternate 3D texture dims - int maxTextureCubemap; ///< Maximum cubemap texture dims - int maxTexture1DLayered[2]; ///< Maximum number of elements in 1D array images - int maxTexture2DLayered[3]; ///< Maximum number of elements in 2D array images - int maxTextureCubemapLayered[2]; ///< Maximum cubemaps layered texture dims - int maxSurface1D; ///< Maximum 1D surface size - int maxSurface2D[2]; ///< Maximum 2D surface size - int maxSurface3D[3]; ///< Maximum 3D surface size - int maxSurface1DLayered[2]; ///< Maximum 1D layered surface size - int maxSurface2DLayered[3]; ///< Maximum 2D layared surface size - int maxSurfaceCubemap; ///< Maximum cubemap surface size - int maxSurfaceCubemapLayered[2]; ///< Maximum cubemap layered surface size - size_t surfaceAlignment; ///< Alignment requirement for surface - int concurrentKernels; ///< Device can possibly execute multiple kernels concurrently. - int ECCEnabled; ///< Device has ECC support enabled - int pciBusID; ///< PCI Bus ID. - int pciDeviceID; ///< PCI Device ID. - int pciDomainID; ///< PCI Domain ID - int tccDriver; ///< 1:If device is Tesla device using TCC driver, else 0 - int asyncEngineCount; ///< Number of async engines - int unifiedAddressing; ///< Does device and host share unified address space - int memoryClockRate; ///< Max global memory clock frequency in khz. - int memoryBusWidth; ///< Global memory bus width in bits. - int l2CacheSize; ///< L2 cache size. - int persistingL2CacheMaxSize; ///< Device's max L2 persisting lines in bytes - int maxThreadsPerMultiProcessor; ///< Maximum resident threads per multi-processor. - int streamPrioritiesSupported; ///< Device supports stream priority - int globalL1CacheSupported; ///< Indicates globals are cached in L1 - int localL1CacheSupported; ///< Locals are cahced in L1 - size_t sharedMemPerMultiprocessor; ///< Amount of shared memory available per multiprocessor. - int regsPerMultiprocessor; ///< registers available per multiprocessor - int managedMemory; ///< Device supports allocating managed memory on this system - int isMultiGpuBoard; ///< 1 if device is on a multi-GPU board, 0 if not. 
- int multiGpuBoardGroupID; ///< Unique identifier for a group of devices on same multiboard GPU - int hostNativeAtomicSupported; ///< Link between host and device supports native atomics - int singleToDoublePrecisionPerfRatio; ///< Deprecated. CUDA only. - int pageableMemoryAccess; ///< Device supports coherently accessing pageable memory - ///< without calling hipHostRegister on it - int concurrentManagedAccess; ///< Device can coherently access managed memory concurrently with - ///< the CPU - int computePreemptionSupported; ///< Is compute preemption supported on the device - int canUseHostPointerForRegisteredMem; ///< Device can access host registered memory with same - ///< address as the host - int cooperativeLaunch; ///< HIP device supports cooperative launch - int cooperativeMultiDeviceLaunch; ///< HIP device supports cooperative launch on multiple - ///< devices - size_t - sharedMemPerBlockOptin; ///< Per device m ax shared mem per block usable by special opt in - int pageableMemoryAccessUsesHostPageTables; ///< Device accesses pageable memory via the host's - ///< page tables - int directManagedMemAccessFromHost; ///< Host can directly access managed memory on the device - ///< without migration - int maxBlocksPerMultiProcessor; ///< Max number of blocks on CU - int accessPolicyMaxWindowSize; ///< Max value of access policy window - size_t reservedSharedMemPerBlock; ///< Shared memory reserved by driver per block - int hostRegisterSupported; ///< Device supports hipHostRegister - int sparseHipArraySupported; ///< Indicates if device supports sparse hip arrays - int hostRegisterReadOnlySupported; ///< Device supports using the hipHostRegisterReadOnly flag - ///< with hipHostRegistger - int timelineSemaphoreInteropSupported; ///< Indicates external timeline semaphore support - int memoryPoolsSupported; ///< Indicates if device supports hipMallocAsync and hipMemPool APIs - int gpuDirectRDMASupported; ///< Indicates device support of RDMA APIs - unsigned int gpuDirectRDMAFlushWritesOptions; ///< Bitmask to be interpreted according to - ///< hipFlushGPUDirectRDMAWritesOptions - int gpuDirectRDMAWritesOrdering; ///< value of hipGPUDirectRDMAWritesOrdering - unsigned int - memoryPoolSupportedHandleTypes; ///< Bitmask of handle types support with mempool based IPC - int deferredMappingHipArraySupported; ///< Device supports deferred mapping HIP arrays and HIP - ///< mipmapped arrays - int ipcEventSupported; ///< Device supports IPC events - int clusterLaunch; ///< Device supports cluster launch - int unifiedFunctionPointers; ///< Indicates device supports unified function pointers - int reserved[63]; ///< CUDA Reserved. +typedef struct hipDeviceProp_t +{ + char name[256]; // Device name. + hipUUID uuid; // UUID of a device + char luid[8]; // 8-byte unique identifier. Only valid on windows + unsigned int luidDeviceNodeMask; // LUID node mask + size_t totalGlobalMem; // Size of global memory region (in bytes). + size_t sharedMemPerBlock; // Size of shared memory per block (in bytes). + int regsPerBlock; // Registers per block. + int warpSize; // Warp size. + size_t memPitch; // Maximum pitch in bytes allowed by memory copies pitched memory + int maxThreadsPerBlock; // Max work items per work group or workgroup max size. + int maxThreadsDim[3]; // Max number of threads in each dimension (XYZ) of a block. + int maxGridSize[3]; // Max grid dimensions (XYZ). + int clockRate; // Max clock frequency of the multiProcessors in khz. 
+  size_t totalConstMem;                          // Size of shared constant memory region on the device (in bytes).
+  int major;                                     // Major compute capability. On HCC, this is an approximation and features may
+                                                 // differ from CUDA CC. See the arch feature flags for portable ways to query feature caps.
+  int minor;                                     // Minor compute capability. On HCC, this is an approximation and features may
+                                                 // differ from CUDA CC. See the arch feature flags for portable ways to query feature caps.
+  size_t textureAlignment;                       // Alignment requirement for textures
+  size_t texturePitchAlignment;                  // Pitch alignment requirement for texture references bound to
+  int deviceOverlap;                             // Deprecated. Use asyncEngineCount instead
+  int multiProcessorCount;                       // Number of multi-processors (compute units).
+  int kernelExecTimeoutEnabled;                  // Run time limit for kernels executed on the device
+  int integrated;                                // APU vs dGPU
+  int canMapHostMemory;                          // Check whether HIP can map host memory
+  int computeMode;                               // Compute mode.
+  int maxTexture1D;                              // Maximum number of elements in 1D images
+  int maxTexture1DMipmap;                        // Maximum 1D mipmap texture size
+  int maxTexture1DLinear;                        // Maximum size for 1D textures bound to linear memory
+  int maxTexture2D[2];                           // Maximum dimensions (width, height) of 2D images, in image elements
+  int maxTexture2DMipmap[2];                     // Maximum number of elements in 2D array mipmap of images
+  int maxTexture2DLinear[3];                     // Maximum 2D tex dimensions if tex are bound to pitched memory
+  int maxTexture2DGather[2];                     // Maximum 2D tex dimensions if gather has to be performed
+  int maxTexture3D[3];                           // Maximum dimensions (width, height, depth) of 3D images, in image elements
+  int maxTexture3DAlt[3];                        // Maximum alternate 3D texture dims
+  int maxTextureCubemap;                         // Maximum cubemap texture dims
+  int maxTexture1DLayered[2];                    // Maximum number of elements in 1D array images
+  int maxTexture2DLayered[3];                    // Maximum number of elements in 2D array images
+  int maxTextureCubemapLayered[2];               // Maximum cubemaps layered texture dims
+  int maxSurface1D;                              // Maximum 1D surface size
+  int maxSurface2D[2];                           // Maximum 2D surface size
+  int maxSurface3D[3];                           // Maximum 3D surface size
+  int maxSurface1DLayered[2];                    // Maximum 1D layered surface size
+  int maxSurface2DLayered[3];                    // Maximum 2D layered surface size
+  int maxSurfaceCubemap;                         // Maximum cubemap surface size
+  int maxSurfaceCubemapLayered[2];               // Maximum cubemap layered surface size
+  size_t surfaceAlignment;                       // Alignment requirement for surface
+  int concurrentKernels;                         // Device can possibly execute multiple kernels concurrently.
+  int ECCEnabled;                                // Device has ECC support enabled
+  int pciBusID;                                  // PCI Bus ID.
+  int pciDeviceID;                               // PCI Device ID.
+  int pciDomainID;                               // PCI Domain ID
+  int tccDriver;                                 // 1:If device is Tesla device using TCC driver, else 0
+  int asyncEngineCount;                          // Number of async engines
+  int unifiedAddressing;                         // Does device and host share unified address space
+  int memoryClockRate;                           // Max global memory clock frequency in khz.
+  int memoryBusWidth;                            // Global memory bus width in bits.
+  int l2CacheSize;                               // L2 cache size.
+  int persistingL2CacheMaxSize;                  // Device's max L2 persisting lines in bytes
+  int maxThreadsPerMultiProcessor;               // Maximum resident threads per multi-processor.
+  int streamPrioritiesSupported;                 // Device supports stream priority
+  int globalL1CacheSupported;                    // Indicates globals are cached in L1
+  int localL1CacheSupported;                     // Locals are cached in L1
+  size_t sharedMemPerMultiprocessor;             // Amount of shared memory available per multiprocessor.
+  int regsPerMultiprocessor;                     // registers available per multiprocessor
+  int managedMemory;                             // Device supports allocating managed memory on this system
+  int isMultiGpuBoard;                           // 1 if device is on a multi-GPU board, 0 if not.
+  int multiGpuBoardGroupID;                      // Unique identifier for a group of devices on same multiboard GPU
+  int hostNativeAtomicSupported;                 // Link between host and device supports native atomics
+  int singleToDoublePrecisionPerfRatio;          // Deprecated. CUDA only.
+  int pageableMemoryAccess;                      // Device supports coherently accessing pageable memory
+                                                 // without calling hipHostRegister on it
+  int concurrentManagedAccess;                   // Device can coherently access managed memory concurrently with the CPU
+  int computePreemptionSupported;                // Is compute preemption supported on the device
+  int canUseHostPointerForRegisteredMem;         // Device can access host registered memory with same address as the host
+  int cooperativeLaunch;                         // HIP device supports cooperative launch
+  int cooperativeMultiDeviceLaunch;              // HIP device supports cooperative launch on multiple devices
+  size_t sharedMemPerBlockOptin;                 // Per device max shared mem per block usable by special opt in
+  int pageableMemoryAccessUsesHostPageTables;    // Device accesses pageable memory via the host's page tables
+  int directManagedMemAccessFromHost;            // Host can directly access managed memory on the device without migration
+  int maxBlocksPerMultiProcessor;                // Max number of blocks on CU
+  int accessPolicyMaxWindowSize;                 // Max value of access policy window
+  size_t reservedSharedMemPerBlock;              // Shared memory reserved by driver per block
+  int hostRegisterSupported;                     // Device supports hipHostRegister
+  int sparseHipArraySupported;                   // Indicates if device supports sparse hip arrays
+  int hostRegisterReadOnlySupported;             // Device supports using the hipHostRegisterReadOnly flag with hipHostRegister
+  int timelineSemaphoreInteropSupported;         // Indicates external timeline semaphore support
+  int memoryPoolsSupported;                      // Indicates if device supports hipMallocAsync and hipMemPool APIs
+  int gpuDirectRDMASupported;                    // Indicates device support of RDMA APIs
+  unsigned int gpuDirectRDMAFlushWritesOptions;  // Bitmask to be interpreted according to hipFlushGPUDirectRDMAWritesOptions
+  int gpuDirectRDMAWritesOrdering;               // value of hipGPUDirectRDMAWritesOrdering
+  unsigned int memoryPoolSupportedHandleTypes;   // Bitmask of handle types support with mempool based IPC
+  int deferredMappingHipArraySupported;          // Device supports deferred mapping HIP arrays and HIP mipmapped arrays
+  int ipcEventSupported;                         // Device supports IPC events
+  int clusterLaunch;                             // Device supports cluster launch
+  int unifiedFunctionPointers;                   // Indicates device supports unified function pointers
+  int reserved[63];                              // CUDA Reserved.
-  int hipReserved[32];                           ///< Reserved for adding new entries for HIP/CUDA.
+  int hipReserved[32];                           // Reserved for adding new entries for HIP/CUDA.
+
+  /* HIP Only struct members */
+  char gcnArchName[256];                         // AMD GCN Arch Name. HIP Only.
+  size_t maxSharedMemoryPerMultiProcessor;       // Maximum Shared Memory Per CU. HIP Only.
+  int clockInstructionRate;                      // Frequency in khz of the timer used by the device-side "clock*" instructions. New for HIP.
+  hipDeviceArch_t arch;                          // Architectural feature flags. New for HIP.
+ unsigned int* hdpMemFlushCntl; // Addres of HDP_MEM_COHERENCY_FLUSH_CNTL register + unsigned int* hdpRegFlushCntl; // Addres of HDP_REG_COHERENCY_FLUSH_CNTL register + int cooperativeMultiDeviceUnmatchedFunc; // HIP device supports cooperative launch on multiple devices with unmatched functions + int cooperativeMultiDeviceUnmatchedGridDim; // HIP device supports cooperative launch on multiple devices with unmatched grid dimensions + int cooperativeMultiDeviceUnmatchedBlockDim; // HIP device supports cooperative launch on multiple devices with unmatched block dimensions + int cooperativeMultiDeviceUnmatchedSharedMem; // HIP device supports cooperative launch on multiple devices with unmatched shared memories + int isLargeBar; // 1: if it is a large PCI bar device, else 0 + int asicRevision; // Revision of the GPU in this device - /* HIP Only struct members */ - char gcnArchName[256]; ///< AMD GCN Arch Name. HIP Only. - size_t maxSharedMemoryPerMultiProcessor; ///< Maximum Shared Memory Per CU. HIP Only. - int clockInstructionRate; ///< Frequency in khz of the timer used by the device-side "clock*" - ///< instructions. New for HIP. - hipDeviceArch_t arch; ///< Architectural feature flags. New for HIP. - unsigned int* hdpMemFlushCntl; ///< Addres of HDP_MEM_COHERENCY_FLUSH_CNTL register - unsigned int* hdpRegFlushCntl; ///< Addres of HDP_REG_COHERENCY_FLUSH_CNTL register - int cooperativeMultiDeviceUnmatchedFunc; ///< HIP device supports cooperative launch on - ///< multiple - /// devices with unmatched functions - int cooperativeMultiDeviceUnmatchedGridDim; ///< HIP device supports cooperative launch on - ///< multiple - /// devices with unmatched grid dimensions - int cooperativeMultiDeviceUnmatchedBlockDim; ///< HIP device supports cooperative launch on - ///< multiple - /// devices with unmatched block dimensions - int cooperativeMultiDeviceUnmatchedSharedMem; ///< HIP device supports cooperative launch on - ///< multiple - /// devices with unmatched shared memories - int isLargeBar; ///< 1: if it is a large PCI bar device, else 0 - int asicRevision; ///< Revision of the GPU in this device } hipDeviceProp_t; //Flags that can be used with hipStreamCreateWithFlags. /** Default stream creation flags. These are used with hipStreamCreate().*/ -#define hipStreamDefault 0x00 +#define hipStreamDefault 0x00 /** Stream does not implicitly synchronize with null stream.*/ #define hipStreamNonBlocking 0x01 @@ -511,46 +473,48 @@ typedef struct hipDeviceProp_t { /** Use a device-scope release when recording this event. This flag is useful to obtain more * precise timings of commands between events. The flag is a no-op on CUDA platforms.*/ -#define hipEventReleaseToDevice 0x40000000 +#define hipEventReleaseToDevice 0x40000000 /** Use a system-scope release when recording this event. This flag is useful to make * non-coherent host memory visible to the host. The flag is a no-op on CUDA platforms.*/ -#define hipEventReleaseToSystem 0x80000000 +#define hipEventReleaseToSystem 0x80000000 /** Automatically select between Spin and Yield.*/ #define hipDeviceScheduleAuto 0x0 /** Dedicate a CPU core to spin-wait. Provides lowest latency, but burns a CPU core and may * consume more power.*/ -#define hipDeviceScheduleSpin 0x1 +#define hipDeviceScheduleSpin 0x1 /** Yield the CPU to the operating system when waiting. 
May increase latency, but lowers power * and is friendlier to other threads in the system.*/ -#define hipDeviceScheduleYield 0x2 +#define hipDeviceScheduleYield 0x2 #define hipDeviceScheduleBlockingSync 0x4 #define hipDeviceScheduleMask 0x7 #define hipDeviceMapHost 0x8 #define hipDeviceLmemResizeToMax 0x16 -typedef enum hipJitOption { - hipJitOptionMaxRegisters = 0, - hipJitOptionThreadsPerBlock, - hipJitOptionWallTime, - hipJitOptionInfoLogBuffer, - hipJitOptionInfoLogBufferSizeBytes, - hipJitOptionErrorLogBuffer, - hipJitOptionErrorLogBufferSizeBytes, - hipJitOptionOptimizationLevel, - hipJitOptionTargetFromContext, - hipJitOptionTarget, - hipJitOptionFallbackStrategy, - hipJitOptionGenerateDebugInfo, - hipJitOptionLogVerbose, - hipJitOptionGenerateLineInfo, - hipJitOptionCacheMode, - hipJitOptionSm3xOpt, - hipJitOptionFastCompile, - hipJitOptionNumOptions +typedef enum hipJitOption +{ + hipJitOptionMaxRegisters = 0, + hipJitOptionThreadsPerBlock, + hipJitOptionWallTime, + hipJitOptionInfoLogBuffer, + hipJitOptionInfoLogBufferSizeBytes, + hipJitOptionErrorLogBuffer, + hipJitOptionErrorLogBufferSizeBytes, + hipJitOptionOptimizationLevel, + hipJitOptionTargetFromContext, + hipJitOptionTarget, + hipJitOptionFallbackStrategy, + hipJitOptionGenerateDebugInfo, + hipJitOptionLogVerbose, + hipJitOptionGenerateLineInfo, + hipJitOptionCacheMode, + hipJitOptionSm3xOpt, + hipJitOptionFastCompile, + hipJitOptionNumOptions + } hipJitOption; // stop: hip_runtime_api.h @@ -563,11 +527,17 @@ typedef enum hipJitOption { #define HIP_API_CALL HIPAPI +// deprecated typedef hipError_t (HIP_API_CALL *HIP_HIPCTXCREATE) (hipCtx_t *, unsigned int, hipDevice_t); +// deprecated typedef hipError_t (HIP_API_CALL *HIP_HIPCTXDESTROY) (hipCtx_t); +// deprecated typedef hipError_t (HIP_API_CALL *HIP_HIPCTXPOPCURRENT) (hipCtx_t *); +// deprecated typedef hipError_t (HIP_API_CALL *HIP_HIPCTXPUSHCURRENT) (hipCtx_t); +// deprecated typedef hipError_t (HIP_API_CALL *HIP_HIPCTXSETCURRENT) (hipCtx_t); +// deprecated typedef hipError_t (HIP_API_CALL *HIP_HIPCTXSYNCHRONIZE) (void); typedef hipError_t (HIP_API_CALL *HIP_HIPDEVICEGETATTRIBUTE) (int *, hipDeviceAttribute_t, hipDevice_t); typedef hipError_t (HIP_API_CALL *HIP_HIPDEVICEGETCOUNT) (int *); @@ -575,7 +545,8 @@ typedef hipError_t (HIP_API_CALL *HIP_HIPDEVICEGET) (hipDevice_t *, typedef hipError_t (HIP_API_CALL *HIP_HIPDEVICEGETNAME) (char *, int, hipDevice_t); typedef hipError_t (HIP_API_CALL *HIP_HIPDEVICETOTALMEM) (size_t *, hipDevice_t); typedef hipError_t (HIP_API_CALL *HIP_HIPDRIVERGETVERSION) (int *); -typedef hipError_t (HIP_API_CALL *HIP_HIPEVENTCREATE) (hipEvent_t *, unsigned int); +typedef hipError_t (HIP_API_CALL *HIP_HIPEVENTCREATE) (hipEvent_t *); +typedef hipError_t (HIP_API_CALL *HIP_HIPEVENTCREATEWITHFLAGS) (hipEvent_t *, unsigned int); typedef hipError_t (HIP_API_CALL *HIP_HIPEVENTDESTROY) (hipEvent_t); typedef hipError_t (HIP_API_CALL *HIP_HIPEVENTELAPSEDTIME) (float *, hipEvent_t, hipEvent_t); typedef hipError_t (HIP_API_CALL *HIP_HIPEVENTRECORD) (hipEvent_t, hipStream_t); @@ -603,21 +574,30 @@ typedef hipError_t (HIP_API_CALL *HIP_HIPMODULEGETGLOBAL) (hipDeviceptr_t typedef hipError_t (HIP_API_CALL *HIP_HIPMODULELOADDATAEX) (hipModule_t *, const void *, unsigned int, hipJitOption *, void **); typedef hipError_t (HIP_API_CALL *HIP_HIPMODULEUNLOAD) (hipModule_t); typedef hipError_t (HIP_API_CALL *HIP_HIPRUNTIMEGETVERSION) (int *); -typedef hipError_t (HIP_API_CALL *HIP_HIPSTREAMCREATE) (hipStream_t *, unsigned int); +typedef hipError_t (HIP_API_CALL 
*HIP_HIPSETDEVICE) (hipDevice_t); +typedef hipError_t (HIP_API_CALL *HIP_HIPSETDEVICEFLAGS) (unsigned int); +typedef hipError_t (HIP_API_CALL *HIP_HIPSTREAMCREATE) (hipStream_t *); +typedef hipError_t (HIP_API_CALL *HIP_HIPSTREAMCREATEWITHFLAGS) (hipStream_t *, unsigned int); typedef hipError_t (HIP_API_CALL *HIP_HIPSTREAMDESTROY) (hipStream_t); typedef hipError_t (HIP_API_CALL *HIP_HIPSTREAMSYNCHRONIZE) (hipStream_t); typedef hipError_t (HIP_API_CALL *HIP_HIPGETDEVICEPROPERTIES) (hipDeviceProp_t *, hipDevice_t); -typedef hipError_t (HIP_API_CALL *HIP_HIPMODULEOCCUPANCYMAXACTIVEBLOCKSPERMULTIPROCESSOR) (int *, hipFunction_t, int, size_t); +typedef hipError_t (HIP_API_CALL *HIP_HIPMODULEOCCUPANCYMAXACTIVEBLOCKSPERMULTIPROCESSOR) (int *, hipFunction_t, int, size_t); typedef struct hc_hip_lib { hc_dynlib_t lib; + // deprecated HIP_HIPCTXCREATE hipCtxCreate; + // deprecated HIP_HIPCTXDESTROY hipCtxDestroy; + // deprecated HIP_HIPCTXPOPCURRENT hipCtxPopCurrent; + // deprecated HIP_HIPCTXPUSHCURRENT hipCtxPushCurrent; + // deprecated HIP_HIPCTXSETCURRENT hipCtxSetCurrent; + // deprecated HIP_HIPCTXSYNCHRONIZE hipCtxSynchronize; HIP_HIPDEVICEGETATTRIBUTE hipDeviceGetAttribute; HIP_HIPDEVICEGETCOUNT hipDeviceGetCount; @@ -626,6 +606,7 @@ typedef struct hc_hip_lib HIP_HIPDEVICETOTALMEM hipDeviceTotalMem; HIP_HIPDRIVERGETVERSION hipDriverGetVersion; HIP_HIPEVENTCREATE hipEventCreate; + HIP_HIPEVENTCREATEWITHFLAGS hipEventCreateWithFlags; HIP_HIPEVENTDESTROY hipEventDestroy; HIP_HIPEVENTELAPSEDTIME hipEventElapsedTime; HIP_HIPEVENTRECORD hipEventRecord; @@ -653,62 +634,75 @@ typedef struct hc_hip_lib HIP_HIPMODULELOADDATAEX hipModuleLoadDataEx; HIP_HIPMODULEUNLOAD hipModuleUnload; HIP_HIPRUNTIMEGETVERSION hipRuntimeGetVersion; + HIP_HIPSETDEVICE hipSetDevice; + HIP_HIPSETDEVICEFLAGS hipSetDeviceFlags; HIP_HIPSTREAMCREATE hipStreamCreate; + HIP_HIPSTREAMCREATEWITHFLAGS hipStreamCreateWithFlags; HIP_HIPSTREAMDESTROY hipStreamDestroy; HIP_HIPSTREAMSYNCHRONIZE hipStreamSynchronize; HIP_HIPGETDEVICEPROPERTIES hipGetDeviceProperties; - HIP_HIPMODULEOCCUPANCYMAXACTIVEBLOCKSPERMULTIPROCESSOR hipModuleOccupancyMaxActiveBlocksPerMultiprocessor; + HIP_HIPMODULEOCCUPANCYMAXACTIVEBLOCKSPERMULTIPROCESSOR hipModuleOccupancyMaxActiveBlocksPerMultiprocessor; } hc_hip_lib_t; typedef hc_hip_lib_t HIP_PTR; -int hip_init (void *hashcat_ctx); -void hip_close (void *hashcat_ctx); +int hip_init (void *hashcat_ctx); +void hip_close (void *hashcat_ctx); -int hc_hipCtxCreate (void *hashcat_ctx, hipCtx_t *pctx, unsigned int flags, hipDevice_t dev); -int hc_hipCtxDestroy (void *hashcat_ctx, hipCtx_t ctx); -int hc_hipCtxPopCurrent (void *hashcat_ctx, hipCtx_t *pctx); -int hc_hipCtxPushCurrent (void *hashcat_ctx, hipCtx_t ctx); -int hc_hipCtxSetCurrent (void *hashcat_ctx, hipCtx_t ctx); -int hc_hipCtxSynchronize (void *hashcat_ctx); -int hc_hipDeviceGet (void *hashcat_ctx, hipDevice_t *device, int ordinal); -int hc_hipDeviceGetAttribute (void *hashcat_ctx, int *pi, hipDeviceAttribute_t attrib, hipDevice_t dev); -int hc_hipDeviceGetCount (void *hashcat_ctx, int *count); -int hc_hipDeviceGetName (void *hashcat_ctx, char *name, int len, hipDevice_t dev); -int hc_hipDeviceTotalMem (void *hashcat_ctx, size_t *bytes, hipDevice_t dev); -int hc_hipDriverGetVersion (void *hashcat_ctx, int *driverVersion); -int hc_hipEventCreate (void *hashcat_ctx, hipEvent_t *phEvent, unsigned int Flags); -int hc_hipEventDestroy (void *hashcat_ctx, hipEvent_t hEvent); -int hc_hipEventElapsedTime (void *hashcat_ctx, float *pMilliseconds, hipEvent_t hStart, 
hipEvent_t hEnd); -int hc_hipEventQuery (void *hashcat_ctx, hipEvent_t hEvent); -int hc_hipEventRecord (void *hashcat_ctx, hipEvent_t hEvent, hipStream_t hStream); -int hc_hipEventSynchronize (void *hashcat_ctx, hipEvent_t hEvent); -int hc_hipFuncGetAttribute (void *hashcat_ctx, int *pi, hipFunction_attribute attrib, hipFunction_t hfunc); -int hc_hipInit (void *hashcat_ctx, unsigned int Flags); -int hc_hipLaunchKernel (void *hashcat_ctx, hipFunction_t f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, hipStream_t hStream, void **kernelParams, void **extra); -int hc_hipMemAlloc (void *hashcat_ctx, hipDeviceptr_t *dptr, size_t bytesize); -int hc_hipMemFree (void *hashcat_ctx, hipDeviceptr_t dptr); -int hc_hipMemGetInfo (void *hashcat_ctx, size_t *free, size_t *total); -int hc_hipMemcpyDtoD (void *hashcat_ctx, hipDeviceptr_t dstDevice, hipDeviceptr_t srcDevice, size_t ByteCount); -int hc_hipMemcpyDtoH (void *hashcat_ctx, void *dstHost, hipDeviceptr_t srcDevice, size_t ByteCount); -int hc_hipMemcpyHtoD (void *hashcat_ctx, hipDeviceptr_t dstDevice, const void *srcHost, size_t ByteCount); -int hc_hipMemsetD32 (void *hashcat_ctx, hipDeviceptr_t dstDevice, unsigned int ui, size_t N); -int hc_hipMemsetD8 (void *hashcat_ctx, hipDeviceptr_t dstDevice, unsigned char uc, size_t N); -int hc_hipMemcpyDtoDAsync (void *hashcat_ctx, hipDeviceptr_t dstDevice, hipDeviceptr_t srcDevice, size_t ByteCount, hipStream_t hStream); -int hc_hipMemcpyDtoHAsync (void *hashcat_ctx, void *dstHost, hipDeviceptr_t srcDevice, size_t ByteCount, hipStream_t hStream); -int hc_hipMemcpyHtoDAsync (void *hashcat_ctx, hipDeviceptr_t dstDevice, const void *srcHost, size_t ByteCount, hipStream_t hStream); -int hc_hipMemsetD32Async (void *hashcat_ctx, hipDeviceptr_t dstDevice, unsigned int ui, size_t N, hipStream_t hStream); -int hc_hipMemsetD8Async (void *hashcat_ctx, hipDeviceptr_t dstDevice, unsigned char uc, size_t N, hipStream_t hStream); -int hc_hipModuleGetFunction (void *hashcat_ctx, hipFunction_t *hfunc, hipModule_t hmod, const char *name); -int hc_hipModuleGetGlobal (void *hashcat_ctx, hipDeviceptr_t *dptr, size_t *bytes, hipModule_t hmod, const char *name); -int hc_hipModuleLoadDataEx (void *hashcat_ctx, hipModule_t *module, const void *image, unsigned int numOptions, hipJitOption *options, void **optionValues); -int hc_hipModuleUnload (void *hashcat_ctx, hipModule_t hmod); -int hc_hipRuntimeGetVersion (void *hashcat_ctx, int *runtimeVersion); -int hc_hipStreamCreate (void *hashcat_ctx, hipStream_t *phStream, unsigned int Flags); -int hc_hipStreamDestroy (void *hashcat_ctx, hipStream_t hStream); -int hc_hipStreamSynchronize (void *hashcat_ctx, hipStream_t hStream); -int hc_hipGetDeviceProperties (void *hashcat_ctx, hipDeviceProp_t *prop, hipDevice_t dev); + // deprecated +int hc_hipCtxCreate (void *hashcat_ctx, hipCtx_t *pctx, unsigned int flags, hipDevice_t dev); + // deprecated +int hc_hipCtxDestroy (void *hashcat_ctx, hipCtx_t ctx); + // deprecated +int hc_hipCtxPopCurrent (void *hashcat_ctx, hipCtx_t *pctx); + // deprecated +int hc_hipCtxPushCurrent (void *hashcat_ctx, hipCtx_t ctx); + // deprecated +int hc_hipCtxSetCurrent (void *hashcat_ctx, hipCtx_t ctx); + // deprecated +int hc_hipCtxSynchronize (void *hashcat_ctx); +int hc_hipDeviceGet (void *hashcat_ctx, hipDevice_t *device, int ordinal); +int hc_hipDeviceGetAttribute (void *hashcat_ctx, int *pi, hipDeviceAttribute_t attrib, hipDevice_t dev); 
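// Editorial sketch (not part of the patch): how callers are expected to use the
// wrappers declared here once the deprecated hipCtx* path is gone. Instead of
// bracketing device work with hc_hipCtxPushCurrent()/hc_hipCtxPopCurrent(), a
// worker thread binds its device once via hc_hipSetDevice() and then issues work
// on the per-device stream, as done in src/backend.c in this diff. The helper
// name below is hypothetical, and it assumes hashcat's types (hc_device_param_t,
// pw_idx_t, u64) are in scope as they are in src/backend.c.
static int example_read_pw_idx (void *hashcat_ctx, hc_device_param_t *device_param, pw_idx_t *dest, const u64 gidd)
{
  // bind the calling thread to the device (replaces the old context push/pop)
  if (hc_hipSetDevice (hashcat_ctx, device_param->hip_device) == -1) return -1;

  // copy one pw_idx_t from device memory, then drain the per-device stream
  if (hc_hipMemcpyDtoH (hashcat_ctx, dest, device_param->hip_d_pws_idx + (gidd * sizeof (pw_idx_t)), sizeof (pw_idx_t)) == -1) return -1;

  if (hc_hipStreamSynchronize (hashcat_ctx, device_param->hip_stream) == -1) return -1;

  return 0;
}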
+int hc_hipDeviceGetCount (void *hashcat_ctx, int *count); +int hc_hipDeviceGetName (void *hashcat_ctx, char *name, int len, hipDevice_t dev); +int hc_hipDeviceTotalMem (void *hashcat_ctx, size_t *bytes, hipDevice_t dev); +int hc_hipDriverGetVersion (void *hashcat_ctx, int *driverVersion); +int hc_hipEventCreate (void *hashcat_ctx, hipEvent_t *phEvent); +int hc_hipEventCreateWithFlags (void *hashcat_ctx, hipEvent_t *phEvent, unsigned int Flags); +int hc_hipEventDestroy (void *hashcat_ctx, hipEvent_t hEvent); +int hc_hipEventElapsedTime (void *hashcat_ctx, float *pMilliseconds, hipEvent_t hStart, hipEvent_t hEnd); +int hc_hipEventQuery (void *hashcat_ctx, hipEvent_t hEvent); +int hc_hipEventRecord (void *hashcat_ctx, hipEvent_t hEvent, hipStream_t hStream); +int hc_hipEventSynchronize (void *hashcat_ctx, hipEvent_t hEvent); +int hc_hipFuncGetAttribute (void *hashcat_ctx, int *pi, hipFunction_attribute attrib, hipFunction_t hfunc); +int hc_hipInit (void *hashcat_ctx, unsigned int Flags); +int hc_hipLaunchKernel (void *hashcat_ctx, hipFunction_t f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, hipStream_t hStream, void **kernelParams, void **extra); +int hc_hipMemAlloc (void *hashcat_ctx, hipDeviceptr_t *dptr, size_t bytesize); +int hc_hipMemFree (void *hashcat_ctx, hipDeviceptr_t dptr); +int hc_hipMemGetInfo (void *hashcat_ctx, size_t *free, size_t *total); +int hc_hipMemcpyDtoD (void *hashcat_ctx, hipDeviceptr_t dstDevice, hipDeviceptr_t srcDevice, size_t ByteCount); +int hc_hipMemcpyDtoH (void *hashcat_ctx, void *dstHost, hipDeviceptr_t srcDevice, size_t ByteCount); +int hc_hipMemcpyHtoD (void *hashcat_ctx, hipDeviceptr_t dstDevice, const void *srcHost, size_t ByteCount); +int hc_hipMemsetD32 (void *hashcat_ctx, hipDeviceptr_t dstDevice, unsigned int ui, size_t N); +int hc_hipMemsetD8 (void *hashcat_ctx, hipDeviceptr_t dstDevice, unsigned char uc, size_t N); +int hc_hipMemcpyDtoDAsync (void *hashcat_ctx, hipDeviceptr_t dstDevice, hipDeviceptr_t srcDevice, size_t ByteCount, hipStream_t hStream); +int hc_hipMemcpyDtoHAsync (void *hashcat_ctx, void *dstHost, hipDeviceptr_t srcDevice, size_t ByteCount, hipStream_t hStream); +int hc_hipMemcpyHtoDAsync (void *hashcat_ctx, hipDeviceptr_t dstDevice, const void *srcHost, size_t ByteCount, hipStream_t hStream); +int hc_hipMemsetD32Async (void *hashcat_ctx, hipDeviceptr_t dstDevice, unsigned int ui, size_t N, hipStream_t hStream); +int hc_hipMemsetD8Async (void *hashcat_ctx, hipDeviceptr_t dstDevice, unsigned char uc, size_t N, hipStream_t hStream); +int hc_hipModuleGetFunction (void *hashcat_ctx, hipFunction_t *hfunc, hipModule_t hmod, const char *name); +int hc_hipModuleGetGlobal (void *hashcat_ctx, hipDeviceptr_t *dptr, size_t *bytes, hipModule_t hmod, const char *name); +int hc_hipModuleLoadDataEx (void *hashcat_ctx, hipModule_t *module, const void *image, unsigned int numOptions, hipJitOption *options, void **optionValues); +int hc_hipModuleUnload (void *hashcat_ctx, hipModule_t hmod); +int hc_hipRuntimeGetVersion (void *hashcat_ctx, int *runtimeVersion); +int hc_hipSetDevice (void *hashcat_ctx, hipDevice_t dev); +int hc_hipSetDeviceFlags (void *hashcat_ctx, unsigned int flags); +int hc_hipStreamCreate (void *hashcat_ctx, hipStream_t *phStream); +int hc_hipStreamCreateWithFlags (void *hashcat_ctx, hipStream_t *phStream, unsigned int flags); +int hc_hipStreamDestroy (void *hashcat_ctx, hipStream_t hStream); +int 
hc_hipStreamSynchronize (void *hashcat_ctx, hipStream_t hStream); +int hc_hipGetDeviceProperties (void *hashcat_ctx, hipDeviceProp_t *prop, hipDevice_t dev); int hc_hipModuleOccupancyMaxActiveBlocksPerMultiprocessor (void *hashcat_ctx, int *numBlocks, hipFunction_t f, int blockSize, size_t dynSharedMemPerBlk); #endif // HC_EXT_HIP_H diff --git a/src/autotune.c b/src/autotune.c index 7938fb259..e8f704599 100644 --- a/src/autotune.c +++ b/src/autotune.c @@ -679,7 +679,7 @@ HC_API_CALL void *thread_autotune (void *p) if (device_param->is_hip == true) { - if (hc_hipCtxPushCurrent (hashcat_ctx, device_param->hip_context) == -1) return NULL; + if (hc_hipSetDevice (hashcat_ctx, device_param->hip_device) == -1) return NULL; } // check for autotune failure @@ -695,11 +695,6 @@ HC_API_CALL void *thread_autotune (void *p) if (hc_cuCtxPopCurrent (hashcat_ctx, &device_param->cuda_context) == -1) return NULL; } - if (device_param->is_hip == true) - { - if (hc_hipCtxPopCurrent (hashcat_ctx, &device_param->hip_context) == -1) return NULL; - } - return NULL; } diff --git a/src/backend.c b/src/backend.c index 6e8fe4aad..6e84eed34 100644 --- a/src/backend.c +++ b/src/backend.c @@ -993,7 +993,7 @@ int gidd_to_pw_t (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, c if (device_param->is_hip == true) { - if (hc_hipCtxPushCurrent (hashcat_ctx, device_param->hip_context) == -1) return -1; + if (hc_hipSetDevice (hashcat_ctx, device_param->hip_device) == -1) return -1; if (hc_hipMemcpyDtoH (hashcat_ctx, &pw_idx, device_param->hip_d_pws_idx + (gidd * sizeof (pw_idx_t)), sizeof (pw_idx_t)) == -1) return -1; @@ -1059,11 +1059,6 @@ int gidd_to_pw_t (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, c if (hc_cuCtxPopCurrent (hashcat_ctx, &device_param->cuda_context) == -1) return -1; } - if (device_param->is_hip == true) - { - if (hc_hipCtxPopCurrent (hashcat_ctx, &device_param->hip_context) == -1) return -1; - } - return 0; } @@ -1082,13 +1077,11 @@ int copy_pws_idx (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, u if (device_param->is_hip == true) { - if (hc_hipCtxPushCurrent (hashcat_ctx, device_param->hip_context) == -1) return -1; + if (hc_hipSetDevice (hashcat_ctx, device_param->hip_device) == -1) return -1; if (hc_hipMemcpyDtoH (hashcat_ctx, dest, device_param->hip_d_pws_idx + (gidd * sizeof (pw_idx_t)), (cnt * sizeof (pw_idx_t))) == -1) return -1; if (hc_hipStreamSynchronize (hashcat_ctx, device_param->hip_stream) == -1) return -1; - - if (hc_hipCtxPopCurrent (hashcat_ctx, &device_param->hip_context) == -1) return -1; } #if defined (__APPLE__) @@ -1122,13 +1115,11 @@ int copy_pws_comp (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, if (device_param->is_hip == true) { - if (hc_hipCtxPushCurrent (hashcat_ctx, device_param->hip_context) == -1) return -1; + if (hc_hipSetDevice (hashcat_ctx, device_param->hip_device) == -1) return -1; if (hc_hipMemcpyDtoH (hashcat_ctx, dest, device_param->hip_d_pws_comp_buf + (off * sizeof (u32)), cnt * sizeof (u32)) == -1) return -1; if (hc_hipStreamSynchronize (hashcat_ctx, device_param->hip_stream) == -1) return -1; - - if (hc_hipCtxPopCurrent (hashcat_ctx, &device_param->hip_context) == -1) return -1; } #if defined (__APPLE__) @@ -5937,24 +5928,24 @@ static void backend_ctx_devices_init_cuda (hashcat_ctx_t *hashcat_ctx, int *virt device_param->has_prmt = (sm >= 20) ? true : false; device_param->has_shfw = (sm >= 70) ? 
true : false; + // one-time init cuda context + + if (hc_cuCtxCreate (hashcat_ctx, &device_param->cuda_context, CU_CTX_SCHED_BLOCKING_SYNC, device_param->cuda_device) == -1) + { + device_param->skipped = true; + + continue; + } + + if (hc_cuCtxPushCurrent (hashcat_ctx, device_param->cuda_context) == -1) + { + device_param->skipped = true; + + continue; + } + // device_available_mem - CUcontext cuda_context; - - if (hc_cuCtxCreate (hashcat_ctx, &cuda_context, CU_CTX_SCHED_BLOCKING_SYNC, device_param->cuda_device) == -1) - { - device_param->skipped = true; - - continue; - } - - if (hc_cuCtxPushCurrent (hashcat_ctx, cuda_context) == -1) - { - device_param->skipped = true; - - continue; - } - size_t free = 0; size_t total = 0; @@ -5967,14 +5958,7 @@ static void backend_ctx_devices_init_cuda (hashcat_ctx_t *hashcat_ctx, int *virt device_param->device_available_mem = ((u64) free * (100 - user_options->backend_devices_keepfree)) / 100; - if (hc_cuCtxPopCurrent (hashcat_ctx, &cuda_context) == -1) - { - device_param->skipped = true; - - continue; - } - - if (hc_cuCtxDestroy (hashcat_ctx, cuda_context) == -1) + if (hc_cuCtxPopCurrent (hashcat_ctx, &device_param->cuda_context) == -1) { device_param->skipped = true; @@ -6440,24 +6424,24 @@ static void backend_ctx_devices_init_hip (hashcat_ctx_t *hashcat_ctx, int *virth device_param->has_prmt = false; device_param->has_shfw = true; // always reports false : prop.arch.hasFunnelShift; + // one-time init hip context + + if (hc_hipSetDeviceFlags (hashcat_ctx, hipDeviceScheduleBlockingSync) == -1) + { + device_param->skipped = true; + + continue; + } + + if (hc_hipSetDevice (hashcat_ctx, device_param->hip_device) == -1) + { + device_param->skipped = true; + + continue; + } + // device_available_mem - hipCtx_t hip_context; - - if (hc_hipCtxCreate (hashcat_ctx, &hip_context, hipDeviceScheduleBlockingSync, device_param->hip_device) == -1) - { - device_param->skipped = true; - - continue; - } - - if (hc_hipCtxPushCurrent (hashcat_ctx, hip_context) == -1) - { - device_param->skipped = true; - - continue; - } - size_t free = 0; size_t total = 0; @@ -6470,20 +6454,6 @@ static void backend_ctx_devices_init_hip (hashcat_ctx_t *hashcat_ctx, int *virth device_param->device_available_mem = ((u64) free * (100 - user_options->backend_devices_keepfree)) / 100; - if (hc_hipCtxPopCurrent (hashcat_ctx, &hip_context) == -1) - { - device_param->skipped = true; - - continue; - } - - if (hc_hipCtxDestroy (hashcat_ctx, hip_context) == -1) - { - device_param->skipped = true; - - continue; - } - #if defined (__linux__) if (strchr (folder_config->cpath_real, ' ') != NULL) { @@ -8623,13 +8593,9 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) if (device_param->skipped == true) continue; } - /** - * create command-queue - */ + // one-time init metal command-queue - mtl_command_queue command_queue; - - if (hc_mtlCreateCommandQueue (hashcat_ctx, device_param->metal_device, &command_queue) == -1) + if (hc_mtlCreateCommandQueue (hashcat_ctx, device_param->metal_device, &device_param->metal_command_queue) == -1) { device_param->skipped = true; @@ -8678,11 +8644,11 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) u8 tmp_host[8] = { 1, 2, 3, 4, 5, 6, 7, 8 }; - if (hc_mtlMemcpyHtoD (hashcat_ctx, command_queue, tmp_device[c], 0, tmp_host, sizeof (tmp_host)) == -1) break; - if (hc_mtlMemcpyDtoH (hashcat_ctx, command_queue, tmp_host, tmp_device[c], 0, sizeof (tmp_host)) == -1) break; + if (hc_mtlMemcpyHtoD (hashcat_ctx, 
device_param->metal_command_queue, tmp_device[c], 0, tmp_host, sizeof (tmp_host)) == -1) break; + if (hc_mtlMemcpyDtoH (hashcat_ctx, device_param->metal_command_queue, tmp_host, tmp_device[c], 0, sizeof (tmp_host)) == -1) break; - if (hc_mtlMemcpyHtoD (hashcat_ctx, command_queue, tmp_device[c], MAX_ALLOC_CHECKS_SIZE - sizeof (tmp_host), tmp_host, sizeof (tmp_host)) == -1) break; - if (hc_mtlMemcpyDtoH (hashcat_ctx, command_queue, tmp_host, tmp_device[c], MAX_ALLOC_CHECKS_SIZE - sizeof (tmp_host), sizeof (tmp_host)) == -1) break; + if (hc_mtlMemcpyHtoD (hashcat_ctx, device_param->metal_command_queue, tmp_device[c], MAX_ALLOC_CHECKS_SIZE - sizeof (tmp_host), tmp_host, sizeof (tmp_host)) == -1) break; + if (hc_mtlMemcpyDtoH (hashcat_ctx, device_param->metal_command_queue, tmp_host, tmp_device[c], MAX_ALLOC_CHECKS_SIZE - sizeof (tmp_host), sizeof (tmp_host)) == -1) break; } device_param->device_available_mem = MAX_ALLOC_CHECKS_SIZE; @@ -8707,8 +8673,6 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) hcfree (tmp_device); } - hc_mtlReleaseCommandQueue (hashcat_ctx, command_queue); - if (device_param->device_host_unified_memory == 1) { // so, we actually have only half the memory because we need the same buffers on host side @@ -8734,11 +8698,7 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) if (device_param->skipped == true) continue; } - /** - * create context for each device - */ - - cl_context context; + // one-time init opencl context /* cl_context_properties properties[3]; @@ -8747,10 +8707,10 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) properties[1] = (cl_context_properties) device_param->opencl_platform; properties[2] = 0; - CL_rc = hc_clCreateContext (hashcat_ctx, properties, 1, &device_param->opencl_device, NULL, NULL, &context); + CL_rc = hc_clCreateContext (hashcat_ctx, properties, 1, &device_param->opencl_device, NULL, NULL, &device_param->opencl_context); */ - if (hc_clCreateContext (hashcat_ctx, NULL, 1, &device_param->opencl_device, NULL, NULL, &context) == -1) + if (hc_clCreateContext (hashcat_ctx, NULL, 1, &device_param->opencl_device, NULL, NULL, &device_param->opencl_context) == -1) { device_param->skipped = true; @@ -8760,13 +8720,9 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) continue; } - /** - * create command-queue - */ + // one-time init open command-queue - cl_command_queue command_queue; - - if (hc_clCreateCommandQueue (hashcat_ctx, context, device_param->opencl_device, 0, &command_queue) == -1) + if (hc_clCreateCommandQueue (hashcat_ctx, device_param->opencl_context, device_param->opencl_device, CL_QUEUE_PROFILING_ENABLE, &device_param->opencl_command_queue) == -1) { device_param->skipped = true; @@ -8781,17 +8737,17 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) && (device_param->opencl_platform_vendor_id == VENDOR_ID_AMD)) { #define RUN_INSTRUCTION_CHECKS() \ - device_param->has_vadd = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADD_U32 %0, vcc, 0, 0;\" : \"=v\"(r1)); }"); \ - device_param->has_vaddc = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADDC_U32 %0, vcc, 0, 0, vcc;\" : \"=v\"(r1)); }"); \ - device_param->has_vadd_co = opencl_test_instruction 
(hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADD_CO_U32 %0, vcc, 0, 0;\" : \"=v\"(r1)); }"); \ - device_param->has_vaddc_co = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADDC_CO_U32 %0, vcc, 0, 0, vcc;\" : \"=v\"(r1)); }"); \ - device_param->has_vsub = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_SUB_U32 %0, vcc, 0, 0;\" : \"=v\"(r1)); }"); \ - device_param->has_vsubb = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_SUBB_U32 %0, vcc, 0, 0, vcc;\" : \"=v\"(r1)); }"); \ - device_param->has_vsub_co = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_SUB_CO_U32 %0, vcc, 0, 0;\" : \"=v\"(r1)); }"); \ - device_param->has_vsubb_co = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_SUBB_CO_U32 %0, vcc, 0, 0, vcc;\" : \"=v\"(r1)); }"); \ - device_param->has_vadd3 = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADD3_U32 %0, 0, 0, 0;\" : \"=v\"(r1)); }"); \ - device_param->has_vbfe = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_BFE_U32 %0, 0, 0, 0;\" : \"=v\"(r1)); }"); \ - device_param->has_vperm = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_PERM_B32 %0, 0, 0, 0;\" : \"=v\"(r1)); }"); \ + device_param->has_vadd = opencl_test_instruction (hashcat_ctx, device_param->opencl_context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADD_U32 %0, vcc, 0, 0;\" : \"=v\"(r1)); }"); \ + device_param->has_vaddc = opencl_test_instruction (hashcat_ctx, device_param->opencl_context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADDC_U32 %0, vcc, 0, 0, vcc;\" : \"=v\"(r1)); }"); \ + device_param->has_vadd_co = opencl_test_instruction (hashcat_ctx, device_param->opencl_context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADD_CO_U32 %0, vcc, 0, 0;\" : \"=v\"(r1)); }"); \ + device_param->has_vaddc_co = opencl_test_instruction (hashcat_ctx, device_param->opencl_context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADDC_CO_U32 %0, vcc, 0, 0, vcc;\" : \"=v\"(r1)); }"); \ + device_param->has_vsub = opencl_test_instruction (hashcat_ctx, device_param->opencl_context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_SUB_U32 %0, vcc, 0, 0;\" : \"=v\"(r1)); }"); \ + device_param->has_vsubb = opencl_test_instruction (hashcat_ctx, device_param->opencl_context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_SUBB_U32 %0, vcc, 0, 0, vcc;\" : \"=v\"(r1)); }"); \ + device_param->has_vsub_co = opencl_test_instruction (hashcat_ctx, device_param->opencl_context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_SUB_CO_U32 %0, vcc, 0, 0;\" : \"=v\"(r1)); }"); \ + 
device_param->has_vsubb_co = opencl_test_instruction (hashcat_ctx, device_param->opencl_context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_SUBB_CO_U32 %0, vcc, 0, 0, vcc;\" : \"=v\"(r1)); }"); \ + device_param->has_vadd3 = opencl_test_instruction (hashcat_ctx, device_param->opencl_context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADD3_U32 %0, 0, 0, 0;\" : \"=v\"(r1)); }"); \ + device_param->has_vbfe = opencl_test_instruction (hashcat_ctx, device_param->opencl_context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_BFE_U32 %0, 0, 0, 0;\" : \"=v\"(r1)); }"); \ + device_param->has_vperm = opencl_test_instruction (hashcat_ctx, device_param->opencl_context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_PERM_B32 %0, 0, 0, 0;\" : \"=v\"(r1)); }"); \ if (backend_devices_idx > 0) { @@ -8979,7 +8935,7 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl; - tmp_device[c] = ocl->clCreateBuffer (context, CL_MEM_READ_WRITE, MAX_ALLOC_CHECKS_SIZE, NULL, &CL_err); + tmp_device[c] = ocl->clCreateBuffer (device_param->opencl_context, CL_MEM_READ_WRITE, MAX_ALLOC_CHECKS_SIZE, NULL, &CL_err); if (CL_err != CL_SUCCESS) { @@ -8992,11 +8948,11 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) u8 tmp_host[8]; - if (ocl->clEnqueueReadBuffer (command_queue, tmp_device[c], CL_TRUE, 0, sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break; - if (ocl->clEnqueueWriteBuffer (command_queue, tmp_device[c], CL_TRUE, 0, sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break; + if (ocl->clEnqueueReadBuffer (device_param->opencl_command_queue, tmp_device[c], CL_TRUE, 0, sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break; + if (ocl->clEnqueueWriteBuffer (device_param->opencl_command_queue, tmp_device[c], CL_TRUE, 0, sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break; - if (ocl->clEnqueueReadBuffer (command_queue, tmp_device[c], CL_TRUE, MAX_ALLOC_CHECKS_SIZE - sizeof (tmp_host), sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break; - if (ocl->clEnqueueWriteBuffer (command_queue, tmp_device[c], CL_TRUE, MAX_ALLOC_CHECKS_SIZE - sizeof (tmp_host), sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break; + if (ocl->clEnqueueReadBuffer (device_param->opencl_command_queue, tmp_device[c], CL_TRUE, MAX_ALLOC_CHECKS_SIZE - sizeof (tmp_host), sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break; + if (ocl->clEnqueueWriteBuffer (device_param->opencl_command_queue, tmp_device[c], CL_TRUE, MAX_ALLOC_CHECKS_SIZE - sizeof (tmp_host), sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break; } device_param->device_available_mem = MAX_ALLOC_CHECKS_SIZE; @@ -9008,24 +8964,28 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) // clean up + int r = 0; + for (c = 0; c < MAX_ALLOC_CHECKS_CNT; c++) { if (((c + 1 + 1) * MAX_ALLOC_CHECKS_SIZE) >= device_param->device_global_mem) break; if (tmp_device[c] != NULL) { - if (hc_clReleaseMemObject (hashcat_ctx, tmp_device[c]) == -1) return -1; + if (hc_clReleaseMemObject (hashcat_ctx, tmp_device[c]) == -1) r = -1; } } hcfree (tmp_device); + + if (r == -1) + { + // return -1 here is blocking, to be better evaluated + //return -1; + } } } - hc_clReleaseCommandQueue (hashcat_ctx, command_queue); - - 
hc_clReleaseContext (hashcat_ctx, context); - if (device_param->device_host_unified_memory == 1) { // so, we actually have only half the memory because we need the same buffers on host side @@ -9065,23 +9025,61 @@ void backend_ctx_devices_destroy (hashcat_ctx_t *hashcat_ctx) hcfree (backend_ctx->opencl_platforms_version[opencl_platforms_idx]); } + // one-time release context/command-queue from all runtimes + for (int backend_devices_idx = 0; backend_devices_idx < backend_ctx->backend_devices_cnt; backend_devices_idx++) { hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx]; hcfree (device_param->device_name); + if (device_param->is_cuda == true) + { + if (device_param->cuda_context) + { + hc_cuCtxDestroy (hashcat_ctx, device_param->cuda_context); + + device_param->cuda_context = NULL; + } + } + + if (device_param->is_hip == true) + { + hcfree (device_param->gcnArchName); + } + + #if defined (__APPLE__) + if (device_param->is_metal == true) + { + if (device_param->metal_command_queue) + { + hc_mtlReleaseCommandQueue (hashcat_ctx, device_param->metal_command_queue); + + device_param->metal_command_queue = NULL; + } + } + #endif + if (device_param->is_opencl == true) { hcfree (device_param->opencl_driver_version); hcfree (device_param->opencl_device_version); hcfree (device_param->opencl_device_c_version); hcfree (device_param->opencl_device_vendor); - } - if (device_param->is_hip == true) - { - hcfree (device_param->gcnArchName); + if (device_param->opencl_command_queue) + { + hc_clReleaseCommandQueue (hashcat_ctx, device_param->opencl_command_queue); + + device_param->opencl_command_queue = NULL; + } + + if (device_param->opencl_context) + { + hc_clReleaseContext (hashcat_ctx, device_param->opencl_context); + + device_param->opencl_context = NULL; + } } } @@ -10730,93 +10728,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) } #endif - /** - * create context for each device - */ - - if (device_param->is_cuda == true) - { - if (hc_cuCtxCreate (hashcat_ctx, &device_param->cuda_context, CU_CTX_SCHED_BLOCKING_SYNC, device_param->cuda_device) == -1) - { - device_param->skipped = true; - - continue; - } - - if (hc_cuCtxPushCurrent (hashcat_ctx, device_param->cuda_context) == -1) - { - device_param->skipped = true; - - continue; - } - } - - if (device_param->is_hip == true) - { - if (hc_hipCtxCreate (hashcat_ctx, &device_param->hip_context, hipDeviceScheduleBlockingSync, device_param->hip_device) == -1) - { - device_param->skipped = true; - - continue; - } - - if (hc_hipCtxPushCurrent (hashcat_ctx, device_param->hip_context) == -1) - { - device_param->skipped = true; - - continue; - } - } - - #if defined (__APPLE__) - if (device_param->is_metal == true) - { - /** - * create command-queue - */ - - if (hc_mtlCreateCommandQueue (hashcat_ctx, device_param->metal_device, &device_param->metal_command_queue) == -1) - { - device_param->skipped = true; - - continue; - } - } - #endif - - if (device_param->is_opencl == true) - { - /* - cl_context_properties properties[3]; - - properties[0] = CL_CONTEXT_PLATFORM; - properties[1] = (cl_context_properties) device_param->opencl_platform; - properties[2] = 0; - - CL_rc = hc_clCreateContext (hashcat_ctx, properties, 1, &device_param->opencl_device, NULL, NULL, &device_param->opencl_context); - */ - - if (hc_clCreateContext (hashcat_ctx, NULL, 1, &device_param->opencl_device, NULL, NULL, &device_param->opencl_context) == -1) - { - device_param->skipped = true; - - continue; - } - - /** - * create command-queue - */ - - // not 
supported with NV - // device_param->opencl_command_queue = hc_clCreateCommandQueueWithProperties (hashcat_ctx, device_param->opencl_device, NULL); - - if (hc_clCreateCommandQueue (hashcat_ctx, device_param->opencl_context, device_param->opencl_device, CL_QUEUE_PROFILING_ENABLE, &device_param->opencl_command_queue) == -1) - { - device_param->skipped = true; - - continue; - } - } + // re-using context/command-queue, there is no need to re-initialize them /** * create stream for CUDA devices @@ -10824,6 +10736,13 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (device_param->is_cuda == true) { + if (hc_cuCtxPushCurrent (hashcat_ctx, device_param->cuda_context) == -1) + { + device_param->skipped = true; + + continue; + } + if (hc_cuStreamCreate (hashcat_ctx, &device_param->cuda_stream, CU_STREAM_DEFAULT) == -1) { device_param->skipped = true; @@ -10838,7 +10757,14 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (device_param->is_hip == true) { - if (hc_hipStreamCreate (hashcat_ctx, &device_param->hip_stream, hipStreamDefault) == -1) + if (hc_hipSetDevice (hashcat_ctx, device_param->hip_device) == -1) + { + device_param->skipped = true; + + continue; + } + + if (hc_hipStreamCreateWithFlags (hashcat_ctx, &device_param->hip_stream, hipStreamDefault) == -1) { device_param->skipped = true; @@ -10880,21 +10806,21 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (device_param->is_hip == true) { - if (hc_hipEventCreate (hashcat_ctx, &device_param->hip_event1, hipEventBlockingSync) == -1) + if (hc_hipEventCreateWithFlags (hashcat_ctx, &device_param->hip_event1, hipEventBlockingSync) == -1) { device_param->skipped = true; continue; } - if (hc_hipEventCreate (hashcat_ctx, &device_param->hip_event2, hipEventBlockingSync) == -1) + if (hc_hipEventCreateWithFlags (hashcat_ctx, &device_param->hip_event2, hipEventBlockingSync) == -1) { device_param->skipped = true; continue; } - if (hc_hipEventCreate (hashcat_ctx, &device_param->hip_event3, hipEventDisableTiming) == -1) + if (hc_hipEventCreateWithFlags (hashcat_ctx, &device_param->hip_event3, hipEventDisableTiming) == -1) { device_param->skipped = true; @@ -16927,16 +16853,6 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) } } - if (device_param->is_hip == true) - { - if (hc_hipCtxPopCurrent (hashcat_ctx, &device_param->hip_context) == -1) - { - device_param->skipped = true; - - continue; - } - } - hardware_power_all += hardware_power_max; EVENT_DATA (EVENT_BACKEND_DEVICE_INIT_POST, &backend_devices_idx, sizeof (int)); @@ -17058,7 +16974,7 @@ void backend_session_destroy (hashcat_ctx_t *hashcat_ctx) if (device_param->cuda_module_amp) hc_cuModuleUnload (hashcat_ctx, device_param->cuda_module_amp); if (device_param->cuda_module_shared) hc_cuModuleUnload (hashcat_ctx, device_param->cuda_module_shared); - if (device_param->cuda_context) hc_cuCtxDestroy (hashcat_ctx, device_param->cuda_context); + //if (device_param->cuda_context) hc_cuCtxDestroy (hashcat_ctx, device_param->cuda_context); device_param->cuda_d_pws_buf = 0; device_param->cuda_d_pws_amp_buf = 0; @@ -17135,7 +17051,7 @@ void backend_session_destroy (hashcat_ctx_t *hashcat_ctx) device_param->cuda_module_amp = NULL; device_param->cuda_module_shared = NULL; - device_param->cuda_context = NULL; + //device_param->cuda_context = NULL; } if (device_param->is_hip == true) @@ -17189,8 +17105,6 @@ void backend_session_destroy (hashcat_ctx_t *hashcat_ctx) if (device_param->hip_module_amp) hc_hipModuleUnload (hashcat_ctx, device_param->hip_module_amp); if 
(device_param->hip_module_shared) hc_hipModuleUnload (hashcat_ctx, device_param->hip_module_shared); - if (device_param->hip_context) hc_hipCtxDestroy (hashcat_ctx, device_param->hip_context); - device_param->hip_d_pws_buf = 0; device_param->hip_d_pws_amp_buf = 0; device_param->hip_d_pws_comp_buf = 0; @@ -17265,8 +17179,6 @@ void backend_session_destroy (hashcat_ctx_t *hashcat_ctx) device_param->hip_module_mp = NULL; device_param->hip_module_amp = NULL; device_param->hip_module_shared = NULL; - - device_param->hip_context = NULL; } #if defined (__APPLE__) @@ -17341,7 +17253,7 @@ void backend_session_destroy (hashcat_ctx_t *hashcat_ctx) if (device_param->metal_library_amp) hc_mtlReleaseLibrary (hashcat_ctx, device_param->metal_library_amp); if (device_param->metal_library_shared) hc_mtlReleaseLibrary (hashcat_ctx, device_param->metal_library_shared); - if (device_param->metal_command_queue) hc_mtlReleaseCommandQueue (hashcat_ctx, device_param->metal_command_queue); + //if (device_param->metal_command_queue) hc_mtlReleaseCommandQueue (hashcat_ctx, device_param->metal_command_queue); //if (device_param->metal_device) hc_mtlReleaseDevice (hashcat_ctx, device_param->metal_device); @@ -17411,7 +17323,7 @@ void backend_session_destroy (hashcat_ctx_t *hashcat_ctx) device_param->metal_library_mp = NULL; device_param->metal_library_amp = NULL; device_param->metal_library_shared = NULL; - device_param->metal_command_queue = NULL; + //device_param->metal_command_queue = NULL; //device_param->metal_device = NULL; } #endif // __APPLE__ @@ -17487,9 +17399,9 @@ void backend_session_destroy (hashcat_ctx_t *hashcat_ctx) if (device_param->opencl_program_amp) hc_clReleaseProgram (hashcat_ctx, device_param->opencl_program_amp); if (device_param->opencl_program_shared) hc_clReleaseProgram (hashcat_ctx, device_param->opencl_program_shared); - if (device_param->opencl_command_queue) hc_clReleaseCommandQueue (hashcat_ctx, device_param->opencl_command_queue); + //if (device_param->opencl_command_queue) hc_clReleaseCommandQueue (hashcat_ctx, device_param->opencl_command_queue); - if (device_param->opencl_context) hc_clReleaseContext (hashcat_ctx, device_param->opencl_context); + //if (device_param->opencl_context) hc_clReleaseContext (hashcat_ctx, device_param->opencl_context); device_param->opencl_d_pws_buf = NULL; device_param->opencl_d_pws_amp_buf = NULL; @@ -17557,8 +17469,8 @@ void backend_session_destroy (hashcat_ctx_t *hashcat_ctx) device_param->opencl_program_mp = NULL; device_param->opencl_program_amp = NULL; device_param->opencl_program_shared = NULL; - device_param->opencl_command_queue = NULL; - device_param->opencl_context = NULL; + //device_param->opencl_command_queue = NULL; + //device_param->opencl_context = NULL; } device_param->h_tmps = NULL; diff --git a/src/dispatch.c b/src/dispatch.c index ad7c9851b..9ac171f31 100644 --- a/src/dispatch.c +++ b/src/dispatch.c @@ -381,7 +381,7 @@ HC_API_CALL void *thread_calc_stdin (void *p) if (device_param->is_hip == true) { - if (hc_hipCtxPushCurrent (hashcat_ctx, device_param->hip_context) == -1) return NULL; + if (hc_hipSetDevice (hashcat_ctx, device_param->hip_device) == -1) return NULL; } if (calc_stdin (hashcat_ctx, device_param) == -1) @@ -396,11 +396,6 @@ HC_API_CALL void *thread_calc_stdin (void *p) if (hc_cuCtxPopCurrent (hashcat_ctx, &device_param->cuda_context) == -1) return NULL; } - if (device_param->is_hip == true) - { - if (hc_hipCtxPopCurrent (hashcat_ctx, &device_param->hip_context) == -1) return NULL; - } - if (bridge_ctx->enabled == true) { 
if (bridge_ctx->thread_term != BRIDGE_DEFAULT) @@ -1685,7 +1680,7 @@ HC_API_CALL void *thread_calc (void *p) if (device_param->is_hip == true) { - if (hc_hipCtxPushCurrent (hashcat_ctx, device_param->hip_context) == -1) return NULL; + if (hc_hipSetDevice (hashcat_ctx, device_param->hip_device) == -1) return NULL; } if (calc (hashcat_ctx, device_param) == -1) @@ -1700,11 +1695,6 @@ HC_API_CALL void *thread_calc (void *p) if (hc_cuCtxPopCurrent (hashcat_ctx, &device_param->cuda_context) == -1) return NULL; } - if (device_param->is_hip == true) - { - if (hc_hipCtxPopCurrent (hashcat_ctx, &device_param->hip_context) == -1) return NULL; - } - if (bridge_ctx->enabled == true) { if (bridge_ctx->thread_term != BRIDGE_DEFAULT) diff --git a/src/ext_hip.c b/src/ext_hip.c index 4a26a7e9f..9e7eb00ab 100644 --- a/src/ext_hip.c +++ b/src/ext_hip.c @@ -115,51 +115,55 @@ int hip_init (void *hashcat_ctx) // a good reference is cuda.h itself // this needs to be verified for each new cuda release - HC_LOAD_FUNC_HIP (hip, hipCtxCreate, hipCtxCreate, HIP_HIPCTXCREATE, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipCtxDestroy, hipCtxDestroy, HIP_HIPCTXDESTROY, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipCtxPopCurrent, hipCtxPopCurrent, HIP_HIPCTXPOPCURRENT, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipCtxPushCurrent, hipCtxPushCurrent, HIP_HIPCTXPUSHCURRENT, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipCtxSetCurrent, hipCtxSetCurrent, HIP_HIPCTXSETCURRENT, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipCtxSynchronize, hipCtxSynchronize, HIP_HIPCTXSYNCHRONIZE, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipDeviceGet, hipDeviceGet, HIP_HIPDEVICEGET, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipDeviceGetAttribute, hipDeviceGetAttribute, HIP_HIPDEVICEGETATTRIBUTE, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipDeviceGetCount, hipGetDeviceCount, HIP_HIPDEVICEGETCOUNT, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipDeviceGetName, hipDeviceGetName, HIP_HIPDEVICEGETNAME, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipDeviceTotalMem, hipDeviceTotalMem, HIP_HIPDEVICETOTALMEM, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipDriverGetVersion, hipDriverGetVersion, HIP_HIPDRIVERGETVERSION, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipEventCreate, hipEventCreateWithFlags, HIP_HIPEVENTCREATE, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipEventDestroy, hipEventDestroy, HIP_HIPEVENTDESTROY, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipEventElapsedTime, hipEventElapsedTime, HIP_HIPEVENTELAPSEDTIME, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipEventRecord, hipEventRecord, HIP_HIPEVENTRECORD, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipEventSynchronize, hipEventSynchronize, HIP_HIPEVENTSYNCHRONIZE, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipFuncGetAttribute, hipFuncGetAttribute, HIP_HIPFUNCGETATTRIBUTE, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipGetErrorName, hipDrvGetErrorName, HIP_HIPGETERRORNAME, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipGetErrorString, hipDrvGetErrorString, HIP_HIPGETERRORSTRING, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipInit, hipInit, HIP_HIPINIT, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipLaunchKernel, hipModuleLaunchKernel, HIP_HIPLAUNCHKERNEL, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipMemAlloc, hipMalloc, HIP_HIPMEMALLOC, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipMemFree, hipFree, HIP_HIPMEMFREE, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipMemGetInfo, hipMemGetInfo, HIP_HIPMEMGETINFO, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipMemcpyDtoD, hipMemcpyDtoD, HIP_HIPMEMCPYDTOD, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipMemcpyDtoH, hipMemcpyDtoH, HIP_HIPMEMCPYDTOH, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipMemcpyHtoD, hipMemcpyHtoD, HIP_HIPMEMCPYHTOD, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipMemsetD32, hipMemsetD32, 
HIP_HIPMEMSETD32, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipMemsetD8, hipMemsetD8, HIP_HIPMEMSETD8, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipMemcpyDtoDAsync, hipMemcpyDtoDAsync, HIP_HIPMEMCPYDTODASYNC, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipMemcpyDtoHAsync, hipMemcpyDtoHAsync, HIP_HIPMEMCPYDTOHASYNC, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipMemcpyHtoDAsync, hipMemcpyHtoDAsync, HIP_HIPMEMCPYHTODASYNC, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipMemsetD32Async, hipMemsetD32Async, HIP_HIPMEMSETD32ASYNC, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipMemsetD8Async, hipMemsetD8Async, HIP_HIPMEMSETD8ASYNC, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipModuleGetFunction, hipModuleGetFunction, HIP_HIPMODULEGETFUNCTION, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipModuleGetGlobal, hipModuleGetGlobal, HIP_HIPMODULEGETGLOBAL, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipModuleLoadDataEx, hipModuleLoadDataEx, HIP_HIPMODULELOADDATAEX, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipModuleUnload, hipModuleUnload, HIP_HIPMODULEUNLOAD, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipRuntimeGetVersion, hipRuntimeGetVersion, HIP_HIPRUNTIMEGETVERSION, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipStreamCreate, hipStreamCreate, HIP_HIPSTREAMCREATE, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipStreamDestroy, hipStreamDestroy, HIP_HIPSTREAMDESTROY, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipStreamSynchronize, hipStreamSynchronize, HIP_HIPSTREAMSYNCHRONIZE, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipGetDeviceProperties, hipGetDevicePropertiesR0600, HIP_HIPGETDEVICEPROPERTIES, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipModuleOccupancyMaxActiveBlocksPerMultiprocessor, hipModuleOccupancyMaxActiveBlocksPerMultiprocessor, HIP_HIPMODULEOCCUPANCYMAXACTIVEBLOCKSPERMULTIPROCESSOR, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipCtxCreate, hipCtxCreate, HIP_HIPCTXCREATE, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipCtxDestroy, hipCtxDestroy, HIP_HIPCTXDESTROY, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipCtxPopCurrent, hipCtxPopCurrent, HIP_HIPCTXPOPCURRENT, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipCtxPushCurrent, hipCtxPushCurrent, HIP_HIPCTXPUSHCURRENT, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipCtxSetCurrent, hipCtxSetCurrent, HIP_HIPCTXSETCURRENT, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipCtxSynchronize, hipCtxSynchronize, HIP_HIPCTXSYNCHRONIZE, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipDeviceGet, hipDeviceGet, HIP_HIPDEVICEGET, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipDeviceGetAttribute, hipDeviceGetAttribute, HIP_HIPDEVICEGETATTRIBUTE, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipDeviceGetCount, hipGetDeviceCount, HIP_HIPDEVICEGETCOUNT, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipDeviceGetName, hipDeviceGetName, HIP_HIPDEVICEGETNAME, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipDeviceTotalMem, hipDeviceTotalMem, HIP_HIPDEVICETOTALMEM, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipDriverGetVersion, hipDriverGetVersion, HIP_HIPDRIVERGETVERSION, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipEventCreate, hipEventCreate, HIP_HIPEVENTCREATE, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipEventCreateWithFlags, hipEventCreateWithFlags, HIP_HIPEVENTCREATEWITHFLAGS, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipEventDestroy, hipEventDestroy, HIP_HIPEVENTDESTROY, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipEventElapsedTime, hipEventElapsedTime, HIP_HIPEVENTELAPSEDTIME, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipEventRecord, hipEventRecord, HIP_HIPEVENTRECORD, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipEventSynchronize, hipEventSynchronize, HIP_HIPEVENTSYNCHRONIZE, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipFuncGetAttribute, hipFuncGetAttribute, HIP_HIPFUNCGETATTRIBUTE, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipGetErrorName, hipDrvGetErrorName, HIP_HIPGETERRORNAME, HIP, 1); + 
HC_LOAD_FUNC_HIP (hip, hipGetErrorString, hipDrvGetErrorString, HIP_HIPGETERRORSTRING, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipInit, hipInit, HIP_HIPINIT, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipLaunchKernel, hipModuleLaunchKernel, HIP_HIPLAUNCHKERNEL, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipMemAlloc, hipMalloc, HIP_HIPMEMALLOC, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipMemFree, hipFree, HIP_HIPMEMFREE, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipMemGetInfo, hipMemGetInfo, HIP_HIPMEMGETINFO, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipMemcpyDtoD, hipMemcpyDtoD, HIP_HIPMEMCPYDTOD, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipMemcpyDtoH, hipMemcpyDtoH, HIP_HIPMEMCPYDTOH, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipMemcpyHtoD, hipMemcpyHtoD, HIP_HIPMEMCPYHTOD, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipMemsetD32, hipMemsetD32, HIP_HIPMEMSETD32, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipMemsetD8, hipMemsetD8, HIP_HIPMEMSETD8, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipMemcpyDtoDAsync, hipMemcpyDtoDAsync, HIP_HIPMEMCPYDTODASYNC, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipMemcpyDtoHAsync, hipMemcpyDtoHAsync, HIP_HIPMEMCPYDTOHASYNC, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipMemcpyHtoDAsync, hipMemcpyHtoDAsync, HIP_HIPMEMCPYHTODASYNC, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipMemsetD32Async, hipMemsetD32Async, HIP_HIPMEMSETD32ASYNC, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipMemsetD8Async, hipMemsetD8Async, HIP_HIPMEMSETD8ASYNC, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipModuleGetFunction, hipModuleGetFunction, HIP_HIPMODULEGETFUNCTION, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipModuleGetGlobal, hipModuleGetGlobal, HIP_HIPMODULEGETGLOBAL, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipModuleLoadDataEx, hipModuleLoadDataEx, HIP_HIPMODULELOADDATAEX, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipModuleUnload, hipModuleUnload, HIP_HIPMODULEUNLOAD, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipRuntimeGetVersion, hipRuntimeGetVersion, HIP_HIPRUNTIMEGETVERSION, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipSetDevice, hipSetDevice, HIP_HIPSETDEVICE, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipSetDeviceFlags, hipSetDeviceFlags, HIP_HIPSETDEVICEFLAGS, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipStreamCreate, hipStreamCreate, HIP_HIPSTREAMCREATE, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipStreamCreateWithFlags, hipStreamCreateWithFlags, HIP_HIPSTREAMCREATEWITHFLAGS, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipStreamDestroy, hipStreamDestroy, HIP_HIPSTREAMDESTROY, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipStreamSynchronize, hipStreamSynchronize, HIP_HIPSTREAMSYNCHRONIZE, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipGetDeviceProperties, hipGetDevicePropertiesR0600, HIP_HIPGETDEVICEPROPERTIES, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipModuleOccupancyMaxActiveBlocksPerMultiprocessor, hipModuleOccupancyMaxActiveBlocksPerMultiprocessor, HIP_HIPMODULEOCCUPANCYMAXACTIVEBLOCKSPERMULTIPROCESSOR, HIP, 1); return 0; } @@ -507,13 +511,13 @@ int hc_hipDriverGetVersion (void *hashcat_ctx, int *driverVersion) return 0; } -int hc_hipEventCreate (void *hashcat_ctx, hipEvent_t *phEvent, unsigned int Flags) +int hc_hipEventCreate (void *hashcat_ctx, hipEvent_t *phEvent) { backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx; HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip; - const hipError_t HIP_err = hip->hipEventCreate (phEvent, Flags); + const hipError_t HIP_err = hip->hipEventCreate (phEvent); if (HIP_err != hipSuccess) { @@ -534,6 +538,33 @@ int hc_hipEventCreate (void *hashcat_ctx, hipEvent_t *phEvent, unsigned int Flag return 0; } +int hc_hipEventCreateWithFlags (void *hashcat_ctx, hipEvent_t *phEvent, unsigned int flags) +{ + backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) 
hashcat_ctx)->backend_ctx; + + HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip; + + const hipError_t HIP_err = hip->hipEventCreateWithFlags (phEvent, flags); + + if (HIP_err != hipSuccess) + { + const char *pStr = NULL; + + if (hip->hipGetErrorString (HIP_err, &pStr) == hipSuccess) + { + event_log_error (hashcat_ctx, "hipEventCreateWithFlags(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "hipEventCreateWithFlags(): %d", HIP_err); + } + + return -1; + } + + return 0; +} + int hc_hipEventDestroy (void *hashcat_ctx, hipEvent_t hEvent) { backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx; @@ -1211,13 +1242,67 @@ int hc_hipRuntimeGetVersion (void *hashcat_ctx, int *runtimeVersion) return 0; } -int hc_hipStreamCreate (void *hashcat_ctx, hipStream_t *phStream, unsigned int Flags) +int hc_hipSetDevice (void *hashcat_ctx, hipDevice_t dev) { backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx; HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip; - const hipError_t HIP_err = hip->hipStreamCreate (phStream, Flags); + const hipError_t HIP_err = hip->hipSetDevice (dev); + + if (HIP_err != hipSuccess) + { + const char *pStr = NULL; + + if (hip->hipGetErrorString (HIP_err, &pStr) == hipSuccess) + { + event_log_error (hashcat_ctx, "hipSetDevice(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "hipSetDevice(): %d", HIP_err); + } + + return -1; + } + + return 0; +} + +int hc_hipSetDeviceFlags (void *hashcat_ctx, unsigned int flags) +{ + backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx; + + HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip; + + const hipError_t HIP_err = hip->hipSetDeviceFlags (flags); + + if (HIP_err != hipSuccess) + { + const char *pStr = NULL; + + if (hip->hipGetErrorString (HIP_err, &pStr) == hipSuccess) + { + event_log_error (hashcat_ctx, "hipSetDeviceFlags(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "hipSetDeviceFlags(): %d", HIP_err); + } + + return -1; + } + + return 0; +} + +int hc_hipStreamCreate (void *hashcat_ctx, hipStream_t *phStream) +{ + backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx; + + HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip; + + const hipError_t HIP_err = hip->hipStreamCreate (phStream); if (HIP_err != hipSuccess) { @@ -1238,6 +1323,33 @@ int hc_hipStreamCreate (void *hashcat_ctx, hipStream_t *phStream, unsigned int F return 0; } +int hc_hipStreamCreateWithFlags (void *hashcat_ctx, hipStream_t *phStream, unsigned int Flags) +{ + backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx; + + HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip; + + const hipError_t HIP_err = hip->hipStreamCreateWithFlags (phStream, Flags); + + if (HIP_err != hipSuccess) + { + const char *pStr = NULL; + + if (hip->hipGetErrorString (HIP_err, &pStr) == hipSuccess) + { + event_log_error (hashcat_ctx, "hipStreamCreateWithFlags(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "hipStreamCreateWithFlags(): %d", HIP_err); + } + + return -1; + } + + return 0; +} + int hc_hipStreamDestroy (void *hashcat_ctx, hipStream_t hStream) { backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx; diff --git a/src/selftest.c b/src/selftest.c index 5e744f88b..6c63a3418 100644 --- a/src/selftest.c +++ b/src/selftest.c @@ -1272,7 +1272,7 @@ HC_API_CALL void *thread_selftest (void *p) if (device_param->is_hip == true) { - if (hc_hipCtxPushCurrent (hashcat_ctx, device_param->hip_context) == -1) return NULL; + if (hc_hipSetDevice (hashcat_ctx, 
device_param->hip_device) == -1) return NULL; } const int rc_selftest = process_selftest (hashcat_ctx, device_param); @@ -1303,8 +1303,6 @@ HC_API_CALL void *thread_selftest (void *p) if (device_param->is_hip == true) { if (hc_hipStreamSynchronize (hashcat_ctx, device_param->hip_stream) == -1) return NULL; - - if (hc_hipCtxPopCurrent (hashcat_ctx, &device_param->hip_context) == -1) return NULL; } if (bridge_ctx->enabled == true)
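Taken together, the HIP-side changes above replace the per-session hipCtxCreate/hipCtxPushCurrent lifecycle with a one-time per-device binding plus explicit *WithFlags object creation. The following is a minimal illustrative sketch, not part of the patch: the helper name is hypothetical, the surrounding device loop and cleanup are omitted, and it assumes the hc_device_param_t fields and the flag constants (hipDeviceScheduleBlockingSync, hipStreamDefault, hipEventBlockingSync, hipEventDisableTiming) used throughout this diff.

static int example_hip_device_setup (void *hashcat_ctx, hc_device_param_t *device_param)
{
  // one-time, at device enumeration (backend_ctx_devices_init_hip): select the
  // blocking-sync scheduling policy and bind the device, replacing the old
  // hipCtxCreate/hipCtxPushCurrent pair
  if (hc_hipSetDeviceFlags (hashcat_ctx, hipDeviceScheduleBlockingSync) == -1) return -1;
  if (hc_hipSetDevice      (hashcat_ctx, device_param->hip_device)      == -1) return -1;

  // per session (backend_session_begin): streams and events are created through
  // the explicit *WithFlags wrappers instead of the removed flag-taking
  // hipStreamCreate/hipEventCreate signatures
  if (hc_hipStreamCreateWithFlags (hashcat_ctx, &device_param->hip_stream, hipStreamDefault)      == -1) return -1;
  if (hc_hipEventCreateWithFlags  (hashcat_ctx, &device_param->hip_event1, hipEventBlockingSync)  == -1) return -1;
  if (hc_hipEventCreateWithFlags  (hashcat_ctx, &device_param->hip_event2, hipEventBlockingSync)  == -1) return -1;
  if (hc_hipEventCreateWithFlags  (hashcat_ctx, &device_param->hip_event3, hipEventDisableTiming) == -1) return -1;

  return 0;
}

With this flow, worker threads (autotune, selftest, dispatch) only call hc_hipSetDevice() before touching the device; the symmetric hipCtxPopCurrent calls disappear, and the reusable per-device objects are released once in backend_ctx_devices_destroy rather than in backend_session_destroy.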