diff --git a/docs/changes.txt b/docs/changes.txt
index 94f7ef326..9c85f01e8 100644
--- a/docs/changes.txt
+++ b/docs/changes.txt
@@ -39,6 +39,7 @@
 - Unit tests: Updated test.sh to set default device-type to CPU with Apple Intel and added -f (--force) option
 - OpenCL Backend: moved functions to ext_OpenCL.c and includes to ext_OpenCL.h
 - HIP Backend: moved functions to ext_hip.c/ext_hiprtc.c and includes to ext_hip.h/ext_hiprtc.h
+- CUDA Backend: moved functions to ext_cuda.c/ext_nvrtc.c and includes to ext_cuda.h/ext_nvrtc.h

 * changes v6.2.4 -> v6.2.5

diff --git a/include/backend.h b/include/backend.h
index 0f1e23133..c7e7de0a1 100644
--- a/include/backend.h
+++ b/include/backend.h
@@ -23,65 +23,34 @@ static const char CL_VENDOR_MESA[] = "Mesa";
 static const char CL_VENDOR_NV[] = "NVIDIA Corporation";
 static const char CL_VENDOR_POCL[] = "The pocl project";

-int cuda_init (hashcat_ctx_t *hashcat_ctx);
-void cuda_close (hashcat_ctx_t *hashcat_ctx);
+int backend_ctx_init (hashcat_ctx_t *hashcat_ctx);
+void backend_ctx_destroy (hashcat_ctx_t *hashcat_ctx);
+
+int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime);
+void backend_ctx_devices_destroy (hashcat_ctx_t *hashcat_ctx);
+void backend_ctx_devices_sync_tuning (hashcat_ctx_t *hashcat_ctx);
+void backend_ctx_devices_update_power (hashcat_ctx_t *hashcat_ctx);
+void backend_ctx_devices_kernel_loops (hashcat_ctx_t *hashcat_ctx);
+
+int backend_session_begin (hashcat_ctx_t *hashcat_ctx);
+void backend_session_destroy (hashcat_ctx_t *hashcat_ctx);
+void backend_session_reset (hashcat_ctx_t *hashcat_ctx);
+int backend_session_update_combinator (hashcat_ctx_t *hashcat_ctx);
+int backend_session_update_mp (hashcat_ctx_t *hashcat_ctx);
+int backend_session_update_mp_rl (hashcat_ctx_t *hashcat_ctx, const u32 css_cnt_l, const u32 css_cnt_r);

-int nvrtc_init (hashcat_ctx_t *hashcat_ctx);
-void nvrtc_close (hashcat_ctx_t *hashcat_ctx);
-
-int hc_nvrtcCreateProgram (hashcat_ctx_t *hashcat_ctx, nvrtcProgram *prog, const char *src, const char *name, int numHeaders, const char * const *headers, const char * const *includeNames);
-int hc_nvrtcDestroyProgram (hashcat_ctx_t *hashcat_ctx, nvrtcProgram *prog);
-int hc_nvrtcCompileProgram (hashcat_ctx_t *hashcat_ctx, nvrtcProgram prog, int numOptions, const char * const *options);
-int hc_nvrtcGetProgramLogSize (hashcat_ctx_t *hashcat_ctx, nvrtcProgram prog, size_t *logSizeRet);
-int hc_nvrtcGetProgramLog (hashcat_ctx_t *hashcat_ctx, nvrtcProgram prog, char *log);
-int hc_nvrtcGetPTXSize (hashcat_ctx_t *hashcat_ctx, nvrtcProgram prog, size_t *ptxSizeRet);
-int hc_nvrtcGetPTX (hashcat_ctx_t *hashcat_ctx, nvrtcProgram prog, char *ptx);
-int hc_nvrtcVersion (hashcat_ctx_t *hashcat_ctx, int *major, int *minor);
-
-int hc_cuCtxCreate (hashcat_ctx_t *hashcat_ctx, CUcontext *pctx, unsigned int flags, CUdevice dev);
-int hc_cuCtxDestroy (hashcat_ctx_t *hashcat_ctx, CUcontext ctx);
-int hc_cuCtxSetCurrent (hashcat_ctx_t *hashcat_ctx, CUcontext ctx);
-int hc_cuCtxSetCacheConfig (hashcat_ctx_t *hashcat_ctx, CUfunc_cache config);
-int hc_cuCtxSynchronize (hashcat_ctx_t *hashcat_ctx);
-int hc_cuDeviceGetAttribute (hashcat_ctx_t *hashcat_ctx, int *pi, CUdevice_attribute attrib, CUdevice dev);
-int hc_cuDeviceGetCount (hashcat_ctx_t *hashcat_ctx, int *count);
-int hc_cuDeviceGet (hashcat_ctx_t *hashcat_ctx, CUdevice *device, int ordinal);
-int hc_cuDeviceGetName (hashcat_ctx_t *hashcat_ctx, char *name, int len, CUdevice dev);
-int hc_cuDeviceTotalMem (hashcat_ctx_t *hashcat_ctx, size_t *bytes, CUdevice dev);
-int hc_cuDriverGetVersion (hashcat_ctx_t *hashcat_ctx, int *driverVersion);
-int hc_cuEventCreate (hashcat_ctx_t *hashcat_ctx, CUevent *phEvent, unsigned int Flags);
-int hc_cuEventDestroy (hashcat_ctx_t *hashcat_ctx, CUevent hEvent);
-int hc_cuEventElapsedTime (hashcat_ctx_t *hashcat_ctx, float *pMilliseconds, CUevent hStart, CUevent hEnd);
-int hc_cuEventQuery (hashcat_ctx_t *hashcat_ctx, CUevent hEvent);
-int hc_cuEventRecord (hashcat_ctx_t *hashcat_ctx, CUevent hEvent, CUstream hStream);
-int hc_cuEventSynchronize (hashcat_ctx_t *hashcat_ctx, CUevent hEvent);
-int hc_cuFuncGetAttribute (hashcat_ctx_t *hashcat_ctx, int *pi, CUfunction_attribute attrib, CUfunction hfunc);
-int hc_cuFuncSetAttribute (hashcat_ctx_t *hashcat_ctx, CUfunction hfunc, CUfunction_attribute attrib, int value);
-int hc_cuInit (hashcat_ctx_t *hashcat_ctx, unsigned int Flags);
-int hc_cuLaunchKernel (hashcat_ctx_t *hashcat_ctx, CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra);
-int hc_cuMemAlloc (hashcat_ctx_t *hashcat_ctx, CUdeviceptr *dptr, size_t bytesize);
-int hc_cuMemcpyDtoDAsync (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
-int hc_cuMemcpyDtoHAsync (hashcat_ctx_t *hashcat_ctx, void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
-int hc_cuMemcpyHtoDAsync (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream);
-int hc_cuMemFree (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dptr);
-int hc_cuMemsetD32Async (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dstDevice, unsigned int ui, size_t N, CUstream hStream);
-int hc_cuMemsetD8Async (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dstDevice, unsigned char uc, size_t N, CUstream hStream);
-int hc_cuModuleGetFunction (hashcat_ctx_t *hashcat_ctx, CUfunction *hfunc, CUmodule hmod, const char *name);
-int hc_cuModuleLoadDataEx (hashcat_ctx_t *hashcat_ctx, CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues);
-int hc_cuModuleUnload (hashcat_ctx_t *hashcat_ctx, CUmodule hmod);
-int hc_cuStreamCreate (hashcat_ctx_t *hashcat_ctx, CUstream *phStream, unsigned int Flags);
-int hc_cuStreamDestroy (hashcat_ctx_t *hashcat_ctx, CUstream hStream);
-int hc_cuStreamSynchronize (hashcat_ctx_t *hashcat_ctx, CUstream hStream);
-int hc_cuCtxPushCurrent (hashcat_ctx_t *hashcat_ctx, CUcontext ctx);
-int hc_cuCtxPopCurrent (hashcat_ctx_t *hashcat_ctx, CUcontext *pctx);
-int hc_cuLinkCreate (hashcat_ctx_t *hashcat_ctx, unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut);
-int hc_cuLinkAddData (hashcat_ctx_t *hashcat_ctx, CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name, unsigned int numOptions, CUjit_option *options, void **optionValues);
-int hc_cuLinkDestroy (hashcat_ctx_t *hashcat_ctx, CUlinkState state);
-int hc_cuLinkComplete (hashcat_ctx_t *hashcat_ctx, CUlinkState state, void **cubinOut, size_t *sizeOut);
+void generate_source_kernel_filename (const bool slow_candidates, const u32 attack_exec, const u32 attack_kern, const u32 kern_type, const u32 opti_type, char *shared_dir, char *source_file);
+void generate_cached_kernel_filename (const bool slow_candidates, const u32 attack_exec, const u32 attack_kern, const u32 kern_type, const u32 opti_type, char *cache_dir, const char *device_name_chksum, char *cached_file);
+void generate_source_kernel_shared_filename (char *shared_dir, char *source_file);
+void generate_cached_kernel_shared_filename (char *cache_dir, const char *device_name_chksum, char *cached_file);
+void generate_source_kernel_mp_filename (const u32 opti_type, const u64 opts_type, char *shared_dir, char *source_file);
+void generate_cached_kernel_mp_filename (const u32 opti_type, const u64 opts_type, char *cache_dir, const char *device_name_chksum, char *cached_file);
+void generate_source_kernel_amp_filename (const u32 attack_kern, char *shared_dir, char *source_file);
+void generate_cached_kernel_amp_filename (const u32 attack_kern, char *cache_dir, const char *device_name_chksum, char *cached_file);

-int gidd_to_pw_t (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u64 gidd, pw_t *pw);
+int gidd_to_pw_t (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u64 gidd, pw_t *pw);

-int choose_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u32 highest_pw_len, const u64 pws_pos, const u64 pws_cnt, const u32 fast_iteration, const u32 salt_pos);
+int choose_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u32 highest_pw_len, const u64 pws_pos, const u64 pws_cnt, const u32 fast_iteration, const u32 salt_pos);

 int run_cuda_kernel_atinit (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, CUdeviceptr buf, const u64 num);
 int run_cuda_kernel_utf8toutf16le (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, CUdeviceptr buf, const u64 num);
@@ -101,38 +70,13 @@ int run_opencl_kernel_memset (hashcat_ctx_t *hashcat_ctx, hc_device_param
 int run_opencl_kernel_memset32 (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, cl_mem buf, const u64 offset, const u32 value, const u64 size);
 int run_opencl_kernel_bzero (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, cl_mem buf, const u64 size);

-int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u32 kern_run, const u64 pws_pos, const u64 num, const u32 event_update, const u32 iteration);
-int run_kernel_mp (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u32 kern_run, const u64 num);
-int run_kernel_tm (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param);
-int run_kernel_amp (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u64 num);
-int run_kernel_decompress (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u64 num);
-int run_copy (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u64 pws_cnt);
-int run_cracker (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u64 pws_pos, const u64 pws_cnt);
-
-void generate_source_kernel_filename (const bool slow_candidates, const u32 attack_exec, const u32 attack_kern, const u32 kern_type, const u32 opti_type, char *shared_dir, char *source_file);
-void generate_cached_kernel_filename (const bool slow_candidates, const u32 attack_exec, const u32 attack_kern, const u32 kern_type, const u32 opti_type, char *cache_dir, const char *device_name_chksum, char *cached_file);
-void generate_source_kernel_shared_filename (char *shared_dir, char *source_file);
-void generate_cached_kernel_shared_filename (char *cache_dir, const char *device_name_chksum, char *cached_file);
-void generate_source_kernel_mp_filename (const u32 opti_type, const u64 opts_type, char *shared_dir, char *source_file);
-void generate_cached_kernel_mp_filename (const u32 opti_type, const u64 opts_type, char *cache_dir, const char *device_name_chksum, char *cached_file);
-void generate_source_kernel_amp_filename (const u32 attack_kern, char *shared_dir, char *source_file);
-void generate_cached_kernel_amp_filename (const u32 attack_kern, char *cache_dir, const char *device_name_chksum, char *cached_file);
-
-int backend_ctx_init (hashcat_ctx_t *hashcat_ctx);
-void backend_ctx_destroy (hashcat_ctx_t *hashcat_ctx);
-
-int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime);
-void backend_ctx_devices_destroy (hashcat_ctx_t *hashcat_ctx);
-void backend_ctx_devices_sync_tuning (hashcat_ctx_t *hashcat_ctx);
-void backend_ctx_devices_update_power (hashcat_ctx_t *hashcat_ctx);
-void backend_ctx_devices_kernel_loops (hashcat_ctx_t *hashcat_ctx);
-
-int backend_session_begin (hashcat_ctx_t *hashcat_ctx);
-void backend_session_destroy (hashcat_ctx_t *hashcat_ctx);
-void backend_session_reset (hashcat_ctx_t *hashcat_ctx);
-int backend_session_update_combinator (hashcat_ctx_t *hashcat_ctx);
-int backend_session_update_mp (hashcat_ctx_t *hashcat_ctx);
-int backend_session_update_mp_rl (hashcat_ctx_t *hashcat_ctx, const u32 css_cnt_l, const u32 css_cnt_r);
+int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u32 kern_run, const u64 pws_pos, const u64 num, const u32 event_update, const u32 iteration);
+int run_kernel_mp (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u32 kern_run, const u64 num);
+int run_kernel_tm (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param);
+int run_kernel_amp (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u64 num);
+int run_kernel_decompress (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u64 num);
+int run_copy (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u64 pws_cnt);
+int run_cracker (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u64 pws_pos, const u64 pws_cnt);

 void *hook12_thread (void *p);
 void *hook23_thread (void *p);
diff --git a/include/ext_cuda.h b/include/ext_cuda.h
index 11ed86048..a2d1736f0 100644
--- a/include/ext_cuda.h
+++ b/include/ext_cuda.h
@@ -34,502 +34,504 @@ typedef struct CUmod_st *CUmodule; /**< CUDA module */
 typedef struct CUstream_st *CUstream; /**< CUDA stream */
 typedef struct CUlinkState_st *CUlinkState;

-typedef enum cudaError_enum {
- /**
- * The API call returned with no errors. In the case of query calls, this
- * also means that the operation being queried is complete (see
- * ::cuEventQuery() and ::cuStreamQuery()).
- */
- CUDA_SUCCESS = 0,
-
- /**
- * This indicates that one or more of the parameters passed to the API call
- * is not within an acceptable range of values.
- */
- CUDA_ERROR_INVALID_VALUE = 1,
-
- /**
- * The API call failed because it was unable to allocate enough memory to
- * perform the requested operation.
- */
- CUDA_ERROR_OUT_OF_MEMORY = 2,
-
- /**
- * This indicates that the CUDA driver has not been initialized with
- * ::cuInit() or that initialization has failed.
- */
- CUDA_ERROR_NOT_INITIALIZED = 3,
-
- /**
- * This indicates that the CUDA driver is in the process of shutting down.
- */
- CUDA_ERROR_DEINITIALIZED = 4,
-
- /**
- * This indicates profiler is not initialized for this run. This can
- * happen when the application is running with external profiling tools
- * like visual profiler.
- */ - CUDA_ERROR_PROFILER_DISABLED = 5, - - /** - * \deprecated - * This error return is deprecated as of CUDA 5.0. It is no longer an error - * to attempt to enable/disable the profiling via ::cuProfilerStart or - * ::cuProfilerStop without initialization. - */ - CUDA_ERROR_PROFILER_NOT_INITIALIZED = 6, - - /** - * \deprecated - * This error return is deprecated as of CUDA 5.0. It is no longer an error - * to call cuProfilerStart() when profiling is already enabled. - */ - CUDA_ERROR_PROFILER_ALREADY_STARTED = 7, - - /** - * \deprecated - * This error return is deprecated as of CUDA 5.0. It is no longer an error - * to call cuProfilerStop() when profiling is already disabled. - */ - CUDA_ERROR_PROFILER_ALREADY_STOPPED = 8, - - /** - * This indicates that no CUDA-capable devices were detected by the installed - * CUDA driver. - */ - CUDA_ERROR_NO_DEVICE = 100, - - /** - * This indicates that the device ordinal supplied by the user does not - * correspond to a valid CUDA device. - */ - CUDA_ERROR_INVALID_DEVICE = 101, - - - /** - * This indicates that the device kernel image is invalid. This can also - * indicate an invalid CUDA module. - */ - CUDA_ERROR_INVALID_IMAGE = 200, - - /** - * This most frequently indicates that there is no context bound to the - * current thread. This can also be returned if the context passed to an - * API call is not a valid handle (such as a context that has had - * ::cuCtxDestroy() invoked on it). This can also be returned if a user - * mixes different API versions (i.e. 3010 context with 3020 API calls). - * See ::cuCtxGetApiVersion() for more details. - */ - CUDA_ERROR_INVALID_CONTEXT = 201, - - /** - * This indicated that the context being supplied as a parameter to the - * API call was already the active context. - * \deprecated - * This error return is deprecated as of CUDA 3.2. It is no longer an - * error to attempt to push the active context via ::cuCtxPushCurrent(). - */ - CUDA_ERROR_CONTEXT_ALREADY_CURRENT = 202, - - /** - * This indicates that a map or register operation has failed. - */ - CUDA_ERROR_MAP_FAILED = 205, - - /** - * This indicates that an unmap or unregister operation has failed. - */ - CUDA_ERROR_UNMAP_FAILED = 206, - - /** - * This indicates that the specified array is currently mapped and thus - * cannot be destroyed. - */ - CUDA_ERROR_ARRAY_IS_MAPPED = 207, - - /** - * This indicates that the resource is already mapped. - */ - CUDA_ERROR_ALREADY_MAPPED = 208, - - /** - * This indicates that there is no kernel image available that is suitable - * for the device. This can occur when a user specifies code generation - * options for a particular CUDA source file that do not include the - * corresponding device configuration. - */ - CUDA_ERROR_NO_BINARY_FOR_GPU = 209, - - /** - * This indicates that a resource has already been acquired. - */ - CUDA_ERROR_ALREADY_ACQUIRED = 210, - - /** - * This indicates that a resource is not mapped. - */ - CUDA_ERROR_NOT_MAPPED = 211, - - /** - * This indicates that a mapped resource is not available for access as an - * array. - */ - CUDA_ERROR_NOT_MAPPED_AS_ARRAY = 212, - - /** - * This indicates that a mapped resource is not available for access as a - * pointer. - */ - CUDA_ERROR_NOT_MAPPED_AS_POINTER = 213, - - /** - * This indicates that an uncorrectable ECC error was detected during - * execution. - */ - CUDA_ERROR_ECC_UNCORRECTABLE = 214, - - /** - * This indicates that the ::CUlimit passed to the API call is not - * supported by the active device. 
- */ - CUDA_ERROR_UNSUPPORTED_LIMIT = 215, - - /** - * This indicates that the ::CUcontext passed to the API call can - * only be bound to a single CPU thread at a time but is already - * bound to a CPU thread. - */ - CUDA_ERROR_CONTEXT_ALREADY_IN_USE = 216, - - /** - * This indicates that peer access is not supported across the given - * devices. - */ - CUDA_ERROR_PEER_ACCESS_UNSUPPORTED = 217, - - /** - * This indicates that a PTX JIT compilation failed. - */ - CUDA_ERROR_INVALID_PTX = 218, - - /** - * This indicates an error with OpenGL or DirectX context. - */ - CUDA_ERROR_INVALID_GRAPHICS_CONTEXT = 219, - - /** - * This indicates that an uncorrectable NVLink error was detected during the - * execution. - */ - CUDA_ERROR_NVLINK_UNCORRECTABLE = 220, - - /** - * This indicates that the PTX JIT compiler library was not found. - */ - CUDA_ERROR_JIT_COMPILER_NOT_FOUND = 221, - - /** - * This indicates that the device kernel source is invalid. - */ - CUDA_ERROR_INVALID_SOURCE = 300, - - /** - * This indicates that the file specified was not found. - */ - CUDA_ERROR_FILE_NOT_FOUND = 301, - - /** - * This indicates that a link to a shared object failed to resolve. - */ - CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302, - - /** - * This indicates that initialization of a shared object failed. - */ - CUDA_ERROR_SHARED_OBJECT_INIT_FAILED = 303, - - /** - * This indicates that an OS call failed. - */ - CUDA_ERROR_OPERATING_SYSTEM = 304, - - /** - * This indicates that a resource handle passed to the API call was not - * valid. Resource handles are opaque types like ::CUstream and ::CUevent. - */ - CUDA_ERROR_INVALID_HANDLE = 400, - - /** - * This indicates that a resource required by the API call is not in a - * valid state to perform the requested operation. - */ - CUDA_ERROR_ILLEGAL_STATE = 401, - - /** - * This indicates that a named symbol was not found. Examples of symbols - * are global/constant variable names, texture names, and surface names. - */ - CUDA_ERROR_NOT_FOUND = 500, - - /** - * This indicates that asynchronous operations issued previously have not - * completed yet. This result is not actually an error, but must be indicated - * differently than ::CUDA_SUCCESS (which indicates completion). Calls that - * may return this value include ::cuEventQuery() and ::cuStreamQuery(). - */ - CUDA_ERROR_NOT_READY = 600, - - /** - * While executing a kernel, the device encountered a - * load or store instruction on an invalid memory address. - * This leaves the process in an inconsistent state and any further CUDA work - * will return the same error. To continue using CUDA, the process must be terminated - * and relaunched. - */ - CUDA_ERROR_ILLEGAL_ADDRESS = 700, - - /** - * This indicates that a launch did not occur because it did not have - * appropriate resources. This error usually indicates that the user has - * attempted to pass too many arguments to the device kernel, or the - * kernel launch specifies too many threads for the kernel's register - * count. Passing arguments of the wrong size (i.e. a 64-bit pointer - * when a 32-bit int is expected) is equivalent to passing too many - * arguments and can also result in this error. - */ - CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES = 701, - - /** - * This indicates that the device kernel took too long to execute. This can - * only occur if timeouts are enabled - see the device attribute - * ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT for more information. 
- * This leaves the process in an inconsistent state and any further CUDA work - * will return the same error. To continue using CUDA, the process must be terminated - * and relaunched. - */ - CUDA_ERROR_LAUNCH_TIMEOUT = 702, - - /** - * This error indicates a kernel launch that uses an incompatible texturing - * mode. - */ - CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING = 703, - - /** - * This error indicates that a call to ::cuCtxEnablePeerAccess() is - * trying to re-enable peer access to a context which has already - * had peer access to it enabled. - */ - CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED = 704, - - /** - * This error indicates that ::cuCtxDisablePeerAccess() is - * trying to disable peer access which has not been enabled yet - * via ::cuCtxEnablePeerAccess(). - */ - CUDA_ERROR_PEER_ACCESS_NOT_ENABLED = 705, - - /** - * This error indicates that the primary context for the specified device - * has already been initialized. - */ - CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE = 708, - - /** - * This error indicates that the context current to the calling thread - * has been destroyed using ::cuCtxDestroy, or is a primary context which - * has not yet been initialized. - */ - CUDA_ERROR_CONTEXT_IS_DESTROYED = 709, - - /** - * A device-side assert triggered during kernel execution. The context - * cannot be used anymore, and must be destroyed. All existing device - * memory allocations from this context are invalid and must be - * reconstructed if the program is to continue using CUDA. - */ - CUDA_ERROR_ASSERT = 710, - - /** - * This error indicates that the hardware resources required to enable - * peer access have been exhausted for one or more of the devices - * passed to ::cuCtxEnablePeerAccess(). - */ - CUDA_ERROR_TOO_MANY_PEERS = 711, - - /** - * This error indicates that the memory range passed to ::cuMemHostRegister() - * has already been registered. - */ - CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED = 712, - - /** - * This error indicates that the pointer passed to ::cuMemHostUnregister() - * does not correspond to any currently registered memory region. - */ - CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED = 713, - - /** - * While executing a kernel, the device encountered a stack error. - * This can be due to stack corruption or exceeding the stack size limit. - * This leaves the process in an inconsistent state and any further CUDA work - * will return the same error. To continue using CUDA, the process must be terminated - * and relaunched. - */ - CUDA_ERROR_HARDWARE_STACK_ERROR = 714, - - /** - * While executing a kernel, the device encountered an illegal instruction. - * This leaves the process in an inconsistent state and any further CUDA work - * will return the same error. To continue using CUDA, the process must be terminated - * and relaunched. - */ - CUDA_ERROR_ILLEGAL_INSTRUCTION = 715, - - /** - * While executing a kernel, the device encountered a load or store instruction - * on a memory address which is not aligned. - * This leaves the process in an inconsistent state and any further CUDA work - * will return the same error. To continue using CUDA, the process must be terminated - * and relaunched. - */ - CUDA_ERROR_MISALIGNED_ADDRESS = 716, - - /** - * While executing a kernel, the device encountered an instruction - * which can only operate on memory locations in certain address spaces - * (global, shared, or local), but was supplied a memory address not - * belonging to an allowed address space. 
- * This leaves the process in an inconsistent state and any further CUDA work - * will return the same error. To continue using CUDA, the process must be terminated - * and relaunched. - */ - CUDA_ERROR_INVALID_ADDRESS_SPACE = 717, - - /** - * While executing a kernel, the device program counter wrapped its address space. - * This leaves the process in an inconsistent state and any further CUDA work - * will return the same error. To continue using CUDA, the process must be terminated - * and relaunched. - */ - CUDA_ERROR_INVALID_PC = 718, - - /** - * An exception occurred on the device while executing a kernel. Common - * causes include dereferencing an invalid device pointer and accessing - * out of bounds shared memory. Less common cases can be system specific - more - * information about these cases can be found in the system specific user guide. - * This leaves the process in an inconsistent state and any further CUDA work - * will return the same error. To continue using CUDA, the process must be terminated - * and relaunched. - */ - CUDA_ERROR_LAUNCH_FAILED = 719, - - /** - * This error indicates that the number of blocks launched per grid for a kernel that was - * launched via either ::cuLaunchCooperativeKernel or ::cuLaunchCooperativeKernelMultiDevice - * exceeds the maximum number of blocks as allowed by ::cuOccupancyMaxActiveBlocksPerMultiprocessor - * or ::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags times the number of multiprocessors - * as specified by the device attribute ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT. - */ - CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE = 720, - - /** - * This error indicates that the attempted operation is not permitted. - */ - CUDA_ERROR_NOT_PERMITTED = 800, - - /** - * This error indicates that the attempted operation is not supported - * on the current system or device. - */ - CUDA_ERROR_NOT_SUPPORTED = 801, - - /** - * This error indicates that the system is not yet ready to start any CUDA - * work. To continue using CUDA, verify the system configuration is in a - * valid state and all required driver daemons are actively running. - * More information about this error can be found in the system specific - * user guide. - */ - CUDA_ERROR_SYSTEM_NOT_READY = 802, - - /** - * This error indicates that there is a mismatch between the versions of - * the display driver and the CUDA driver. Refer to the compatibility documentation - * for supported versions. - */ - CUDA_ERROR_SYSTEM_DRIVER_MISMATCH = 803, - - /** - * This error indicates that the system was upgraded to run with forward compatibility - * but the visible hardware detected by CUDA does not support this configuration. - * Refer to the compatibility documentation for the supported hardware matrix or ensure - * that only supported hardware is visible during initialization via the CUDA_VISIBLE_DEVICES - * environment variable. - */ - CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE = 804, - - /** - * This error indicates that the operation is not permitted when - * the stream is capturing. - */ - CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED = 900, - - /** - * This error indicates that the current capture sequence on the stream - * has been invalidated due to a previous error. - */ - CUDA_ERROR_STREAM_CAPTURE_INVALIDATED = 901, - - /** - * This error indicates that the operation would have resulted in a merge - * of two independent capture sequences. - */ - CUDA_ERROR_STREAM_CAPTURE_MERGE = 902, - - /** - * This error indicates that the capture was not initiated in this stream. 
- */ - CUDA_ERROR_STREAM_CAPTURE_UNMATCHED = 903, - - /** - * This error indicates that the capture sequence contains a fork that was - * not joined to the primary stream. - */ - CUDA_ERROR_STREAM_CAPTURE_UNJOINED = 904, - - /** - * This error indicates that a dependency would have been created which - * crosses the capture sequence boundary. Only implicit in-stream ordering - * dependencies are allowed to cross the boundary. - */ - CUDA_ERROR_STREAM_CAPTURE_ISOLATION = 905, - - /** - * This error indicates a disallowed implicit dependency on a current capture - * sequence from cudaStreamLegacy. - */ - CUDA_ERROR_STREAM_CAPTURE_IMPLICIT = 906, - - /** - * This error indicates that the operation is not permitted on an event which - * was last recorded in a capturing stream. - */ - CUDA_ERROR_CAPTURED_EVENT = 907, - - /** - * A stream capture sequence not initiated with the ::CU_STREAM_CAPTURE_MODE_RELAXED - * argument to ::cuStreamBeginCapture was passed to ::cuStreamEndCapture in a - * different thread. - */ - CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD = 908, - - /** - * This indicates that an unknown internal error has occurred. - */ - CUDA_ERROR_UNKNOWN = 999 +typedef enum cudaError_enum +{ + /** + * The API call returned with no errors. In the case of query calls, this + * also means that the operation being queried is complete (see + * ::cuEventQuery() and ::cuStreamQuery()). + */ + CUDA_SUCCESS = 0, + + /** + * This indicates that one or more of the parameters passed to the API call + * is not within an acceptable range of values. + */ + CUDA_ERROR_INVALID_VALUE = 1, + + /** + * The API call failed because it was unable to allocate enough memory to + * perform the requested operation. + */ + CUDA_ERROR_OUT_OF_MEMORY = 2, + + /** + * This indicates that the CUDA driver has not been initialized with + * ::cuInit() or that initialization has failed. + */ + CUDA_ERROR_NOT_INITIALIZED = 3, + + /** + * This indicates that the CUDA driver is in the process of shutting down. + */ + CUDA_ERROR_DEINITIALIZED = 4, + + /** + * This indicates profiler is not initialized for this run. This can + * happen when the application is running with external profiling tools + * like visual profiler. + */ + CUDA_ERROR_PROFILER_DISABLED = 5, + + /** + * \deprecated + * This error return is deprecated as of CUDA 5.0. It is no longer an error + * to attempt to enable/disable the profiling via ::cuProfilerStart or + * ::cuProfilerStop without initialization. + */ + CUDA_ERROR_PROFILER_NOT_INITIALIZED = 6, + + /** + * \deprecated + * This error return is deprecated as of CUDA 5.0. It is no longer an error + * to call cuProfilerStart() when profiling is already enabled. + */ + CUDA_ERROR_PROFILER_ALREADY_STARTED = 7, + + /** + * \deprecated + * This error return is deprecated as of CUDA 5.0. It is no longer an error + * to call cuProfilerStop() when profiling is already disabled. + */ + CUDA_ERROR_PROFILER_ALREADY_STOPPED = 8, + + /** + * This indicates that no CUDA-capable devices were detected by the installed + * CUDA driver. + */ + CUDA_ERROR_NO_DEVICE = 100, + + /** + * This indicates that the device ordinal supplied by the user does not + * correspond to a valid CUDA device. + */ + CUDA_ERROR_INVALID_DEVICE = 101, + + + /** + * This indicates that the device kernel image is invalid. This can also + * indicate an invalid CUDA module. + */ + CUDA_ERROR_INVALID_IMAGE = 200, + + /** + * This most frequently indicates that there is no context bound to the + * current thread. 
This can also be returned if the context passed to an + * API call is not a valid handle (such as a context that has had + * ::cuCtxDestroy() invoked on it). This can also be returned if a user + * mixes different API versions (i.e. 3010 context with 3020 API calls). + * See ::cuCtxGetApiVersion() for more details. + */ + CUDA_ERROR_INVALID_CONTEXT = 201, + + /** + * This indicated that the context being supplied as a parameter to the + * API call was already the active context. + * \deprecated + * This error return is deprecated as of CUDA 3.2. It is no longer an + * error to attempt to push the active context via ::cuCtxPushCurrent(). + */ + CUDA_ERROR_CONTEXT_ALREADY_CURRENT = 202, + + /** + * This indicates that a map or register operation has failed. + */ + CUDA_ERROR_MAP_FAILED = 205, + + /** + * This indicates that an unmap or unregister operation has failed. + */ + CUDA_ERROR_UNMAP_FAILED = 206, + + /** + * This indicates that the specified array is currently mapped and thus + * cannot be destroyed. + */ + CUDA_ERROR_ARRAY_IS_MAPPED = 207, + + /** + * This indicates that the resource is already mapped. + */ + CUDA_ERROR_ALREADY_MAPPED = 208, + + /** + * This indicates that there is no kernel image available that is suitable + * for the device. This can occur when a user specifies code generation + * options for a particular CUDA source file that do not include the + * corresponding device configuration. + */ + CUDA_ERROR_NO_BINARY_FOR_GPU = 209, + + /** + * This indicates that a resource has already been acquired. + */ + CUDA_ERROR_ALREADY_ACQUIRED = 210, + + /** + * This indicates that a resource is not mapped. + */ + CUDA_ERROR_NOT_MAPPED = 211, + + /** + * This indicates that a mapped resource is not available for access as an + * array. + */ + CUDA_ERROR_NOT_MAPPED_AS_ARRAY = 212, + + /** + * This indicates that a mapped resource is not available for access as a + * pointer. + */ + CUDA_ERROR_NOT_MAPPED_AS_POINTER = 213, + + /** + * This indicates that an uncorrectable ECC error was detected during + * execution. + */ + CUDA_ERROR_ECC_UNCORRECTABLE = 214, + + /** + * This indicates that the ::CUlimit passed to the API call is not + * supported by the active device. + */ + CUDA_ERROR_UNSUPPORTED_LIMIT = 215, + + /** + * This indicates that the ::CUcontext passed to the API call can + * only be bound to a single CPU thread at a time but is already + * bound to a CPU thread. + */ + CUDA_ERROR_CONTEXT_ALREADY_IN_USE = 216, + + /** + * This indicates that peer access is not supported across the given + * devices. + */ + CUDA_ERROR_PEER_ACCESS_UNSUPPORTED = 217, + + /** + * This indicates that a PTX JIT compilation failed. + */ + CUDA_ERROR_INVALID_PTX = 218, + + /** + * This indicates an error with OpenGL or DirectX context. + */ + CUDA_ERROR_INVALID_GRAPHICS_CONTEXT = 219, + + /** + * This indicates that an uncorrectable NVLink error was detected during the + * execution. + */ + CUDA_ERROR_NVLINK_UNCORRECTABLE = 220, + + /** + * This indicates that the PTX JIT compiler library was not found. + */ + CUDA_ERROR_JIT_COMPILER_NOT_FOUND = 221, + + /** + * This indicates that the device kernel source is invalid. + */ + CUDA_ERROR_INVALID_SOURCE = 300, + + /** + * This indicates that the file specified was not found. + */ + CUDA_ERROR_FILE_NOT_FOUND = 301, + + /** + * This indicates that a link to a shared object failed to resolve. + */ + CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302, + + /** + * This indicates that initialization of a shared object failed. 
+ */ + CUDA_ERROR_SHARED_OBJECT_INIT_FAILED = 303, + + /** + * This indicates that an OS call failed. + */ + CUDA_ERROR_OPERATING_SYSTEM = 304, + + /** + * This indicates that a resource handle passed to the API call was not + * valid. Resource handles are opaque types like ::CUstream and ::CUevent. + */ + CUDA_ERROR_INVALID_HANDLE = 400, + + /** + * This indicates that a resource required by the API call is not in a + * valid state to perform the requested operation. + */ + CUDA_ERROR_ILLEGAL_STATE = 401, + + /** + * This indicates that a named symbol was not found. Examples of symbols + * are global/constant variable names, texture names, and surface names. + */ + CUDA_ERROR_NOT_FOUND = 500, + + /** + * This indicates that asynchronous operations issued previously have not + * completed yet. This result is not actually an error, but must be indicated + * differently than ::CUDA_SUCCESS (which indicates completion). Calls that + * may return this value include ::cuEventQuery() and ::cuStreamQuery(). + */ + CUDA_ERROR_NOT_READY = 600, + + /** + * While executing a kernel, the device encountered a + * load or store instruction on an invalid memory address. + * This leaves the process in an inconsistent state and any further CUDA work + * will return the same error. To continue using CUDA, the process must be terminated + * and relaunched. + */ + CUDA_ERROR_ILLEGAL_ADDRESS = 700, + + /** + * This indicates that a launch did not occur because it did not have + * appropriate resources. This error usually indicates that the user has + * attempted to pass too many arguments to the device kernel, or the + * kernel launch specifies too many threads for the kernel's register + * count. Passing arguments of the wrong size (i.e. a 64-bit pointer + * when a 32-bit int is expected) is equivalent to passing too many + * arguments and can also result in this error. + */ + CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES = 701, + + /** + * This indicates that the device kernel took too long to execute. This can + * only occur if timeouts are enabled - see the device attribute + * ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT for more information. + * This leaves the process in an inconsistent state and any further CUDA work + * will return the same error. To continue using CUDA, the process must be terminated + * and relaunched. + */ + CUDA_ERROR_LAUNCH_TIMEOUT = 702, + + /** + * This error indicates a kernel launch that uses an incompatible texturing + * mode. + */ + CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING = 703, + + /** + * This error indicates that a call to ::cuCtxEnablePeerAccess() is + * trying to re-enable peer access to a context which has already + * had peer access to it enabled. + */ + CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED = 704, + + /** + * This error indicates that ::cuCtxDisablePeerAccess() is + * trying to disable peer access which has not been enabled yet + * via ::cuCtxEnablePeerAccess(). + */ + CUDA_ERROR_PEER_ACCESS_NOT_ENABLED = 705, + + /** + * This error indicates that the primary context for the specified device + * has already been initialized. + */ + CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE = 708, + + /** + * This error indicates that the context current to the calling thread + * has been destroyed using ::cuCtxDestroy, or is a primary context which + * has not yet been initialized. + */ + CUDA_ERROR_CONTEXT_IS_DESTROYED = 709, + + /** + * A device-side assert triggered during kernel execution. The context + * cannot be used anymore, and must be destroyed. 
All existing device + * memory allocations from this context are invalid and must be + * reconstructed if the program is to continue using CUDA. + */ + CUDA_ERROR_ASSERT = 710, + + /** + * This error indicates that the hardware resources required to enable + * peer access have been exhausted for one or more of the devices + * passed to ::cuCtxEnablePeerAccess(). + */ + CUDA_ERROR_TOO_MANY_PEERS = 711, + + /** + * This error indicates that the memory range passed to ::cuMemHostRegister() + * has already been registered. + */ + CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED = 712, + + /** + * This error indicates that the pointer passed to ::cuMemHostUnregister() + * does not correspond to any currently registered memory region. + */ + CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED = 713, + + /** + * While executing a kernel, the device encountered a stack error. + * This can be due to stack corruption or exceeding the stack size limit. + * This leaves the process in an inconsistent state and any further CUDA work + * will return the same error. To continue using CUDA, the process must be terminated + * and relaunched. + */ + CUDA_ERROR_HARDWARE_STACK_ERROR = 714, + + /** + * While executing a kernel, the device encountered an illegal instruction. + * This leaves the process in an inconsistent state and any further CUDA work + * will return the same error. To continue using CUDA, the process must be terminated + * and relaunched. + */ + CUDA_ERROR_ILLEGAL_INSTRUCTION = 715, + + /** + * While executing a kernel, the device encountered a load or store instruction + * on a memory address which is not aligned. + * This leaves the process in an inconsistent state and any further CUDA work + * will return the same error. To continue using CUDA, the process must be terminated + * and relaunched. + */ + CUDA_ERROR_MISALIGNED_ADDRESS = 716, + + /** + * While executing a kernel, the device encountered an instruction + * which can only operate on memory locations in certain address spaces + * (global, shared, or local), but was supplied a memory address not + * belonging to an allowed address space. + * This leaves the process in an inconsistent state and any further CUDA work + * will return the same error. To continue using CUDA, the process must be terminated + * and relaunched. + */ + CUDA_ERROR_INVALID_ADDRESS_SPACE = 717, + + /** + * While executing a kernel, the device program counter wrapped its address space. + * This leaves the process in an inconsistent state and any further CUDA work + * will return the same error. To continue using CUDA, the process must be terminated + * and relaunched. + */ + CUDA_ERROR_INVALID_PC = 718, + + /** + * An exception occurred on the device while executing a kernel. Common + * causes include dereferencing an invalid device pointer and accessing + * out of bounds shared memory. Less common cases can be system specific - more + * information about these cases can be found in the system specific user guide. + * This leaves the process in an inconsistent state and any further CUDA work + * will return the same error. To continue using CUDA, the process must be terminated + * and relaunched. 
+ */ + CUDA_ERROR_LAUNCH_FAILED = 719, + + /** + * This error indicates that the number of blocks launched per grid for a kernel that was + * launched via either ::cuLaunchCooperativeKernel or ::cuLaunchCooperativeKernelMultiDevice + * exceeds the maximum number of blocks as allowed by ::cuOccupancyMaxActiveBlocksPerMultiprocessor + * or ::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags times the number of multiprocessors + * as specified by the device attribute ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT. + */ + CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE = 720, + + /** + * This error indicates that the attempted operation is not permitted. + */ + CUDA_ERROR_NOT_PERMITTED = 800, + + /** + * This error indicates that the attempted operation is not supported + * on the current system or device. + */ + CUDA_ERROR_NOT_SUPPORTED = 801, + + /** + * This error indicates that the system is not yet ready to start any CUDA + * work. To continue using CUDA, verify the system configuration is in a + * valid state and all required driver daemons are actively running. + * More information about this error can be found in the system specific + * user guide. + */ + CUDA_ERROR_SYSTEM_NOT_READY = 802, + + /** + * This error indicates that there is a mismatch between the versions of + * the display driver and the CUDA driver. Refer to the compatibility documentation + * for supported versions. + */ + CUDA_ERROR_SYSTEM_DRIVER_MISMATCH = 803, + + /** + * This error indicates that the system was upgraded to run with forward compatibility + * but the visible hardware detected by CUDA does not support this configuration. + * Refer to the compatibility documentation for the supported hardware matrix or ensure + * that only supported hardware is visible during initialization via the CUDA_VISIBLE_DEVICES + * environment variable. + */ + CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE = 804, + + /** + * This error indicates that the operation is not permitted when + * the stream is capturing. + */ + CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED = 900, + + /** + * This error indicates that the current capture sequence on the stream + * has been invalidated due to a previous error. + */ + CUDA_ERROR_STREAM_CAPTURE_INVALIDATED = 901, + + /** + * This error indicates that the operation would have resulted in a merge + * of two independent capture sequences. + */ + CUDA_ERROR_STREAM_CAPTURE_MERGE = 902, + + /** + * This error indicates that the capture was not initiated in this stream. + */ + CUDA_ERROR_STREAM_CAPTURE_UNMATCHED = 903, + + /** + * This error indicates that the capture sequence contains a fork that was + * not joined to the primary stream. + */ + CUDA_ERROR_STREAM_CAPTURE_UNJOINED = 904, + + /** + * This error indicates that a dependency would have been created which + * crosses the capture sequence boundary. Only implicit in-stream ordering + * dependencies are allowed to cross the boundary. + */ + CUDA_ERROR_STREAM_CAPTURE_ISOLATION = 905, + + /** + * This error indicates a disallowed implicit dependency on a current capture + * sequence from cudaStreamLegacy. + */ + CUDA_ERROR_STREAM_CAPTURE_IMPLICIT = 906, + + /** + * This error indicates that the operation is not permitted on an event which + * was last recorded in a capturing stream. + */ + CUDA_ERROR_CAPTURED_EVENT = 907, + + /** + * A stream capture sequence not initiated with the ::CU_STREAM_CAPTURE_MODE_RELAXED + * argument to ::cuStreamBeginCapture was passed to ::cuStreamEndCapture in a + * different thread. 
+ */ + CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD = 908, + + /** + * This indicates that an unknown internal error has occurred. + */ + CUDA_ERROR_UNKNOWN = 999 + } CUresult; /** @@ -537,454 +539,469 @@ typedef enum cudaError_enum { */ typedef enum CUjit_option_enum { - /** - * Max number of registers that a thread may use.\n - * Option type: unsigned int\n - * Applies to: compiler only - */ - CU_JIT_MAX_REGISTERS = 0, - - /** - * IN: Specifies minimum number of threads per block to target compilation - * for\n - * OUT: Returns the number of threads the compiler actually targeted. - * This restricts the resource utilization fo the compiler (e.g. max - * registers) such that a block with the given number of threads should be - * able to launch based on register limitations. Note, this option does not - * currently take into account any other resource limitations, such as - * shared memory utilization.\n - * Cannot be combined with ::CU_JIT_TARGET.\n - * Option type: unsigned int\n - * Applies to: compiler only - */ - CU_JIT_THREADS_PER_BLOCK, - - /** - * Overwrites the option value with the total wall clock time, in - * milliseconds, spent in the compiler and linker\n - * Option type: float\n - * Applies to: compiler and linker - */ - CU_JIT_WALL_TIME, - - /** - * Pointer to a buffer in which to print any log messages - * that are informational in nature (the buffer size is specified via - * option ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES)\n - * Option type: char *\n - * Applies to: compiler and linker - */ - CU_JIT_INFO_LOG_BUFFER, - - /** - * IN: Log buffer size in bytes. Log messages will be capped at this size - * (including null terminator)\n - * OUT: Amount of log buffer filled with messages\n - * Option type: unsigned int\n - * Applies to: compiler and linker - */ - CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, - - /** - * Pointer to a buffer in which to print any log messages that - * reflect errors (the buffer size is specified via option - * ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES)\n - * Option type: char *\n - * Applies to: compiler and linker - */ - CU_JIT_ERROR_LOG_BUFFER, - - /** - * IN: Log buffer size in bytes. Log messages will be capped at this size - * (including null terminator)\n - * OUT: Amount of log buffer filled with messages\n - * Option type: unsigned int\n - * Applies to: compiler and linker - */ - CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, - - /** - * Level of optimizations to apply to generated code (0 - 4), with 4 - * being the default and highest level of optimizations.\n - * Option type: unsigned int\n - * Applies to: compiler only - */ - CU_JIT_OPTIMIZATION_LEVEL, - - /** - * No option value required. Determines the target based on the current - * attached context (default)\n - * Option type: No option value needed\n - * Applies to: compiler and linker - */ - CU_JIT_TARGET_FROM_CUCONTEXT, - - /** - * Target is chosen based on supplied ::CUjit_target. Cannot be - * combined with ::CU_JIT_THREADS_PER_BLOCK.\n - * Option type: unsigned int for enumerated type ::CUjit_target\n - * Applies to: compiler and linker - */ - CU_JIT_TARGET, - - /** - * Specifies choice of fallback strategy if matching cubin is not found. - * Choice is based on supplied ::CUjit_fallback. 
This option cannot be
- * used with cuLink* APIs as the linker requires exact matches.\n
- * Option type: unsigned int for enumerated type ::CUjit_fallback\n
- * Applies to: compiler only
- */
- CU_JIT_FALLBACK_STRATEGY,
-
- /**
- * Specifies whether to create debug information in output (-g)
- * (0: false, default)\n
- * Option type: int\n
- * Applies to: compiler and linker
- */
- CU_JIT_GENERATE_DEBUG_INFO,
-
- /**
- * Generate verbose log messages (0: false, default)\n
- * Option type: int\n
- * Applies to: compiler and linker
- */
- CU_JIT_LOG_VERBOSE,
-
- /**
- * Generate line number information (-lineinfo) (0: false, default)\n
- * Option type: int\n
- * Applies to: compiler only
- */
- CU_JIT_GENERATE_LINE_INFO,
-
- /**
- * Specifies whether to enable caching explicitly (-dlcm) \n
- * Choice is based on supplied ::CUjit_cacheMode_enum.\n
- * Option type: unsigned int for enumerated type ::CUjit_cacheMode_enum\n
- * Applies to: compiler only
- */
- CU_JIT_CACHE_MODE,
-
- /**
- * The below jit options are used for internal purposes only, in this version of CUDA
- */
- CU_JIT_NEW_SM3X_OPT,
- CU_JIT_FAST_COMPILE,
-
- /**
- * Array of device symbol names that will be relocated to the corresponing
- * host addresses stored in ::CU_JIT_GLOBAL_SYMBOL_ADDRESSES.\n
- * Must contain ::CU_JIT_GLOBAL_SYMBOL_COUNT entries.\n
- * When loding a device module, driver will relocate all encountered
- * unresolved symbols to the host addresses.\n
- * It is only allowed to register symbols that correspond to unresolved
- * global variables.\n
- * It is illegal to register the same device symbol at multiple addresses.\n
- * Option type: const char **\n
- * Applies to: dynamic linker only
- */
- CU_JIT_GLOBAL_SYMBOL_NAMES,
-
- /**
- * Array of host addresses that will be used to relocate corresponding
- * device symbols stored in ::CU_JIT_GLOBAL_SYMBOL_NAMES.\n
- * Must contain ::CU_JIT_GLOBAL_SYMBOL_COUNT entries.\n
- * Option type: void **\n
- * Applies to: dynamic linker only
- */
- CU_JIT_GLOBAL_SYMBOL_ADDRESSES,
-
- /**
- * Number of entries in ::CU_JIT_GLOBAL_SYMBOL_NAMES and
- * ::CU_JIT_GLOBAL_SYMBOL_ADDRESSES arrays.\n
- * Option type: unsigned int\n
- * Applies to: dynamic linker only
- */
- CU_JIT_GLOBAL_SYMBOL_COUNT,
-
- CU_JIT_NUM_OPTIONS
+ /**
+ * Max number of registers that a thread may use.\n
+ * Option type: unsigned int\n
+ * Applies to: compiler only
+ */
+ CU_JIT_MAX_REGISTERS = 0,
+
+ /**
+ * IN: Specifies minimum number of threads per block to target compilation
+ * for\n
+ * OUT: Returns the number of threads the compiler actually targeted.
+ * This restricts the resource utilization of the compiler (e.g. max
+ * registers) such that a block with the given number of threads should be
+ * able to launch based on register limitations.
Note, this option does not + * currently take into account any other resource limitations, such as + * shared memory utilization.\n + * Cannot be combined with ::CU_JIT_TARGET.\n + * Option type: unsigned int\n + * Applies to: compiler only + */ + CU_JIT_THREADS_PER_BLOCK, + + /** + * Overwrites the option value with the total wall clock time, in + * milliseconds, spent in the compiler and linker\n + * Option type: float\n + * Applies to: compiler and linker + */ + CU_JIT_WALL_TIME, + + /** + * Pointer to a buffer in which to print any log messages + * that are informational in nature (the buffer size is specified via + * option ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES)\n + * Option type: char *\n + * Applies to: compiler and linker + */ + CU_JIT_INFO_LOG_BUFFER, + + /** + * IN: Log buffer size in bytes. Log messages will be capped at this size + * (including null terminator)\n + * OUT: Amount of log buffer filled with messages\n + * Option type: unsigned int\n + * Applies to: compiler and linker + */ + CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, + + /** + * Pointer to a buffer in which to print any log messages that + * reflect errors (the buffer size is specified via option + * ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES)\n + * Option type: char *\n + * Applies to: compiler and linker + */ + CU_JIT_ERROR_LOG_BUFFER, + + /** + * IN: Log buffer size in bytes. Log messages will be capped at this size + * (including null terminator)\n + * OUT: Amount of log buffer filled with messages\n + * Option type: unsigned int\n + * Applies to: compiler and linker + */ + CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, + + /** + * Level of optimizations to apply to generated code (0 - 4), with 4 + * being the default and highest level of optimizations.\n + * Option type: unsigned int\n + * Applies to: compiler only + */ + CU_JIT_OPTIMIZATION_LEVEL, + + /** + * No option value required. Determines the target based on the current + * attached context (default)\n + * Option type: No option value needed\n + * Applies to: compiler and linker + */ + CU_JIT_TARGET_FROM_CUCONTEXT, + + /** + * Target is chosen based on supplied ::CUjit_target. Cannot be + * combined with ::CU_JIT_THREADS_PER_BLOCK.\n + * Option type: unsigned int for enumerated type ::CUjit_target\n + * Applies to: compiler and linker + */ + CU_JIT_TARGET, + + /** + * Specifies choice of fallback strategy if matching cubin is not found. + * Choice is based on supplied ::CUjit_fallback. 
This option cannot be
+ * used with cuLink* APIs as the linker requires exact matches.\n
+ * Option type: unsigned int for enumerated type ::CUjit_fallback\n
+ * Applies to: compiler only
+ */
+ CU_JIT_FALLBACK_STRATEGY,
+
+ /**
+ * Specifies whether to create debug information in output (-g)
+ * (0: false, default)\n
+ * Option type: int\n
+ * Applies to: compiler and linker
+ */
+ CU_JIT_GENERATE_DEBUG_INFO,
+
+ /**
+ * Generate verbose log messages (0: false, default)\n
+ * Option type: int\n
+ * Applies to: compiler and linker
+ */
+ CU_JIT_LOG_VERBOSE,
+
+ /**
+ * Generate line number information (-lineinfo) (0: false, default)\n
+ * Option type: int\n
+ * Applies to: compiler only
+ */
+ CU_JIT_GENERATE_LINE_INFO,
+
+ /**
+ * Specifies whether to enable caching explicitly (-dlcm) \n
+ * Choice is based on supplied ::CUjit_cacheMode_enum.\n
+ * Option type: unsigned int for enumerated type ::CUjit_cacheMode_enum\n
+ * Applies to: compiler only
+ */
+ CU_JIT_CACHE_MODE,
+
+ /**
+ * The below jit options are used for internal purposes only, in this version of CUDA
+ */
+ CU_JIT_NEW_SM3X_OPT,
+ CU_JIT_FAST_COMPILE,
+
+ /**
+ * Array of device symbol names that will be relocated to the corresponding
+ * host addresses stored in ::CU_JIT_GLOBAL_SYMBOL_ADDRESSES.\n
+ * Must contain ::CU_JIT_GLOBAL_SYMBOL_COUNT entries.\n
+ * When loading a device module, driver will relocate all encountered
+ * unresolved symbols to the host addresses.\n
+ * It is only allowed to register symbols that correspond to unresolved
+ * global variables.\n
+ * It is illegal to register the same device symbol at multiple addresses.\n
+ * Option type: const char **\n
+ * Applies to: dynamic linker only
+ */
+ CU_JIT_GLOBAL_SYMBOL_NAMES,
+
+ /**
+ * Array of host addresses that will be used to relocate corresponding
+ * device symbols stored in ::CU_JIT_GLOBAL_SYMBOL_NAMES.\n
+ * Must contain ::CU_JIT_GLOBAL_SYMBOL_COUNT entries.\n
+ * Option type: void **\n
+ * Applies to: dynamic linker only
+ */
+ CU_JIT_GLOBAL_SYMBOL_ADDRESSES,
+
+ /**
+ * Number of entries in ::CU_JIT_GLOBAL_SYMBOL_NAMES and
+ * ::CU_JIT_GLOBAL_SYMBOL_ADDRESSES arrays.\n
+ * Option type: unsigned int\n
+ * Applies to: dynamic linker only
+ */
+ CU_JIT_GLOBAL_SYMBOL_COUNT,
+
+ CU_JIT_NUM_OPTIONS
 } CUjit_option;

 /**
 * Device properties
 */
-typedef enum CUdevice_attribute_enum {
- CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1, /**< Maximum number of threads per block */
- CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2, /**< Maximum block dimension X */
- CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3, /**< Maximum block dimension Y */
- CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4, /**< Maximum block dimension Z */
- CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5, /**< Maximum grid dimension X */
- CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6, /**< Maximum grid dimension Y */
- CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7, /**< Maximum grid dimension Z */
- CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8, /**< Maximum shared memory available per block in bytes */
- CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = 8, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK */
- CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9, /**< Memory available on device for __constant__ variables in a CUDA C kernel in bytes */
- CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10, /**< Warp size in threads */
- CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11, /**< Maximum pitch in bytes allowed by memory copies */
- CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12, /**< Maximum number of 32-bit registers available per
/**
 * Device properties
 */
-typedef enum CUdevice_attribute_enum {
- CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1, /**< Maximum number of threads per block */
- CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2, /**< Maximum block dimension X */
- CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3, /**< Maximum block dimension Y */
- CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4, /**< Maximum block dimension Z */
- CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5, /**< Maximum grid dimension X */
- CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6, /**< Maximum grid dimension Y */
- CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7, /**< Maximum grid dimension Z */
- CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8, /**< Maximum shared memory available per block in bytes */
- CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = 8, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK */
- CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9, /**< Memory available on device for __constant__ variables in a CUDA C kernel in bytes */
- CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10, /**< Warp size in threads */
- CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11, /**< Maximum pitch in bytes allowed by memory copies */
- CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12, /**< Maximum number of 32-bit registers available per block */
- CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK */
- CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13, /**< Typical clock frequency in kilohertz */
- CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14, /**< Alignment requirement for textures */
- CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15, /**< Device can possibly copy memory and execute a kernel concurrently. Deprecated. Use instead CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT. */
- CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16, /**< Number of multiprocessors on device */
- CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17, /**< Specifies whether there is a run time limit on kernels */
- CU_DEVICE_ATTRIBUTE_INTEGRATED = 18, /**< Device is integrated with host memory */
- CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19, /**< Device can map host memory into CUDA address space */
- CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20, /**< Compute mode (See ::CUcomputemode for details) */
- CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH = 21, /**< Maximum 1D texture width */
- CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 22, /**< Maximum 2D texture width */
- CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 23, /**< Maximum 2D texture height */
- CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 24, /**< Maximum 3D texture width */
- CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 25, /**< Maximum 3D texture height */
- CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 26, /**< Maximum 3D texture depth */
- CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH = 27, /**< Maximum 2D layered texture width */
- CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT = 28, /**< Maximum 2D layered texture height */
- CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS = 29, /**< Maximum layers in a 2D layered texture */
- CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH = 27, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH */
- CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT = 28, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT */
- CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES = 29, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS */
- CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30, /**< Alignment requirement for surfaces */
- CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31, /**< Device can possibly execute multiple kernels concurrently */
- CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32, /**< Device has ECC support enabled */
- CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33, /**< PCI bus ID of the device */
- CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34, /**< PCI device ID of the device */
- CU_DEVICE_ATTRIBUTE_TCC_DRIVER = 35, /**< Device is using TCC driver model */
- CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36, /**< Peak memory clock frequency in kilohertz */
- CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH = 37, /**< Global memory bus width in bits */
- CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 38, /**< Size of L2 cache in bytes */
- CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39, /**< Maximum resident threads per multiprocessor */
- CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40, /**< Number of asynchronous engines */
- CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41, /**< Device shares a unified address space with the host */
- CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH = 42, /**< Maximum 1D layered texture width */
- CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS = 43, /**< Maximum layers in a 1D layered texture */
- CU_DEVICE_ATTRIBUTE_CAN_TEX2D_GATHER = 44, /**< Deprecated,
do not use. */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH = 45, /**< Maximum 2D texture width if CUDA_ARRAY3D_TEXTURE_GATHER is set */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT = 46, /**< Maximum 2D texture height if CUDA_ARRAY3D_TEXTURE_GATHER is set */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE = 47, /**< Alternate maximum 3D texture width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE = 48,/**< Alternate maximum 3D texture height */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE = 49, /**< Alternate maximum 3D texture depth */ - CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID = 50, /**< PCI domain ID of the device */ - CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT = 51, /**< Pitch alignment requirement for textures */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH = 52, /**< Maximum cubemap texture width/height */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH = 53, /**< Maximum cubemap layered texture width/height */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS = 54, /**< Maximum layers in a cubemap layered texture */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH = 55, /**< Maximum 1D surface width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH = 56, /**< Maximum 2D surface width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT = 57, /**< Maximum 2D surface height */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH = 58, /**< Maximum 3D surface width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT = 59, /**< Maximum 3D surface height */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH = 60, /**< Maximum 3D surface depth */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH = 61, /**< Maximum 1D layered surface width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS = 62, /**< Maximum layers in a 1D layered surface */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH = 63, /**< Maximum 2D layered surface width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT = 64, /**< Maximum 2D layered surface height */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS = 65, /**< Maximum layers in a 2D layered surface */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH = 66, /**< Maximum cubemap surface width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH = 67, /**< Maximum cubemap layered surface width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS = 68, /**< Maximum layers in a cubemap layered surface */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH = 69, /**< Maximum 1D linear texture width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH = 70, /**< Maximum 2D linear texture width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT = 71, /**< Maximum 2D linear texture height */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH = 72, /**< Maximum 2D linear texture pitch in bytes */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH = 73, /**< Maximum mipmapped 2D texture width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT = 74,/**< Maximum mipmapped 2D texture height */ - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75, /**< Major compute capability version number */ - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76, /**< Minor compute capability version number */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH = 77, /**< Maximum mipmapped 1D texture width */ - CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED = 78, /**< Device supports stream priorities */ - CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED = 79, /**< 
Device supports caching globals in L1 */ - CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED = 80, /**< Device supports caching locals in L1 */ - CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR = 81, /**< Maximum shared memory available per multiprocessor in bytes */ - CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82, /**< Maximum number of 32-bit registers available per multiprocessor */ - CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY = 83, /**< Device can allocate managed memory on this system */ - CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD = 84, /**< Device is on a multi-GPU board */ - CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID = 85, /**< Unique id for a group of devices on the same multi-GPU board */ - CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED = 86, /**< Link between the device and the host supports native atomic operations (this is a placeholder attribute, and is not supported on any current hardware)*/ - CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO = 87, /**< Ratio of single precision performance (in floating-point operations per second) to double precision performance */ - CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS = 88, /**< Device supports coherently accessing pageable memory without calling cudaHostRegister on it */ - CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS = 89, /**< Device can coherently access managed memory concurrently with the CPU */ - CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED = 90, /**< Device supports compute preemption. */ - CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM = 91, /**< Device can access host registered memory at the same virtual address as the CPU */ - CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS = 92, /**< ::cuStreamBatchMemOp and related APIs are supported. */ - CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS = 93, /**< 64-bit operations are supported in ::cuStreamBatchMemOp and related APIs. */ - CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR = 94, /**< ::CU_STREAM_WAIT_VALUE_NOR is supported. */ - CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH = 95, /**< Device supports launching cooperative kernels via ::cuLaunchCooperativeKernel */ - CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH = 96, /**< Device can participate in cooperative kernels launched via ::cuLaunchCooperativeKernelMultiDevice */ - CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN = 97, /**< Maximum optin shared memory per block */ - CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES = 98, /**< Both the ::CU_STREAM_WAIT_VALUE_FLUSH flag and the ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES MemOp are supported on the device. See \ref CUDA_MEMOP for additional details. */ - CU_DEVICE_ATTRIBUTE_HOST_REGISTER_SUPPORTED = 99, /**< Device supports host memory registration via ::cudaHostRegister. */ - CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES = 100, /**< Device accesses pageable memory via the host's page tables. */ - CU_DEVICE_ATTRIBUTE_DIRECT_MANAGED_MEM_ACCESS_FROM_HOST = 101, /**< The host can directly access managed memory on the device without migration. 
*/ - CU_DEVICE_ATTRIBUTE_MAX +typedef enum CUdevice_attribute_enum +{ + CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1, /**< Maximum number of threads per block */ + CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2, /**< Maximum block dimension X */ + CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3, /**< Maximum block dimension Y */ + CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4, /**< Maximum block dimension Z */ + CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5, /**< Maximum grid dimension X */ + CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6, /**< Maximum grid dimension Y */ + CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7, /**< Maximum grid dimension Z */ + CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8, /**< Maximum shared memory available per block in bytes */ + CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = 8, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK */ + CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9, /**< Memory available on device for __constant__ variables in a CUDA C kernel in bytes */ + CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10, /**< Warp size in threads */ + CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11, /**< Maximum pitch in bytes allowed by memory copies */ + CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12, /**< Maximum number of 32-bit registers available per block */ + CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK */ + CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13, /**< Typical clock frequency in kilohertz */ + CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14, /**< Alignment requirement for textures */ + CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15, /**< Device can possibly copy memory and execute a kernel concurrently. Deprecated. Use instead CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT. */ + CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16, /**< Number of multiprocessors on device */ + CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17, /**< Specifies whether there is a run time limit on kernels */ + CU_DEVICE_ATTRIBUTE_INTEGRATED = 18, /**< Device is integrated with host memory */ + CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19, /**< Device can map host memory into CUDA address space */ + CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20, /**< Compute mode (See ::CUcomputemode for details) */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH = 21, /**< Maximum 1D texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 22, /**< Maximum 2D texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 23, /**< Maximum 2D texture height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 24, /**< Maximum 3D texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 25, /**< Maximum 3D texture height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 26, /**< Maximum 3D texture depth */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH = 27, /**< Maximum 2D layered texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT = 28, /**< Maximum 2D layered texture height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS = 29, /**< Maximum layers in a 2D layered texture */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH = 27, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT = 28, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES = 29, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS */ + CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30, /**< Alignment requirement for surfaces */ 
+ CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31, /**< Device can possibly execute multiple kernels concurrently */ + CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32, /**< Device has ECC support enabled */ + CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33, /**< PCI bus ID of the device */ + CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34, /**< PCI device ID of the device */ + CU_DEVICE_ATTRIBUTE_TCC_DRIVER = 35, /**< Device is using TCC driver model */ + CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36, /**< Peak memory clock frequency in kilohertz */ + CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH = 37, /**< Global memory bus width in bits */ + CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 38, /**< Size of L2 cache in bytes */ + CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39, /**< Maximum resident threads per multiprocessor */ + CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40, /**< Number of asynchronous engines */ + CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41, /**< Device shares a unified address space with the host */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH = 42, /**< Maximum 1D layered texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS = 43, /**< Maximum layers in a 1D layered texture */ + CU_DEVICE_ATTRIBUTE_CAN_TEX2D_GATHER = 44, /**< Deprecated, do not use. */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH = 45, /**< Maximum 2D texture width if CUDA_ARRAY3D_TEXTURE_GATHER is set */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT = 46, /**< Maximum 2D texture height if CUDA_ARRAY3D_TEXTURE_GATHER is set */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE = 47, /**< Alternate maximum 3D texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE = 48,/**< Alternate maximum 3D texture height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE = 49, /**< Alternate maximum 3D texture depth */ + CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID = 50, /**< PCI domain ID of the device */ + CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT = 51, /**< Pitch alignment requirement for textures */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH = 52, /**< Maximum cubemap texture width/height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH = 53, /**< Maximum cubemap layered texture width/height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS = 54, /**< Maximum layers in a cubemap layered texture */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH = 55, /**< Maximum 1D surface width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH = 56, /**< Maximum 2D surface width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT = 57, /**< Maximum 2D surface height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH = 58, /**< Maximum 3D surface width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT = 59, /**< Maximum 3D surface height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH = 60, /**< Maximum 3D surface depth */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH = 61, /**< Maximum 1D layered surface width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS = 62, /**< Maximum layers in a 1D layered surface */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH = 63, /**< Maximum 2D layered surface width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT = 64, /**< Maximum 2D layered surface height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS = 65, /**< Maximum layers in a 2D layered surface */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH = 66, /**< Maximum cubemap surface width */ + 
CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH = 67, /**< Maximum cubemap layered surface width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS = 68, /**< Maximum layers in a cubemap layered surface */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH = 69, /**< Maximum 1D linear texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH = 70, /**< Maximum 2D linear texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT = 71, /**< Maximum 2D linear texture height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH = 72, /**< Maximum 2D linear texture pitch in bytes */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH = 73, /**< Maximum mipmapped 2D texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT = 74,/**< Maximum mipmapped 2D texture height */ + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75, /**< Major compute capability version number */ + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76, /**< Minor compute capability version number */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH = 77, /**< Maximum mipmapped 1D texture width */ + CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED = 78, /**< Device supports stream priorities */ + CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED = 79, /**< Device supports caching globals in L1 */ + CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED = 80, /**< Device supports caching locals in L1 */ + CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR = 81, /**< Maximum shared memory available per multiprocessor in bytes */ + CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82, /**< Maximum number of 32-bit registers available per multiprocessor */ + CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY = 83, /**< Device can allocate managed memory on this system */ + CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD = 84, /**< Device is on a multi-GPU board */ + CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID = 85, /**< Unique id for a group of devices on the same multi-GPU board */ + CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED = 86, /**< Link between the device and the host supports native atomic operations (this is a placeholder attribute, and is not supported on any current hardware)*/ + CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO = 87, /**< Ratio of single precision performance (in floating-point operations per second) to double precision performance */ + CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS = 88, /**< Device supports coherently accessing pageable memory without calling cudaHostRegister on it */ + CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS = 89, /**< Device can coherently access managed memory concurrently with the CPU */ + CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED = 90, /**< Device supports compute preemption. */ + CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM = 91, /**< Device can access host registered memory at the same virtual address as the CPU */ + CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS = 92, /**< ::cuStreamBatchMemOp and related APIs are supported. */ + CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS = 93, /**< 64-bit operations are supported in ::cuStreamBatchMemOp and related APIs. */ + CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR = 94, /**< ::CU_STREAM_WAIT_VALUE_NOR is supported. 
*/ + CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH = 95, /**< Device supports launching cooperative kernels via ::cuLaunchCooperativeKernel */ + CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH = 96, /**< Device can participate in cooperative kernels launched via ::cuLaunchCooperativeKernelMultiDevice */ + CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN = 97, /**< Maximum optin shared memory per block */ + CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES = 98, /**< Both the ::CU_STREAM_WAIT_VALUE_FLUSH flag and the ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES MemOp are supported on the device. See \ref CUDA_MEMOP for additional details. */ + CU_DEVICE_ATTRIBUTE_HOST_REGISTER_SUPPORTED = 99, /**< Device supports host memory registration via ::cudaHostRegister. */ + CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES = 100, /**< Device accesses pageable memory via the host's page tables. */ + CU_DEVICE_ATTRIBUTE_DIRECT_MANAGED_MEM_ACCESS_FROM_HOST = 101, /**< The host can directly access managed memory on the device without migration. */ + CU_DEVICE_ATTRIBUTE_MAX + } CUdevice_attribute; /** * Function cache configurations */ -typedef enum CUfunc_cache_enum { - CU_FUNC_CACHE_PREFER_NONE = 0x00, /**< no preference for shared memory or L1 (default) */ - CU_FUNC_CACHE_PREFER_SHARED = 0x01, /**< prefer larger shared memory and smaller L1 cache */ - CU_FUNC_CACHE_PREFER_L1 = 0x02, /**< prefer larger L1 cache and smaller shared memory */ - CU_FUNC_CACHE_PREFER_EQUAL = 0x03 /**< prefer equal sized L1 cache and shared memory */ +typedef enum CUfunc_cache_enum +{ + CU_FUNC_CACHE_PREFER_NONE = 0x00, /**< no preference for shared memory or L1 (default) */ + CU_FUNC_CACHE_PREFER_SHARED = 0x01, /**< prefer larger shared memory and smaller L1 cache */ + CU_FUNC_CACHE_PREFER_L1 = 0x02, /**< prefer larger L1 cache and smaller shared memory */ + CU_FUNC_CACHE_PREFER_EQUAL = 0x03 /**< prefer equal sized L1 cache and shared memory */ + } CUfunc_cache; /** * Shared memory configurations */ -typedef enum CUsharedconfig_enum { - CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE = 0x00, /**< set default shared memory bank size */ - CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE = 0x01, /**< set shared memory bank width to four bytes */ - CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE = 0x02 /**< set shared memory bank width to eight bytes */ +typedef enum CUsharedconfig_enum +{ + CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE = 0x00, /**< set default shared memory bank size */ + CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE = 0x01, /**< set shared memory bank width to four bytes */ + CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE = 0x02 /**< set shared memory bank width to eight bytes */ + } CUsharedconfig; /** * Function properties */ -typedef enum CUfunction_attribute_enum { - /** - * The maximum number of threads per block, beyond which a launch of the - * function would fail. This number depends on both the function and the - * device on which the function is currently loaded. - */ - CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0, - - /** - * The size in bytes of statically-allocated shared memory required by - * this function. This does not include dynamically-allocated shared - * memory requested by the user at runtime. - */ - CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 1, - - /** - * The size in bytes of user-allocated constant memory required by this - * function. - */ - CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES = 2, - - /** - * The size in bytes of local memory used by each thread of this function. 
- */ - CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES = 3, - - /** - * The number of registers used by each thread of this function. - */ - CU_FUNC_ATTRIBUTE_NUM_REGS = 4, - - /** - * The PTX virtual architecture version for which the function was - * compiled. This value is the major PTX version * 10 + the minor PTX - * version, so a PTX version 1.3 function would return the value 13. - * Note that this may return the undefined value of 0 for cubins - * compiled prior to CUDA 3.0. - */ - CU_FUNC_ATTRIBUTE_PTX_VERSION = 5, - - /** - * The binary architecture version for which the function was compiled. - * This value is the major binary version * 10 + the minor binary version, - * so a binary version 1.3 function would return the value 13. Note that - * this will return a value of 10 for legacy cubins that do not have a - * properly-encoded binary architecture version. - */ - CU_FUNC_ATTRIBUTE_BINARY_VERSION = 6, - - /** - * The attribute to indicate whether the function has been compiled with - * user specified option "-Xptxas --dlcm=ca" set . - */ - CU_FUNC_ATTRIBUTE_CACHE_MODE_CA = 7, - - /** - * The maximum size in bytes of dynamically-allocated shared memory that can be used by - * this function. If the user-specified dynamic shared memory size is larger than this - * value, the launch will fail. - * See ::cuFuncSetAttribute - */ - CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES = 8, - - /** - * On devices where the L1 cache and shared memory use the same hardware resources, - * this sets the shared memory carveout preference, in percent of the total shared memory. - * Refer to ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR. - * This is only a hint, and the driver can choose a different ratio if required to execute the function. - * See ::cuFuncSetAttribute - */ - CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT = 9, - - CU_FUNC_ATTRIBUTE_MAX +typedef enum CUfunction_attribute_enum +{ + /** + * The maximum number of threads per block, beyond which a launch of the + * function would fail. This number depends on both the function and the + * device on which the function is currently loaded. + */ + CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0, + + /** + * The size in bytes of statically-allocated shared memory required by + * this function. This does not include dynamically-allocated shared + * memory requested by the user at runtime. + */ + CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 1, + + /** + * The size in bytes of user-allocated constant memory required by this + * function. + */ + CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES = 2, + + /** + * The size in bytes of local memory used by each thread of this function. + */ + CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES = 3, + + /** + * The number of registers used by each thread of this function. + */ + CU_FUNC_ATTRIBUTE_NUM_REGS = 4, + + /** + * The PTX virtual architecture version for which the function was + * compiled. This value is the major PTX version * 10 + the minor PTX + * version, so a PTX version 1.3 function would return the value 13. + * Note that this may return the undefined value of 0 for cubins + * compiled prior to CUDA 3.0. + */ + CU_FUNC_ATTRIBUTE_PTX_VERSION = 5, + + /** + * The binary architecture version for which the function was compiled. + * This value is the major binary version * 10 + the minor binary version, + * so a binary version 1.3 function would return the value 13. Note that + * this will return a value of 10 for legacy cubins that do not have a + * properly-encoded binary architecture version. 
+ */ + CU_FUNC_ATTRIBUTE_BINARY_VERSION = 6, + + /** + * The attribute to indicate whether the function has been compiled with + * user specified option "-Xptxas --dlcm=ca" set . + */ + CU_FUNC_ATTRIBUTE_CACHE_MODE_CA = 7, + + /** + * The maximum size in bytes of dynamically-allocated shared memory that can be used by + * this function. If the user-specified dynamic shared memory size is larger than this + * value, the launch will fail. + * See ::cuFuncSetAttribute + */ + CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES = 8, + + /** + * On devices where the L1 cache and shared memory use the same hardware resources, + * this sets the shared memory carveout preference, in percent of the total shared memory. + * Refer to ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR. + * This is only a hint, and the driver can choose a different ratio if required to execute the function. + * See ::cuFuncSetAttribute + */ + CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT = 9, + + CU_FUNC_ATTRIBUTE_MAX + } CUfunction_attribute; /** * Context creation flags */ -typedef enum CUctx_flags_enum { - CU_CTX_SCHED_AUTO = 0x00, /**< Automatic scheduling */ - CU_CTX_SCHED_SPIN = 0x01, /**< Set spin as default scheduling */ - CU_CTX_SCHED_YIELD = 0x02, /**< Set yield as default scheduling */ - CU_CTX_SCHED_BLOCKING_SYNC = 0x04, /**< Set blocking synchronization as default scheduling */ - CU_CTX_BLOCKING_SYNC = 0x04, /**< Set blocking synchronization as default scheduling - * \deprecated This flag was deprecated as of CUDA 4.0 - * and was replaced with ::CU_CTX_SCHED_BLOCKING_SYNC. */ - CU_CTX_SCHED_MASK = 0x07, - CU_CTX_MAP_HOST = 0x08, /**< Support mapped pinned allocations */ - CU_CTX_LMEM_RESIZE_TO_MAX = 0x10, /**< Keep local memory allocation after launch */ - CU_CTX_FLAGS_MASK = 0x1f +typedef enum CUctx_flags_enum +{ + CU_CTX_SCHED_AUTO = 0x00, /**< Automatic scheduling */ + CU_CTX_SCHED_SPIN = 0x01, /**< Set spin as default scheduling */ + CU_CTX_SCHED_YIELD = 0x02, /**< Set yield as default scheduling */ + CU_CTX_SCHED_BLOCKING_SYNC = 0x04, /**< Set blocking synchronization as default scheduling */ + CU_CTX_BLOCKING_SYNC = 0x04, /**< Set blocking synchronization as default scheduling + * \deprecated This flag was deprecated as of CUDA 4.0 + * and was replaced with ::CU_CTX_SCHED_BLOCKING_SYNC. */ + CU_CTX_SCHED_MASK = 0x07, + CU_CTX_MAP_HOST = 0x08, /**< Support mapped pinned allocations */ + CU_CTX_LMEM_RESIZE_TO_MAX = 0x10, /**< Keep local memory allocation after launch */ + CU_CTX_FLAGS_MASK = 0x1f + } CUctx_flags; /** * Stream creation flags */ -typedef enum CUstream_flags_enum { - CU_STREAM_DEFAULT = 0x0, /**< Default stream flag */ - CU_STREAM_NON_BLOCKING = 0x1 /**< Stream does not synchronize with stream 0 (the NULL stream) */ +typedef enum CUstream_flags_enum +{ + CU_STREAM_DEFAULT = 0x0, /**< Default stream flag */ + CU_STREAM_NON_BLOCKING = 0x1 /**< Stream does not synchronize with stream 0 (the NULL stream) */ + } CUstream_flags; /** * Event creation flags */ -typedef enum CUevent_flags_enum { - CU_EVENT_DEFAULT = 0x0, /**< Default event flag */ - CU_EVENT_BLOCKING_SYNC = 0x1, /**< Event uses blocking synchronization */ - CU_EVENT_DISABLE_TIMING = 0x2, /**< Event will not record timing data */ - CU_EVENT_INTERPROCESS = 0x4 /**< Event is suitable for interprocess use. 
CU_EVENT_DISABLE_TIMING must be set */ +typedef enum CUevent_flags_enum +{ + CU_EVENT_DEFAULT = 0x0, /**< Default event flag */ + CU_EVENT_BLOCKING_SYNC = 0x1, /**< Event uses blocking synchronization */ + CU_EVENT_DISABLE_TIMING = 0x2, /**< Event will not record timing data */ + CU_EVENT_INTERPROCESS = 0x4 /**< Event is suitable for interprocess use. CU_EVENT_DISABLE_TIMING must be set */ + } CUevent_flags; typedef enum CUjitInputType_enum { - /** - * Compiled device-class-specific device code\n - * Applicable options: none - */ - CU_JIT_INPUT_CUBIN = 0, - - /** - * PTX source code\n - * Applicable options: PTX compiler options - */ - CU_JIT_INPUT_PTX, - - /** - * Bundle of multiple cubins and/or PTX of some device code\n - * Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY - */ - CU_JIT_INPUT_FATBINARY, - - /** - * Host object with embedded device code\n - * Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY - */ - CU_JIT_INPUT_OBJECT, - - /** - * Archive of host objects with embedded device code\n - * Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY - */ - CU_JIT_INPUT_LIBRARY, - - CU_JIT_NUM_INPUT_TYPES + /** + * Compiled device-class-specific device code\n + * Applicable options: none + */ + CU_JIT_INPUT_CUBIN = 0, + + /** + * PTX source code\n + * Applicable options: PTX compiler options + */ + CU_JIT_INPUT_PTX, + + /** + * Bundle of multiple cubins and/or PTX of some device code\n + * Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY + */ + CU_JIT_INPUT_FATBINARY, + + /** + * Host object with embedded device code\n + * Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY + */ + CU_JIT_INPUT_OBJECT, + + /** + * Archive of host objects with embedded device code\n + * Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY + */ + CU_JIT_INPUT_LIBRARY, + + CU_JIT_NUM_INPUT_TYPES + } CUjitInputType; #ifdef _WIN32 @@ -1119,4 +1136,50 @@ typedef struct hc_cuda_lib typedef hc_cuda_lib_t CUDA_PTR; +int cuda_init (void *hashcat_ctx); +void cuda_close (void *hashcat_ctx); + +int hc_cuCtxCreate (void *hashcat_ctx, CUcontext *pctx, unsigned int flags, CUdevice dev); +int hc_cuCtxDestroy (void *hashcat_ctx, CUcontext ctx); +int hc_cuCtxSetCurrent (void *hashcat_ctx, CUcontext ctx); +int hc_cuCtxSetCacheConfig (void *hashcat_ctx, CUfunc_cache config); +int hc_cuCtxSynchronize (void *hashcat_ctx); +int hc_cuDeviceGetAttribute (void *hashcat_ctx, int *pi, CUdevice_attribute attrib, CUdevice dev); +int hc_cuDeviceGetCount (void *hashcat_ctx, int *count); +int hc_cuDeviceGet (void *hashcat_ctx, CUdevice *device, int ordinal); +int hc_cuDeviceGetName (void *hashcat_ctx, char *name, int len, CUdevice dev); +int hc_cuDeviceTotalMem (void *hashcat_ctx, size_t *bytes, CUdevice dev); +int hc_cuDriverGetVersion (void *hashcat_ctx, int *driverVersion); +int hc_cuEventCreate (void *hashcat_ctx, CUevent *phEvent, unsigned int Flags); +int hc_cuEventDestroy (void *hashcat_ctx, CUevent hEvent); +int hc_cuEventElapsedTime (void *hashcat_ctx, float *pMilliseconds, CUevent hStart, CUevent hEnd); +int hc_cuEventQuery (void *hashcat_ctx, CUevent hEvent); +int hc_cuEventRecord (void *hashcat_ctx, CUevent hEvent, CUstream hStream); +int hc_cuEventSynchronize (void *hashcat_ctx, CUevent hEvent); +int hc_cuFuncGetAttribute (void *hashcat_ctx, int *pi, CUfunction_attribute attrib, CUfunction hfunc); +int hc_cuFuncSetAttribute (void *hashcat_ctx, CUfunction hfunc, CUfunction_attribute attrib, int value); +int hc_cuInit 
(void *hashcat_ctx, unsigned int Flags);
+int hc_cuLaunchKernel (void *hashcat_ctx, CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra);
+int hc_cuMemAlloc (void *hashcat_ctx, CUdeviceptr *dptr, size_t bytesize);
+int hc_cuMemcpyDtoDAsync (void *hashcat_ctx, CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
+int hc_cuMemcpyDtoHAsync (void *hashcat_ctx, void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
+int hc_cuMemcpyHtoDAsync (void *hashcat_ctx, CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream);
+int hc_cuMemFree (void *hashcat_ctx, CUdeviceptr dptr);
+int hc_cuMemGetInfo (void *hashcat_ctx, size_t *free, size_t *total);
+int hc_cuMemsetD32Async (void *hashcat_ctx, CUdeviceptr dstDevice, unsigned int ui, size_t N, CUstream hStream);
+int hc_cuMemsetD8Async (void *hashcat_ctx, CUdeviceptr dstDevice, unsigned char uc, size_t N, CUstream hStream);
+int hc_cuModuleGetFunction (void *hashcat_ctx, CUfunction *hfunc, CUmodule hmod, const char *name);
+int hc_cuModuleGetGlobal (void *hashcat_ctx, CUdeviceptr *dptr, size_t *bytes, CUmodule hmod, const char *name);
+int hc_cuModuleLoadDataEx (void *hashcat_ctx, CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues);
+int hc_cuModuleUnload (void *hashcat_ctx, CUmodule hmod);
+int hc_cuStreamCreate (void *hashcat_ctx, CUstream *phStream, unsigned int Flags);
+int hc_cuStreamDestroy (void *hashcat_ctx, CUstream hStream);
+int hc_cuStreamSynchronize (void *hashcat_ctx, CUstream hStream);
+int hc_cuCtxPushCurrent (void *hashcat_ctx, CUcontext ctx);
+int hc_cuCtxPopCurrent (void *hashcat_ctx, CUcontext *pctx);
+int hc_cuLinkCreate (void *hashcat_ctx, unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut);
+int hc_cuLinkAddData (void *hashcat_ctx, CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name, unsigned int numOptions, CUjit_option *options, void **optionValues);
+int hc_cuLinkDestroy (void *hashcat_ctx, CUlinkState state);
+int hc_cuLinkComplete (void *hashcat_ctx, CUlinkState state, void **cubinOut, size_t *sizeOut);
+
#endif // _EXT_CUDA_H
diff --git a/include/ext_nvrtc.h b/include/ext_nvrtc.h
index 21485c999..ee46fc35c 100644
--- a/include/ext_nvrtc.h
+++ b/include/ext_nvrtc.h
@@ -84,4 +84,16 @@ typedef hc_nvrtc_lib_t NVRTC_PTR;
int nvrtc_make_options_array_from_string (char *string, char **options);
+int nvrtc_init (void *hashcat_ctx);
+void nvrtc_close (void *hashcat_ctx);
+
+int hc_nvrtcCreateProgram (void *hashcat_ctx, nvrtcProgram *prog, const char *src, const char *name, int numHeaders, const char * const *headers, const char * const *includeNames);
+int hc_nvrtcDestroyProgram (void *hashcat_ctx, nvrtcProgram *prog);
+int hc_nvrtcCompileProgram (void *hashcat_ctx, nvrtcProgram prog, int numOptions, const char * const *options);
+int hc_nvrtcGetProgramLogSize (void *hashcat_ctx, nvrtcProgram prog, size_t *logSizeRet);
+int hc_nvrtcGetProgramLog (void *hashcat_ctx, nvrtcProgram prog, char *log);
+int hc_nvrtcGetPTXSize (void *hashcat_ctx, nvrtcProgram prog, size_t *ptxSizeRet);
+int hc_nvrtcGetPTX (void *hashcat_ctx, nvrtcProgram prog, char *ptx);
+int hc_nvrtcVersion (void *hashcat_ctx, int *major, int *minor);
+
#endif // _EXT_NVRTC_H
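Every hc_cu* wrapper declared above follows the same pattern, visible in the removed src/backend.c code below: fetch the function pointer from the dynamically loaded library, call it, and translate any non-CUDA_SUCCESS result into an event log message and a -1 return value. A minimal sketch of that shape; hc_cuExample and the cuExample member are placeholders, and the cast of void *hashcat_ctx back to hashcat_ctx_t * is an assumption based on the old hashcat_ctx_t * signatures:

int hc_cuExample (void *hashcat_ctx, CUdevice dev)
{
  backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx;

  CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda;

  const CUresult CU_err = cuda->cuExample (dev); // hypothetical entry point

  if (CU_err != CUDA_SUCCESS)
  {
    const char *pStr = NULL;

    // prefer the human-readable error string, fall back to the raw code
    if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS)
    {
      event_log_error (hashcat_ctx, "cuExample(): %s", pStr);
    }
    else
    {
      event_log_error (hashcat_ctx, "cuExample(): %d", CU_err);
    }

    return -1;
  }

  return 0;
}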
diff --git a/src/backend.c b/src/backend.c
index cbc81f84a..ab7b44825 100644
--- a/src/backend.c
+++ b/src/backend.c
@@ -755,1504 +755,6 @@ void generate_cached_kernel_amp_filename (const u32 attack_kern, char *cache_dir
  snprintf (cached_file, 255, "%s/kernels/amp_a%u.%s.kernel", cache_dir, attack_kern, device_name_chksum_amp_mp);
}
-// NVRTC
-
-int nvrtc_init (hashcat_ctx_t *hashcat_ctx)
-{
- backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
-
- NVRTC_PTR *nvrtc = (NVRTC_PTR *) backend_ctx->nvrtc;
-
- memset (nvrtc, 0, sizeof (NVRTC_PTR));
-
- #if defined (_WIN)
- nvrtc->lib = hc_dlopen ("nvrtc.dll");
-
- if (nvrtc->lib == NULL)
- {
- // super annoying: nvidia is using the CUDA version in nvrtc???.dll filename!
- // however, the cuda version string comes from nvcuda.dll which is from nvidia driver, but
- // the driver version and the installed CUDA toolkit version can be different, so it cannot be used as a reference.
- // brute force to the rescue
-
- char dllname[100];
-
- for (int major = 20; major >= 9; major--) // older than 3.x do not ship _v2 functions anyway
- // older than 7.x does not support sm 5.x
- // older than 8.x does not have documentation archive online, no way to check if nvrtc support whatever we need
- // older than 9.x is just a theoretical limit since we define 9.0 as the minimum required version
- {
- for (int minor = 20; minor >= 0; minor--)
- {
- snprintf (dllname, sizeof (dllname), "nvrtc64_%d%d.dll", major, minor);
-
- nvrtc->lib = hc_dlopen (dllname);
-
- if (nvrtc->lib) break;
-
- snprintf (dllname, sizeof (dllname), "nvrtc64_%d%d_0.dll", major, minor);
-
- nvrtc->lib = hc_dlopen (dllname);
-
- if (nvrtc->lib) break;
- }
-
- if (nvrtc->lib) break;
- }
- }
- #elif defined (__APPLE__)
- nvrtc->lib = hc_dlopen ("nvrtc.dylib");
- #elif defined (__CYGWIN__)
- nvrtc->lib = hc_dlopen ("nvrtc.dll");
- #else
- nvrtc->lib = hc_dlopen ("libnvrtc.so");
-
- if (nvrtc->lib == NULL) nvrtc->lib = hc_dlopen ("libnvrtc.so.1");
- #endif
-
- if (nvrtc->lib == NULL) return -1;
-
- HC_LOAD_FUNC (nvrtc, nvrtcAddNameExpression, NVRTC_NVRTCADDNAMEEXPRESSION, NVRTC, 1);
- HC_LOAD_FUNC (nvrtc, nvrtcCompileProgram, NVRTC_NVRTCCOMPILEPROGRAM, NVRTC, 1);
- HC_LOAD_FUNC (nvrtc, nvrtcCreateProgram, NVRTC_NVRTCCREATEPROGRAM, NVRTC, 1);
- HC_LOAD_FUNC (nvrtc, nvrtcDestroyProgram, NVRTC_NVRTCDESTROYPROGRAM, NVRTC, 1);
- HC_LOAD_FUNC (nvrtc, nvrtcGetLoweredName, NVRTC_NVRTCGETLOWEREDNAME, NVRTC, 1);
- HC_LOAD_FUNC (nvrtc, nvrtcGetPTX, NVRTC_NVRTCGETPTX, NVRTC, 1);
- HC_LOAD_FUNC (nvrtc, nvrtcGetPTXSize, NVRTC_NVRTCGETPTXSIZE, NVRTC, 1);
- HC_LOAD_FUNC (nvrtc, nvrtcGetProgramLog, NVRTC_NVRTCGETPROGRAMLOG, NVRTC, 1);
- HC_LOAD_FUNC (nvrtc, nvrtcGetProgramLogSize, NVRTC_NVRTCGETPROGRAMLOGSIZE, NVRTC, 1);
- HC_LOAD_FUNC (nvrtc, nvrtcGetErrorString, NVRTC_NVRTCGETERRORSTRING, NVRTC, 1);
- HC_LOAD_FUNC (nvrtc, nvrtcVersion, NVRTC_NVRTCVERSION, NVRTC, 1);
-
- return 0;
-}
-
-void nvrtc_close (hashcat_ctx_t *hashcat_ctx)
-{
- backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
-
- NVRTC_PTR *nvrtc = (NVRTC_PTR *) backend_ctx->nvrtc;
-
- if (nvrtc)
- {
- if (nvrtc->lib)
- {
- hc_dlclose (nvrtc->lib);
- }
-
- hcfree (backend_ctx->nvrtc);
-
- backend_ctx->nvrtc = NULL;
- }
-}
-
-int hc_nvrtcCreateProgram (hashcat_ctx_t *hashcat_ctx, nvrtcProgram *prog, const char *src, const char *name, int numHeaders, const char * const *headers, const char * const *includeNames)
-{
- backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
-
- NVRTC_PTR *nvrtc = (NVRTC_PTR *) backend_ctx->nvrtc;
-
- const nvrtcResult
NVRTC_err = nvrtc->nvrtcCreateProgram (prog, src, name, numHeaders, headers, includeNames); - - if (NVRTC_err != NVRTC_SUCCESS) - { - event_log_error (hashcat_ctx, "nvrtcCreateProgram(): %s", nvrtc->nvrtcGetErrorString (NVRTC_err)); - - return -1; - } - - return 0; -} - -int hc_nvrtcDestroyProgram (hashcat_ctx_t *hashcat_ctx, nvrtcProgram *prog) -{ - backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - - NVRTC_PTR *nvrtc = (NVRTC_PTR *) backend_ctx->nvrtc; - - const nvrtcResult NVRTC_err = nvrtc->nvrtcDestroyProgram (prog); - - if (NVRTC_err != NVRTC_SUCCESS) - { - event_log_error (hashcat_ctx, "nvrtcDestroyProgram(): %s", nvrtc->nvrtcGetErrorString (NVRTC_err)); - - return -1; - } - - return 0; -} - -int hc_nvrtcCompileProgram (hashcat_ctx_t *hashcat_ctx, nvrtcProgram prog, int numOptions, const char * const *options) -{ - backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - - NVRTC_PTR *nvrtc = (NVRTC_PTR *) backend_ctx->nvrtc; - - const nvrtcResult NVRTC_err = nvrtc->nvrtcCompileProgram (prog, numOptions, options); - - if (NVRTC_err != NVRTC_SUCCESS) - { - event_log_error (hashcat_ctx, "nvrtcCompileProgram(): %s", nvrtc->nvrtcGetErrorString (NVRTC_err)); - - return -1; - } - - return 0; -} - -int hc_nvrtcGetProgramLogSize (hashcat_ctx_t *hashcat_ctx, nvrtcProgram prog, size_t *logSizeRet) -{ - backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - - NVRTC_PTR *nvrtc = (NVRTC_PTR *) backend_ctx->nvrtc; - - const nvrtcResult NVRTC_err = nvrtc->nvrtcGetProgramLogSize (prog, logSizeRet); - - if (NVRTC_err != NVRTC_SUCCESS) - { - event_log_error (hashcat_ctx, "nvrtcGetProgramLogSize(): %s", nvrtc->nvrtcGetErrorString (NVRTC_err)); - - return -1; - } - - return 0; -} - -int hc_nvrtcGetProgramLog (hashcat_ctx_t *hashcat_ctx, nvrtcProgram prog, char *log) -{ - backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - - NVRTC_PTR *nvrtc = (NVRTC_PTR *) backend_ctx->nvrtc; - - const nvrtcResult NVRTC_err = nvrtc->nvrtcGetProgramLog (prog, log); - - if (NVRTC_err != NVRTC_SUCCESS) - { - event_log_error (hashcat_ctx, "nvrtcGetProgramLog(): %s", nvrtc->nvrtcGetErrorString (NVRTC_err)); - - return -1; - } - - return 0; -} - -int hc_nvrtcGetPTXSize (hashcat_ctx_t *hashcat_ctx, nvrtcProgram prog, size_t *ptxSizeRet) -{ - backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - - NVRTC_PTR *nvrtc = (NVRTC_PTR *) backend_ctx->nvrtc; - - const nvrtcResult NVRTC_err = nvrtc->nvrtcGetPTXSize (prog, ptxSizeRet); - - if (NVRTC_err != NVRTC_SUCCESS) - { - event_log_error (hashcat_ctx, "nvrtcGetPTXSize(): %s", nvrtc->nvrtcGetErrorString (NVRTC_err)); - - return -1; - } - - return 0; -} - -int hc_nvrtcGetPTX (hashcat_ctx_t *hashcat_ctx, nvrtcProgram prog, char *ptx) -{ - backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - - NVRTC_PTR *nvrtc = (NVRTC_PTR *) backend_ctx->nvrtc; - - const nvrtcResult NVRTC_err = nvrtc->nvrtcGetPTX (prog, ptx); - - if (NVRTC_err != NVRTC_SUCCESS) - { - event_log_error (hashcat_ctx, "nvrtcGetPTX(): %s", nvrtc->nvrtcGetErrorString (NVRTC_err)); - - return -1; - } - - return 0; -} - -int hc_nvrtcVersion (hashcat_ctx_t *hashcat_ctx, int *major, int *minor) -{ - backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - - NVRTC_PTR *nvrtc = (NVRTC_PTR *) backend_ctx->nvrtc; - - const nvrtcResult NVRTC_err = nvrtc->nvrtcVersion (major, minor); - - if (NVRTC_err != NVRTC_SUCCESS) - { - event_log_error (hashcat_ctx, "nvrtcVersion(): %s", nvrtc->nvrtcGetErrorString (NVRTC_err)); - - return -1; - } - - return 0; -} - -// CUDA - -int cuda_init (hashcat_ctx_t *hashcat_ctx) -{ - 
backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - - CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - - memset (cuda, 0, sizeof (CUDA_PTR)); - - #if defined (_WIN) - cuda->lib = hc_dlopen ("nvcuda.dll"); - #elif defined (__APPLE__) - cuda->lib = hc_dlopen ("nvcuda.dylib"); - #elif defined (__CYGWIN__) - cuda->lib = hc_dlopen ("nvcuda.dll"); - #else - cuda->lib = hc_dlopen ("libcuda.so"); - - if (cuda->lib == NULL) cuda->lib = hc_dlopen ("libcuda.so.1"); - #endif - - if (cuda->lib == NULL) return -1; - - #define HC_LOAD_FUNC_CUDA(ptr,name,cudaname,type,libname,noerr) \ - do { \ - ptr->name = (type) hc_dlsym ((ptr)->lib, #cudaname); \ - if ((noerr) != -1) { \ - if (!(ptr)->name) { \ - if ((noerr) == 1) { \ - event_log_error (hashcat_ctx, "%s is missing from %s shared library.", #name, #libname); \ - return -1; \ - } \ - if ((noerr) != 1) { \ - event_log_warning (hashcat_ctx, "%s is missing from %s shared library.", #name, #libname); \ - return 0; \ - } \ - } \ - } \ - } while (0) - - // finding the right symbol is a PITA, because of the _v2 suffix - // a good reference is cuda.h itself - // this needs to be verified for each new cuda release - - HC_LOAD_FUNC_CUDA (cuda, cuCtxCreate, cuCtxCreate_v2, CUDA_CUCTXCREATE, CUDA, 1); - HC_LOAD_FUNC_CUDA (cuda, cuCtxDestroy, cuCtxDestroy_v2, CUDA_CUCTXDESTROY, CUDA, 1); - HC_LOAD_FUNC_CUDA (cuda, cuCtxGetCacheConfig, cuCtxGetCacheConfig, CUDA_CUCTXGETCACHECONFIG, CUDA, 1); - HC_LOAD_FUNC_CUDA (cuda, cuCtxGetCurrent, cuCtxGetCurrent, CUDA_CUCTXGETCURRENT, CUDA, 1); - HC_LOAD_FUNC_CUDA (cuda, cuCtxGetSharedMemConfig, cuCtxGetSharedMemConfig, CUDA_CUCTXGETSHAREDMEMCONFIG, CUDA, 1); - HC_LOAD_FUNC_CUDA (cuda, cuCtxPopCurrent, cuCtxPopCurrent_v2, CUDA_CUCTXPOPCURRENT, CUDA, 1); - HC_LOAD_FUNC_CUDA (cuda, cuCtxPushCurrent, cuCtxPushCurrent_v2, CUDA_CUCTXPUSHCURRENT, CUDA, 1); - HC_LOAD_FUNC_CUDA (cuda, cuCtxSetCacheConfig, cuCtxSetCacheConfig, CUDA_CUCTXSETCACHECONFIG, CUDA, 1); - HC_LOAD_FUNC_CUDA (cuda, cuCtxSetCurrent, cuCtxSetCurrent, CUDA_CUCTXSETCURRENT, CUDA, 1); - HC_LOAD_FUNC_CUDA (cuda, cuCtxSetSharedMemConfig, cuCtxSetSharedMemConfig, CUDA_CUCTXSETSHAREDMEMCONFIG, CUDA, 1); - HC_LOAD_FUNC_CUDA (cuda, cuCtxSynchronize, cuCtxSynchronize, CUDA_CUCTXSYNCHRONIZE, CUDA, 1); - HC_LOAD_FUNC_CUDA (cuda, cuDeviceGetAttribute, cuDeviceGetAttribute, CUDA_CUDEVICEGETATTRIBUTE, CUDA, 1); - HC_LOAD_FUNC_CUDA (cuda, cuDeviceGetCount, cuDeviceGetCount, CUDA_CUDEVICEGETCOUNT, CUDA, 1); - HC_LOAD_FUNC_CUDA (cuda, cuDeviceGet, cuDeviceGet, CUDA_CUDEVICEGET, CUDA, 1); - HC_LOAD_FUNC_CUDA (cuda, cuDeviceGetName, cuDeviceGetName, CUDA_CUDEVICEGETNAME, CUDA, 1); - HC_LOAD_FUNC_CUDA (cuda, cuDeviceTotalMem, cuDeviceTotalMem_v2, CUDA_CUDEVICETOTALMEM, CUDA, 1); - HC_LOAD_FUNC_CUDA (cuda, cuDriverGetVersion, cuDriverGetVersion, CUDA_CUDRIVERGETVERSION, CUDA, 1); - HC_LOAD_FUNC_CUDA (cuda, cuEventCreate, cuEventCreate, CUDA_CUEVENTCREATE, CUDA, 1); - HC_LOAD_FUNC_CUDA (cuda, cuEventDestroy, cuEventDestroy_v2, CUDA_CUEVENTDESTROY, CUDA, 1); - HC_LOAD_FUNC_CUDA (cuda, cuEventElapsedTime, cuEventElapsedTime, CUDA_CUEVENTELAPSEDTIME, CUDA, 1); - HC_LOAD_FUNC_CUDA (cuda, cuEventQuery, cuEventQuery, CUDA_CUEVENTQUERY, CUDA, 1); - HC_LOAD_FUNC_CUDA (cuda, cuEventRecord, cuEventRecord, CUDA_CUEVENTRECORD, CUDA, 1); - HC_LOAD_FUNC_CUDA (cuda, cuEventSynchronize, cuEventSynchronize, CUDA_CUEVENTSYNCHRONIZE, CUDA, 1); - HC_LOAD_FUNC_CUDA (cuda, cuFuncGetAttribute, cuFuncGetAttribute, CUDA_CUFUNCGETATTRIBUTE, CUDA, 1); - HC_LOAD_FUNC_CUDA (cuda, cuFuncSetAttribute, 
cuFuncSetAttribute, CUDA_CUFUNCSETATTRIBUTE, CUDA, 1); - HC_LOAD_FUNC_CUDA (cuda, cuFuncSetCacheConfig, cuFuncSetCacheConfig, CUDA_CUFUNCSETCACHECONFIG, CUDA, 1); - HC_LOAD_FUNC_CUDA (cuda, cuFuncSetSharedMemConfig, cuFuncSetSharedMemConfig, CUDA_CUFUNCSETSHAREDMEMCONFIG, CUDA, 1); - HC_LOAD_FUNC_CUDA (cuda, cuGetErrorName, cuGetErrorName, CUDA_CUGETERRORNAME, CUDA, 1); - HC_LOAD_FUNC_CUDA (cuda, cuGetErrorString, cuGetErrorString, CUDA_CUGETERRORSTRING, CUDA, 1); - HC_LOAD_FUNC_CUDA (cuda, cuInit, cuInit, CUDA_CUINIT, CUDA, 1); - HC_LOAD_FUNC_CUDA (cuda, cuLaunchKernel, cuLaunchKernel, CUDA_CULAUNCHKERNEL, CUDA, 1); - HC_LOAD_FUNC_CUDA (cuda, cuMemAlloc, cuMemAlloc_v2, CUDA_CUMEMALLOC, CUDA, 1); - HC_LOAD_FUNC_CUDA (cuda, cuMemAllocHost, cuMemAllocHost_v2, CUDA_CUMEMALLOCHOST, CUDA, 1); - HC_LOAD_FUNC_CUDA (cuda, cuMemcpyDtoDAsync, cuMemcpyDtoDAsync_v2, CUDA_CUMEMCPYDTODASYNC, CUDA, 1); - HC_LOAD_FUNC_CUDA (cuda, cuMemcpyDtoHAsync, cuMemcpyDtoHAsync_v2, CUDA_CUMEMCPYDTOHASYNC, CUDA, 1); - HC_LOAD_FUNC_CUDA (cuda, cuMemcpyHtoDAsync, cuMemcpyHtoDAsync_v2, CUDA_CUMEMCPYHTODASYNC, CUDA, 1); - HC_LOAD_FUNC_CUDA (cuda, cuMemFree, cuMemFree_v2, CUDA_CUMEMFREE, CUDA, 1); - HC_LOAD_FUNC_CUDA (cuda, cuMemFreeHost, cuMemFreeHost, CUDA_CUMEMFREEHOST, CUDA, 1); - HC_LOAD_FUNC_CUDA (cuda, cuMemGetInfo, cuMemGetInfo_v2, CUDA_CUMEMGETINFO, CUDA, 1); - HC_LOAD_FUNC_CUDA (cuda, cuMemsetD32Async, cuMemsetD32Async, CUDA_CUMEMSETD32ASYNC, CUDA, 1); - HC_LOAD_FUNC_CUDA (cuda, cuMemsetD8Async, cuMemsetD8Async, CUDA_CUMEMSETD8ASYNC, CUDA, 1); - HC_LOAD_FUNC_CUDA (cuda, cuModuleGetFunction, cuModuleGetFunction, CUDA_CUMODULEGETFUNCTION, CUDA, 1); - HC_LOAD_FUNC_CUDA (cuda, cuModuleGetGlobal, cuModuleGetGlobal_v2, CUDA_CUMODULEGETGLOBAL, CUDA, 1); - HC_LOAD_FUNC_CUDA (cuda, cuModuleLoad, cuModuleLoad, CUDA_CUMODULELOAD, CUDA, 1); - HC_LOAD_FUNC_CUDA (cuda, cuModuleLoadData, cuModuleLoadData, CUDA_CUMODULELOADDATA, CUDA, 1); - HC_LOAD_FUNC_CUDA (cuda, cuModuleLoadDataEx, cuModuleLoadDataEx, CUDA_CUMODULELOADDATAEX, CUDA, 1); - HC_LOAD_FUNC_CUDA (cuda, cuModuleUnload, cuModuleUnload, CUDA_CUMODULEUNLOAD, CUDA, 1); - HC_LOAD_FUNC_CUDA (cuda, cuProfilerStart, cuProfilerStart, CUDA_CUPROFILERSTART, CUDA, 1); - HC_LOAD_FUNC_CUDA (cuda, cuProfilerStop, cuProfilerStop, CUDA_CUPROFILERSTOP, CUDA, 1); - HC_LOAD_FUNC_CUDA (cuda, cuStreamCreate, cuStreamCreate, CUDA_CUSTREAMCREATE, CUDA, 1); - HC_LOAD_FUNC_CUDA (cuda, cuStreamDestroy, cuStreamDestroy_v2, CUDA_CUSTREAMDESTROY, CUDA, 1); - HC_LOAD_FUNC_CUDA (cuda, cuStreamSynchronize, cuStreamSynchronize, CUDA_CUSTREAMSYNCHRONIZE, CUDA, 1); - HC_LOAD_FUNC_CUDA (cuda, cuStreamWaitEvent, cuStreamWaitEvent, CUDA_CUSTREAMWAITEVENT, CUDA, 1); - #if defined (WITH_CUBIN) - HC_LOAD_FUNC_CUDA (cuda, cuLinkCreate, cuLinkCreate_v2, CUDA_CULINKCREATE, CUDA, 1); - HC_LOAD_FUNC_CUDA (cuda, cuLinkAddData, cuLinkAddData_v2, CUDA_CULINKADDDATA, CUDA, 1); - HC_LOAD_FUNC_CUDA (cuda, cuLinkDestroy, cuLinkDestroy, CUDA_CULINKDESTROY, CUDA, 1); - HC_LOAD_FUNC_CUDA (cuda, cuLinkComplete, cuLinkComplete, CUDA_CULINKCOMPLETE, CUDA, 1); - #endif - - return 0; -} - -void cuda_close (hashcat_ctx_t *hashcat_ctx) -{ - backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - - CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - - if (cuda) - { - if (cuda->lib) - { - hc_dlclose (cuda->lib); - } - - hcfree (backend_ctx->cuda); - - backend_ctx->cuda = NULL; - } -} - -int hc_cuInit (hashcat_ctx_t *hashcat_ctx, unsigned int Flags) -{ - backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - - CUDA_PTR *cuda = 
(CUDA_PTR *) backend_ctx->cuda; - - const CUresult CU_err = cuda->cuInit (Flags); - - if (CU_err != CUDA_SUCCESS) - { - const char *pStr = NULL; - - if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) - { - event_log_error (hashcat_ctx, "cuInit(): %s", pStr); - } - else - { - event_log_error (hashcat_ctx, "cuInit(): %d", CU_err); - } - - return -1; - } - - return 0; -} - -int hc_cuDeviceGetAttribute (hashcat_ctx_t *hashcat_ctx, int *pi, CUdevice_attribute attrib, CUdevice dev) -{ - backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - - CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - - const CUresult CU_err = cuda->cuDeviceGetAttribute (pi, attrib, dev); - - if (CU_err != CUDA_SUCCESS) - { - const char *pStr = NULL; - - if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) - { - event_log_error (hashcat_ctx, "cuDeviceGetAttribute(): %s", pStr); - } - else - { - event_log_error (hashcat_ctx, "cuDeviceGetAttribute(): %d", CU_err); - } - - return -1; - } - - return 0; -} - -int hc_cuDeviceGetCount (hashcat_ctx_t *hashcat_ctx, int *count) -{ - backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - - CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - - const CUresult CU_err = cuda->cuDeviceGetCount (count); - - if (CU_err != CUDA_SUCCESS) - { - const char *pStr = NULL; - - if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) - { - event_log_error (hashcat_ctx, "cuDeviceGetCount(): %s", pStr); - } - else - { - event_log_error (hashcat_ctx, "cuDeviceGetCount(): %d", CU_err); - } - - return -1; - } - - return 0; -} - -int hc_cuDeviceGet (hashcat_ctx_t *hashcat_ctx, CUdevice* device, int ordinal) -{ - backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - - CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - - const CUresult CU_err = cuda->cuDeviceGet (device, ordinal); - - if (CU_err != CUDA_SUCCESS) - { - const char *pStr = NULL; - - if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) - { - event_log_error (hashcat_ctx, "cuDeviceGet(): %s", pStr); - } - else - { - event_log_error (hashcat_ctx, "cuDeviceGet(): %d", CU_err); - } - - return -1; - } - - return 0; -} - -int hc_cuDeviceGetName (hashcat_ctx_t *hashcat_ctx, char *name, int len, CUdevice dev) -{ - backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - - CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - - const CUresult CU_err = cuda->cuDeviceGetName (name, len, dev); - - if (CU_err != CUDA_SUCCESS) - { - const char *pStr = NULL; - - if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) - { - event_log_error (hashcat_ctx, "cuDeviceGetName(): %s", pStr); - } - else - { - event_log_error (hashcat_ctx, "cuDeviceGetName(): %d", CU_err); - } - - return -1; - } - - return 0; -} - -int hc_cuDeviceTotalMem (hashcat_ctx_t *hashcat_ctx, size_t *bytes, CUdevice dev) -{ - backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - - CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - - const CUresult CU_err = cuda->cuDeviceTotalMem (bytes, dev); - - if (CU_err != CUDA_SUCCESS) - { - const char *pStr = NULL; - - if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) - { - event_log_error (hashcat_ctx, "cuDeviceTotalMem(): %s", pStr); - } - else - { - event_log_error (hashcat_ctx, "cuDeviceTotalMem(): %d", CU_err); - } - - return -1; - } - - return 0; -} - -int hc_cuDriverGetVersion (hashcat_ctx_t *hashcat_ctx, int *driverVersion) -{ - backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - - CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - - const CUresult CU_err = cuda->cuDriverGetVersion 
(driverVersion); - - if (CU_err != CUDA_SUCCESS) - { - const char *pStr = NULL; - - if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) - { - event_log_error (hashcat_ctx, "cuDriverGetVersion(): %s", pStr); - } - else - { - event_log_error (hashcat_ctx, "cuDriverGetVersion(): %d", CU_err); - } - - return -1; - } - - return 0; -} - -int hc_cuCtxCreate (hashcat_ctx_t *hashcat_ctx, CUcontext *pctx, unsigned int flags, CUdevice dev) -{ - backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - - CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - - const CUresult CU_err = cuda->cuCtxCreate (pctx, flags, dev); - - if (CU_err != CUDA_SUCCESS) - { - const char *pStr = NULL; - - if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) - { - event_log_error (hashcat_ctx, "cuCtxCreate(): %s", pStr); - } - else - { - event_log_error (hashcat_ctx, "cuCtxCreate(): %d", CU_err); - } - - return -1; - } - - return 0; -} - -int hc_cuCtxDestroy (hashcat_ctx_t *hashcat_ctx, CUcontext ctx) -{ - backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - - CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - - const CUresult CU_err = cuda->cuCtxDestroy (ctx); - - if (CU_err != CUDA_SUCCESS) - { - const char *pStr = NULL; - - if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) - { - event_log_error (hashcat_ctx, "cuCtxDestroy(): %s", pStr); - } - else - { - event_log_error (hashcat_ctx, "cuCtxDestroy(): %d", CU_err); - } - - return -1; - } - - return 0; -} - -int hc_cuModuleLoadDataEx (hashcat_ctx_t *hashcat_ctx, CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues) -{ - backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - - CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - - const CUresult CU_err = cuda->cuModuleLoadDataEx (module, image, numOptions, options, optionValues); - - if (CU_err != CUDA_SUCCESS) - { - const char *pStr = NULL; - - if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) - { - event_log_error (hashcat_ctx, "cuModuleLoadDataEx(): %s", pStr); - } - else - { - event_log_error (hashcat_ctx, "cuModuleLoadDataEx(): %d", CU_err); - } - - return -1; - } - - return 0; -} - -int hc_cuModuleUnload (hashcat_ctx_t *hashcat_ctx, CUmodule hmod) -{ - backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - - CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - - const CUresult CU_err = cuda->cuModuleUnload (hmod); - - if (CU_err != CUDA_SUCCESS) - { - const char *pStr = NULL; - - if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) - { - event_log_error (hashcat_ctx, "cuModuleUnload(): %s", pStr); - } - else - { - event_log_error (hashcat_ctx, "cuModuleUnload(): %d", CU_err); - } - - return -1; - } - - return 0; -} - -int hc_cuCtxSetCurrent (hashcat_ctx_t *hashcat_ctx, CUcontext ctx) -{ - backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - - CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - - const CUresult CU_err = cuda->cuCtxSetCurrent (ctx); - - if (CU_err != CUDA_SUCCESS) - { - const char *pStr = NULL; - - if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) - { - event_log_error (hashcat_ctx, "cuCtxSetCurrent(): %s", pStr); - } - else - { - event_log_error (hashcat_ctx, "cuCtxSetCurrent(): %d", CU_err); - } - - return -1; - } - - return 0; -} - -int hc_cuMemAlloc (hashcat_ctx_t *hashcat_ctx, CUdeviceptr *dptr, size_t bytesize) -{ - backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - - CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - - const CUresult CU_err = cuda->cuMemAlloc (dptr, bytesize); - - if 
(CU_err != CUDA_SUCCESS) - { - const char *pStr = NULL; - - if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) - { - event_log_error (hashcat_ctx, "cuMemAlloc(): %s", pStr); - } - else - { - event_log_error (hashcat_ctx, "cuMemAlloc(): %d", CU_err); - } - - return -1; - } - - return 0; -} - -int hc_cuMemFree (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dptr) -{ - backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - - CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - - const CUresult CU_err = cuda->cuMemFree (dptr); - - if (CU_err != CUDA_SUCCESS) - { - const char *pStr = NULL; - - if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) - { - event_log_error (hashcat_ctx, "cuMemFree(): %s", pStr); - } - else - { - event_log_error (hashcat_ctx, "cuMemFree(): %d", CU_err); - } - - return -1; - } - - return 0; -} - -int hc_cuMemcpyDtoHAsync (hashcat_ctx_t *hashcat_ctx, void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream) -{ - backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - - CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - - const CUresult CU_err = cuda->cuMemcpyDtoHAsync (dstHost, srcDevice, ByteCount, hStream); - - if (CU_err != CUDA_SUCCESS) - { - const char *pStr = NULL; - - if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) - { - event_log_error (hashcat_ctx, "cuMemcpyDtoHAsync(): %s", pStr); - } - else - { - event_log_error (hashcat_ctx, "cuMemcpyDtoHAsync(): %d", CU_err); - } - - return -1; - } - - return 0; -} - -int hc_cuMemcpyDtoDAsync (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream) -{ - backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - - CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - - const CUresult CU_err = cuda->cuMemcpyDtoDAsync (dstDevice, srcDevice, ByteCount, hStream); - - if (CU_err != CUDA_SUCCESS) - { - const char *pStr = NULL; - - if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) - { - event_log_error (hashcat_ctx, "cuMemcpyDtoDAsync(): %s", pStr); - } - else - { - event_log_error (hashcat_ctx, "cuMemcpyDtoDAsync(): %d", CU_err); - } - - return -1; - } - - return 0; -} - -int hc_cuMemcpyHtoDAsync (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream) -{ - backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - - CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - - const CUresult CU_err = cuda->cuMemcpyHtoDAsync (dstDevice, srcHost, ByteCount, hStream); - - if (CU_err != CUDA_SUCCESS) - { - const char *pStr = NULL; - - if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) - { - event_log_error (hashcat_ctx, "cuMemcpyHtoDAsync(): %s", pStr); - } - else - { - event_log_error (hashcat_ctx, "cuMemcpyHtoDAsync(): %d", CU_err); - } - - return -1; - } - - return 0; -} - -int hc_cuMemsetD32Async (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dstDevice, unsigned int ui, size_t N, CUstream hStream) -{ - backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - - CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - - const CUresult CU_err = cuda->cuMemsetD32Async (dstDevice, ui, N, hStream); - - if (CU_err != CUDA_SUCCESS) - { - const char *pStr = NULL; - - if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) - { - event_log_error (hashcat_ctx, "cuMemsetD32Async(): %s", pStr); - } - else - { - event_log_error (hashcat_ctx, "cuMemsetD32Async(): %d", CU_err); - } - - return -1; - } - - return 0; -} - -int hc_cuMemsetD8Async (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dstDevice, unsigned char 
uc, size_t N, CUstream hStream) -{ - backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - - CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - - const CUresult CU_err = cuda->cuMemsetD8Async (dstDevice, uc, N, hStream); - - if (CU_err != CUDA_SUCCESS) - { - const char *pStr = NULL; - - if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) - { - event_log_error (hashcat_ctx, "cuMemsetD8Async(): %s", pStr); - } - else - { - event_log_error (hashcat_ctx, "cuMemsetD8Async(): %d", CU_err); - } - - return -1; - } - - return 0; -} - -int hc_cuModuleGetFunction (hashcat_ctx_t *hashcat_ctx, CUfunction *hfunc, CUmodule hmod, const char *name) -{ - backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - - CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - - const CUresult CU_err = cuda->cuModuleGetFunction (hfunc, hmod, name); - - if (CU_err != CUDA_SUCCESS) - { - const char *pStr = NULL; - - if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) - { - event_log_error (hashcat_ctx, "cuModuleGetFunction(): %s", pStr); - } - else - { - event_log_error (hashcat_ctx, "cuModuleGetFunction(): %d", CU_err); - } - - return -1; - } - - return 0; -} - -int hc_cuModuleGetGlobal (hashcat_ctx_t *hashcat_ctx, CUdeviceptr *dptr, size_t *bytes, CUmodule hmod, const char *name) -{ - backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - - CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - - const CUresult CU_err = cuda->cuModuleGetGlobal (dptr, bytes, hmod, name); - - if (CU_err != CUDA_SUCCESS) - { - const char *pStr = NULL; - - if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) - { - event_log_error (hashcat_ctx, "cuModuleGetGlobal(): %s", pStr); - } - else - { - event_log_error (hashcat_ctx, "cuModuleGetGlobal(): %d", CU_err); - } - - return -1; - } - - return 0; -} - -int hc_cuMemGetInfo (hashcat_ctx_t *hashcat_ctx, size_t *free, size_t *total) -{ - backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - - CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - - const CUresult CU_err = cuda->cuMemGetInfo (free, total); - - if (CU_err != CUDA_SUCCESS) - { - const char *pStr = NULL; - - if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) - { - event_log_error (hashcat_ctx, "cuMemGetInfo(): %s", pStr); - } - else - { - event_log_error (hashcat_ctx, "cuMemGetInfo(): %d", CU_err); - } - - return -1; - } - - return 0; -} - -int hc_cuFuncGetAttribute (hashcat_ctx_t *hashcat_ctx, int *pi, CUfunction_attribute attrib, CUfunction hfunc) -{ - backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - - CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - - const CUresult CU_err = cuda->cuFuncGetAttribute (pi, attrib, hfunc); - - if (CU_err != CUDA_SUCCESS) - { - const char *pStr = NULL; - - if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) - { - event_log_error (hashcat_ctx, "cuFuncGetAttribute(): %s", pStr); - } - else - { - event_log_error (hashcat_ctx, "cuFuncGetAttribute(): %d", CU_err); - } - - return -1; - } - - return 0; -} - -int hc_cuFuncSetAttribute (hashcat_ctx_t *hashcat_ctx, CUfunction hfunc, CUfunction_attribute attrib, int value) -{ - backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - - CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - - const CUresult CU_err = cuda->cuFuncSetAttribute (hfunc, attrib, value); - - if (CU_err != CUDA_SUCCESS) - { - const char *pStr = NULL; - - if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) - { - event_log_error (hashcat_ctx, "cuFuncSetAttribute(): %s", pStr); - } - else - { - event_log_error (hashcat_ctx, 
"cuFuncSetAttribute(): %d", CU_err); - } - - return -1; - } - - return 0; -} - -int hc_cuStreamCreate (hashcat_ctx_t *hashcat_ctx, CUstream *phStream, unsigned int Flags) -{ - backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - - CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - - const CUresult CU_err = cuda->cuStreamCreate (phStream, Flags); - - if (CU_err != CUDA_SUCCESS) - { - const char *pStr = NULL; - - if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) - { - event_log_error (hashcat_ctx, "cuStreamCreate(): %s", pStr); - } - else - { - event_log_error (hashcat_ctx, "cuStreamCreate(): %d", CU_err); - } - - return -1; - } - - return 0; -} - -int hc_cuStreamDestroy (hashcat_ctx_t *hashcat_ctx, CUstream hStream) -{ - backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - - CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - - const CUresult CU_err = cuda->cuStreamDestroy (hStream); - - if (CU_err != CUDA_SUCCESS) - { - const char *pStr = NULL; - - if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) - { - event_log_error (hashcat_ctx, "cuStreamDestroy(): %s", pStr); - } - else - { - event_log_error (hashcat_ctx, "cuStreamDestroy(): %d", CU_err); - } - - return -1; - } - - return 0; -} - -int hc_cuStreamSynchronize (hashcat_ctx_t *hashcat_ctx, CUstream hStream) -{ - backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - - CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - - const CUresult CU_err = cuda->cuStreamSynchronize (hStream); - - if (CU_err != CUDA_SUCCESS) - { - const char *pStr = NULL; - - if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) - { - event_log_error (hashcat_ctx, "cuStreamSynchronize(): %s", pStr); - } - else - { - event_log_error (hashcat_ctx, "cuStreamSynchronize(): %d", CU_err); - } - - return -1; - } - - return 0; -} - -int hc_cuLaunchKernel (hashcat_ctx_t *hashcat_ctx, CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra) -{ - backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - - CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - - const CUresult CU_err = cuda->cuLaunchKernel (f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams, extra); - - if (CU_err != CUDA_SUCCESS) - { - const char *pStr = NULL; - - if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) - { - event_log_error (hashcat_ctx, "cuLaunchKernel(): %s", pStr); - } - else - { - event_log_error (hashcat_ctx, "cuLaunchKernel(): %d", CU_err); - } - - return -1; - } - - return 0; -} - -int hc_cuCtxSynchronize (hashcat_ctx_t *hashcat_ctx) -{ - backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - - CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - - const CUresult CU_err = cuda->cuCtxSynchronize (); - - if (CU_err != CUDA_SUCCESS) - { - const char *pStr = NULL; - - if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) - { - event_log_error (hashcat_ctx, "cuCtxSynchronize(): %s", pStr); - } - else - { - event_log_error (hashcat_ctx, "cuCtxSynchronize(): %d", CU_err); - } - - return -1; - } - - return 0; -} - -int hc_cuEventCreate (hashcat_ctx_t *hashcat_ctx, CUevent *phEvent, unsigned int Flags) -{ - backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - - CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - - const CUresult CU_err = cuda->cuEventCreate (phEvent, Flags); - - if (CU_err != CUDA_SUCCESS) - { - const char 
*pStr = NULL; - - if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) - { - event_log_error (hashcat_ctx, "cuEventCreate(): %s", pStr); - } - else - { - event_log_error (hashcat_ctx, "cuEventCreate(): %d", CU_err); - } - - return -1; - } - - return 0; -} - -int hc_cuEventDestroy (hashcat_ctx_t *hashcat_ctx, CUevent hEvent) -{ - backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - - CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - - const CUresult CU_err = cuda->cuEventDestroy (hEvent); - - if (CU_err != CUDA_SUCCESS) - { - const char *pStr = NULL; - - if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) - { - event_log_error (hashcat_ctx, "cuEventDestroy(): %s", pStr); - } - else - { - event_log_error (hashcat_ctx, "cuEventDestroy(): %d", CU_err); - } - - return -1; - } - - return 0; -} - -int hc_cuEventElapsedTime (hashcat_ctx_t *hashcat_ctx, float *pMilliseconds, CUevent hStart, CUevent hEnd) -{ - backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - - CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - - const CUresult CU_err = cuda->cuEventElapsedTime (pMilliseconds, hStart, hEnd); - - if (CU_err != CUDA_SUCCESS) - { - const char *pStr = NULL; - - if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) - { - event_log_error (hashcat_ctx, "cuEventElapsedTime(): %s", pStr); - } - else - { - event_log_error (hashcat_ctx, "cuEventElapsedTime(): %d", CU_err); - } - - return -1; - } - - return 0; -} - -int hc_cuEventQuery (hashcat_ctx_t *hashcat_ctx, CUevent hEvent) -{ - backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - - CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - - const CUresult CU_err = cuda->cuEventQuery (hEvent); - - if (CU_err != CUDA_SUCCESS) - { - const char *pStr = NULL; - - if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) - { - event_log_error (hashcat_ctx, "cuEventQuery(): %s", pStr); - } - else - { - event_log_error (hashcat_ctx, "cuEventQuery(): %d", CU_err); - } - - return -1; - } - - return 0; -} - -int hc_cuEventRecord (hashcat_ctx_t *hashcat_ctx, CUevent hEvent, CUstream hStream) -{ - backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - - CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - - const CUresult CU_err = cuda->cuEventRecord (hEvent, hStream); - - if (CU_err != CUDA_SUCCESS) - { - const char *pStr = NULL; - - if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) - { - event_log_error (hashcat_ctx, "cuEventRecord(): %s", pStr); - } - else - { - event_log_error (hashcat_ctx, "cuEventRecord(): %d", CU_err); - } - - return -1; - } - - return 0; -} - -int hc_cuEventSynchronize (hashcat_ctx_t *hashcat_ctx, CUevent hEvent) -{ - backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - - CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - - const CUresult CU_err = cuda->cuEventSynchronize (hEvent); - - if (CU_err != CUDA_SUCCESS) - { - const char *pStr = NULL; - - if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) - { - event_log_error (hashcat_ctx, "cuEventSynchronize(): %s", pStr); - } - else - { - event_log_error (hashcat_ctx, "cuEventSynchronize(): %d", CU_err); - } - - return -1; - } - - return 0; -} - -int hc_cuCtxSetCacheConfig (hashcat_ctx_t *hashcat_ctx, CUfunc_cache config) -{ - backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - - CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - - const CUresult CU_err = cuda->cuCtxSetCacheConfig (config); - - if (CU_err != CUDA_SUCCESS) - { - const char *pStr = NULL; - - if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) - { - event_log_error 
(hashcat_ctx, "cuCtxSetCacheConfig(): %s", pStr); - } - else - { - event_log_error (hashcat_ctx, "cuCtxSetCacheConfig(): %d", CU_err); - } - - return -1; - } - - return 0; -} - -int hc_cuCtxPushCurrent (hashcat_ctx_t *hashcat_ctx, CUcontext ctx) -{ - backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - - CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - - const CUresult CU_err = cuda->cuCtxPushCurrent (ctx); - - if (CU_err != CUDA_SUCCESS) - { - const char *pStr = NULL; - - if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) - { - event_log_error (hashcat_ctx, "cuCtxPushCurrent(): %s", pStr); - } - else - { - event_log_error (hashcat_ctx, "cuCtxPushCurrent(): %d", CU_err); - } - - return -1; - } - - return 0; -} - -int hc_cuCtxPopCurrent (hashcat_ctx_t *hashcat_ctx, CUcontext *pctx) -{ - backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - - CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - - const CUresult CU_err = cuda->cuCtxPopCurrent (pctx); - - if (CU_err != CUDA_SUCCESS) - { - const char *pStr = NULL; - - if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) - { - event_log_error (hashcat_ctx, "cuCtxPopCurrent(): %s", pStr); - } - else - { - event_log_error (hashcat_ctx, "cuCtxPopCurrent(): %d", CU_err); - } - - return -1; - } - - return 0; -} - -int hc_cuLinkCreate (hashcat_ctx_t *hashcat_ctx, unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut) -{ - backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - - CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - - const CUresult CU_err = cuda->cuLinkCreate (numOptions, options, optionValues, stateOut); - - if (CU_err != CUDA_SUCCESS) - { - const char *pStr = NULL; - - if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) - { - event_log_error (hashcat_ctx, "cuLinkCreate(): %s", pStr); - } - else - { - event_log_error (hashcat_ctx, "cuLinkCreate(): %d", CU_err); - } - - return -1; - } - - return 0; -} - -int hc_cuLinkAddData (hashcat_ctx_t *hashcat_ctx, CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name, unsigned int numOptions, CUjit_option *options, void **optionValues) -{ - backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - - CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - - const CUresult CU_err = cuda->cuLinkAddData (state, type, data, size, name, numOptions, options, optionValues); - - if (CU_err != CUDA_SUCCESS) - { - const char *pStr = NULL; - - if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) - { - event_log_error (hashcat_ctx, "cuLinkAddData(): %s", pStr); - } - else - { - event_log_error (hashcat_ctx, "cuLinkAddData(): %d", CU_err); - } - - return -1; - } - - return 0; -} - -int hc_cuLinkDestroy (hashcat_ctx_t *hashcat_ctx, CUlinkState state) -{ - backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - - CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - - const CUresult CU_err = cuda->cuLinkDestroy (state); - - if (CU_err != CUDA_SUCCESS) - { - const char *pStr = NULL; - - if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) - { - event_log_error (hashcat_ctx, "cuLinkDestroy(): %s", pStr); - } - else - { - event_log_error (hashcat_ctx, "cuLinkDestroy(): %d", CU_err); - } - - return -1; - } - - return 0; -} - -int hc_cuLinkComplete (hashcat_ctx_t *hashcat_ctx, CUlinkState state, void **cubinOut, size_t *sizeOut) -{ - backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - - CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; - - const CUresult CU_err = cuda->cuLinkComplete (state, cubinOut, sizeOut); - 
-  if (CU_err != CUDA_SUCCESS)
-  {
-    const char *pStr = NULL;
-
-    if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS)
-    {
-      event_log_error (hashcat_ctx, "cuLinkComplete(): %s", pStr);
-    }
-    else
-    {
-      event_log_error (hashcat_ctx, "cuLinkComplete(): %d", CU_err);
-    }
-
-    return -1;
-  }
-
-  return 0;
-}
-
-// Backend
-
 int gidd_to_pw_t (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u64 gidd, pw_t *pw)
 {
   pw_idx_t pw_idx;
diff --git a/src/ext_cuda.c b/src/ext_cuda.c
index dc43e1b61..dbafa065e 100644
--- a/src/ext_cuda.c
+++ b/src/ext_cuda.c
@@ -5,4 +5,1267 @@
 #include "common.h"
 #include "types.h"
+#include "memory.h"
+#include "event.h"
 #include "ext_cuda.h"
+
+#include "dynloader.h"
+
+int cuda_init (void *hashcat_ctx)
+{
+  backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx;
+
+  CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda;
+
+  memset (cuda, 0, sizeof (CUDA_PTR));
+
+  #if defined (_WIN)
+  cuda->lib = hc_dlopen ("nvcuda.dll");
+  #elif defined (__APPLE__)
+  cuda->lib = hc_dlopen ("nvcuda.dylib");
+  #elif defined (__CYGWIN__)
+  cuda->lib = hc_dlopen ("nvcuda.dll");
+  #else
+  cuda->lib = hc_dlopen ("libcuda.so");
+
+  if (cuda->lib == NULL) cuda->lib = hc_dlopen ("libcuda.so.1");
+  #endif
+
+  if (cuda->lib == NULL) return -1;
+
+  #define HC_LOAD_FUNC_CUDA(ptr,name,cudaname,type,libname,noerr) \
+    do { \
+      ptr->name = (type) hc_dlsym ((ptr)->lib, #cudaname); \
+      if ((noerr) != -1) { \
+        if (!(ptr)->name) { \
+          if ((noerr) == 1) { \
+            event_log_error (hashcat_ctx, "%s is missing from %s shared library.", #name, #libname); \
+            return -1; \
+          } \
+          if ((noerr) != 1) { \
+            event_log_warning (hashcat_ctx, "%s is missing from %s shared library.", #name, #libname); \
+            return 0; \
+          } \
+        } \
+      } \
+    } while (0)
+
+  // finding the right symbol is a PITA, because of the _v2 suffix
+  // a good reference is cuda.h itself
+  // this needs to be verified for each new cuda release
+
+  HC_LOAD_FUNC_CUDA (cuda, cuCtxCreate, cuCtxCreate_v2, CUDA_CUCTXCREATE, CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuCtxDestroy, cuCtxDestroy_v2, CUDA_CUCTXDESTROY, CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuCtxGetCacheConfig, cuCtxGetCacheConfig, CUDA_CUCTXGETCACHECONFIG, CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuCtxGetCurrent, cuCtxGetCurrent, CUDA_CUCTXGETCURRENT, CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuCtxGetSharedMemConfig, cuCtxGetSharedMemConfig, CUDA_CUCTXGETSHAREDMEMCONFIG, CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuCtxPopCurrent, cuCtxPopCurrent_v2, CUDA_CUCTXPOPCURRENT, CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuCtxPushCurrent, cuCtxPushCurrent_v2, CUDA_CUCTXPUSHCURRENT, CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuCtxSetCacheConfig, cuCtxSetCacheConfig, CUDA_CUCTXSETCACHECONFIG, CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuCtxSetCurrent, cuCtxSetCurrent, CUDA_CUCTXSETCURRENT, CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuCtxSetSharedMemConfig, cuCtxSetSharedMemConfig, CUDA_CUCTXSETSHAREDMEMCONFIG, CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuCtxSynchronize, cuCtxSynchronize, CUDA_CUCTXSYNCHRONIZE, CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuDeviceGetAttribute, cuDeviceGetAttribute, CUDA_CUDEVICEGETATTRIBUTE, CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuDeviceGetCount, cuDeviceGetCount, CUDA_CUDEVICEGETCOUNT, CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuDeviceGet, cuDeviceGet, CUDA_CUDEVICEGET, CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuDeviceGetName, cuDeviceGetName, CUDA_CUDEVICEGETNAME, CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuDeviceTotalMem, cuDeviceTotalMem_v2, CUDA_CUDEVICETOTALMEM, CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuDriverGetVersion, cuDriverGetVersion, CUDA_CUDRIVERGETVERSION, CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuEventCreate, cuEventCreate, CUDA_CUEVENTCREATE, CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuEventDestroy, cuEventDestroy_v2, CUDA_CUEVENTDESTROY, CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuEventElapsedTime, cuEventElapsedTime, CUDA_CUEVENTELAPSEDTIME, CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuEventQuery, cuEventQuery, CUDA_CUEVENTQUERY, CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuEventRecord, cuEventRecord, CUDA_CUEVENTRECORD, CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuEventSynchronize, cuEventSynchronize, CUDA_CUEVENTSYNCHRONIZE, CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuFuncGetAttribute, cuFuncGetAttribute, CUDA_CUFUNCGETATTRIBUTE, CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuFuncSetAttribute, cuFuncSetAttribute, CUDA_CUFUNCSETATTRIBUTE, CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuFuncSetCacheConfig, cuFuncSetCacheConfig, CUDA_CUFUNCSETCACHECONFIG, CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuFuncSetSharedMemConfig, cuFuncSetSharedMemConfig, CUDA_CUFUNCSETSHAREDMEMCONFIG, CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuGetErrorName, cuGetErrorName, CUDA_CUGETERRORNAME, CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuGetErrorString, cuGetErrorString, CUDA_CUGETERRORSTRING, CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuInit, cuInit, CUDA_CUINIT, CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuLaunchKernel, cuLaunchKernel, CUDA_CULAUNCHKERNEL, CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuMemAlloc, cuMemAlloc_v2, CUDA_CUMEMALLOC, CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuMemAllocHost, cuMemAllocHost_v2, CUDA_CUMEMALLOCHOST, CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuMemcpyDtoDAsync, cuMemcpyDtoDAsync_v2, CUDA_CUMEMCPYDTODASYNC, CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuMemcpyDtoHAsync, cuMemcpyDtoHAsync_v2, CUDA_CUMEMCPYDTOHASYNC, CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuMemcpyHtoDAsync, cuMemcpyHtoDAsync_v2, CUDA_CUMEMCPYHTODASYNC, CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuMemFree, cuMemFree_v2, CUDA_CUMEMFREE, CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuMemFreeHost, cuMemFreeHost, CUDA_CUMEMFREEHOST, CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuMemGetInfo, cuMemGetInfo_v2, CUDA_CUMEMGETINFO, CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuMemsetD32Async, cuMemsetD32Async, CUDA_CUMEMSETD32ASYNC, CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuMemsetD8Async, cuMemsetD8Async, CUDA_CUMEMSETD8ASYNC, CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuModuleGetFunction, cuModuleGetFunction, CUDA_CUMODULEGETFUNCTION, CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuModuleGetGlobal, cuModuleGetGlobal_v2, CUDA_CUMODULEGETGLOBAL, CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuModuleLoad, cuModuleLoad, CUDA_CUMODULELOAD, CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuModuleLoadData, cuModuleLoadData, CUDA_CUMODULELOADDATA, CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuModuleLoadDataEx, cuModuleLoadDataEx, CUDA_CUMODULELOADDATAEX, CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuModuleUnload, cuModuleUnload, CUDA_CUMODULEUNLOAD, CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuProfilerStart, cuProfilerStart, CUDA_CUPROFILERSTART, CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuProfilerStop, cuProfilerStop, CUDA_CUPROFILERSTOP, CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuStreamCreate, cuStreamCreate, CUDA_CUSTREAMCREATE, CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuStreamDestroy, cuStreamDestroy_v2, CUDA_CUSTREAMDESTROY, CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuStreamSynchronize, cuStreamSynchronize, CUDA_CUSTREAMSYNCHRONIZE, CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuStreamWaitEvent, cuStreamWaitEvent, CUDA_CUSTREAMWAITEVENT, CUDA, 1);
+  #if defined (WITH_CUBIN)
+  HC_LOAD_FUNC_CUDA (cuda, cuLinkCreate, cuLinkCreate_v2, CUDA_CULINKCREATE, CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuLinkAddData, cuLinkAddData_v2, CUDA_CULINKADDDATA, CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuLinkDestroy, cuLinkDestroy, CUDA_CULINKDESTROY, CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuLinkComplete, cuLinkComplete, CUDA_CULINKCOMPLETE, CUDA, 1);
+  #endif
+
+  return 0;
+}
+
+void cuda_close (void *hashcat_ctx)
+{
+  backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx;
+
+  CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda;
+
+  if (cuda)
+  {
+    if (cuda->lib)
+    {
+      hc_dlclose (cuda->lib);
+    }
+
+    hcfree (backend_ctx->cuda);
+
+    backend_ctx->cuda = NULL;
+  }
+}
+
+int hc_cuInit (void *hashcat_ctx, unsigned int Flags)
+{
+  backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx;
+
+  CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda;
+
+  const CUresult CU_err = cuda->cuInit (Flags);
+
+  if (CU_err != CUDA_SUCCESS)
+  {
+    const char *pStr = NULL;
+
+    if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS)
+    {
+      event_log_error (hashcat_ctx, "cuInit(): %s", pStr);
+    }
+    else
+    {
+      event_log_error (hashcat_ctx, "cuInit(): %d", CU_err);
+    }
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_cuDeviceGetAttribute (void *hashcat_ctx, int *pi, CUdevice_attribute attrib, CUdevice dev)
+{
+  backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx;
+
+  CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda;
+
+  const CUresult CU_err = cuda->cuDeviceGetAttribute (pi, attrib, dev);
+
+  if (CU_err != CUDA_SUCCESS)
+  {
+    const char *pStr = NULL;
+
+    if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS)
+    {
+      event_log_error (hashcat_ctx, "cuDeviceGetAttribute(): %s", pStr);
+    }
+    else
+    {
+      event_log_error (hashcat_ctx, "cuDeviceGetAttribute(): %d", CU_err);
+    }
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_cuDeviceGetCount (void *hashcat_ctx, int *count)
+{
+  backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx;
+
+  CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda;
+
+  const CUresult CU_err = cuda->cuDeviceGetCount (count);
+
+  if (CU_err != CUDA_SUCCESS)
+  {
+    const char *pStr = NULL;
+
+    if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS)
+    {
+      event_log_error (hashcat_ctx, "cuDeviceGetCount(): %s", pStr);
+    }
+    else
+    {
+      event_log_error (hashcat_ctx, "cuDeviceGetCount(): %d", CU_err);
+    }
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_cuDeviceGet (void *hashcat_ctx, CUdevice *device, int ordinal)
+{
+  backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx;
+
+  CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda;
+
+  const CUresult CU_err = cuda->cuDeviceGet (device, ordinal);
+
+  if (CU_err != CUDA_SUCCESS)
+  {
+    const char *pStr = NULL;
+
+    if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS)
+    {
+      event_log_error (hashcat_ctx, "cuDeviceGet(): %s", pStr);
+    }
+    else
+    {
+      event_log_error (hashcat_ctx, "cuDeviceGet(): %d", CU_err);
+    }
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_cuDeviceGetName (void *hashcat_ctx, char *name, int len, CUdevice dev)
+{
+  backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx;
+
+  CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda;
+
+  const CUresult CU_err = cuda->cuDeviceGetName (name, len, dev);
+
+  if (CU_err != CUDA_SUCCESS)
+  {
+    const char *pStr = NULL;
+
+    if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS)
+    {
+      event_log_error (hashcat_ctx, "cuDeviceGetName(): %s", pStr);
+    }
+    else
+    {
+      event_log_error (hashcat_ctx, "cuDeviceGetName(): %d", CU_err);
+    }
+
+    return -1;
+  }
+
+ 
return 0; +} + +int hc_cuDeviceTotalMem (void *hashcat_ctx, size_t *bytes, CUdevice dev) +{ + backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx; + + CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; + + const CUresult CU_err = cuda->cuDeviceTotalMem (bytes, dev); + + if (CU_err != CUDA_SUCCESS) + { + const char *pStr = NULL; + + if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + { + event_log_error (hashcat_ctx, "cuDeviceTotalMem(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "cuDeviceTotalMem(): %d", CU_err); + } + + return -1; + } + + return 0; +} + +int hc_cuDriverGetVersion (void *hashcat_ctx, int *driverVersion) +{ + backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx; + + CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; + + const CUresult CU_err = cuda->cuDriverGetVersion (driverVersion); + + if (CU_err != CUDA_SUCCESS) + { + const char *pStr = NULL; + + if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + { + event_log_error (hashcat_ctx, "cuDriverGetVersion(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "cuDriverGetVersion(): %d", CU_err); + } + + return -1; + } + + return 0; +} + +int hc_cuCtxCreate (void *hashcat_ctx, CUcontext *pctx, unsigned int flags, CUdevice dev) +{ + backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx; + + CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; + + const CUresult CU_err = cuda->cuCtxCreate (pctx, flags, dev); + + if (CU_err != CUDA_SUCCESS) + { + const char *pStr = NULL; + + if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + { + event_log_error (hashcat_ctx, "cuCtxCreate(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "cuCtxCreate(): %d", CU_err); + } + + return -1; + } + + return 0; +} + +int hc_cuCtxDestroy (void *hashcat_ctx, CUcontext ctx) +{ + backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx; + + CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; + + const CUresult CU_err = cuda->cuCtxDestroy (ctx); + + if (CU_err != CUDA_SUCCESS) + { + const char *pStr = NULL; + + if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + { + event_log_error (hashcat_ctx, "cuCtxDestroy(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "cuCtxDestroy(): %d", CU_err); + } + + return -1; + } + + return 0; +} + +int hc_cuModuleLoadDataEx (void *hashcat_ctx, CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues) +{ + backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx; + + CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; + + const CUresult CU_err = cuda->cuModuleLoadDataEx (module, image, numOptions, options, optionValues); + + if (CU_err != CUDA_SUCCESS) + { + const char *pStr = NULL; + + if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + { + event_log_error (hashcat_ctx, "cuModuleLoadDataEx(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "cuModuleLoadDataEx(): %d", CU_err); + } + + return -1; + } + + return 0; +} + +int hc_cuModuleUnload (void *hashcat_ctx, CUmodule hmod) +{ + backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx; + + CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; + + const CUresult CU_err = cuda->cuModuleUnload (hmod); + + if (CU_err != CUDA_SUCCESS) + { + const char *pStr = NULL; + + if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + { + event_log_error (hashcat_ctx, "cuModuleUnload(): %s", pStr); + } + else + { + event_log_error 
(hashcat_ctx, "cuModuleUnload(): %d", CU_err); + } + + return -1; + } + + return 0; +} + +int hc_cuCtxSetCurrent (void *hashcat_ctx, CUcontext ctx) +{ + backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx; + + CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; + + const CUresult CU_err = cuda->cuCtxSetCurrent (ctx); + + if (CU_err != CUDA_SUCCESS) + { + const char *pStr = NULL; + + if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + { + event_log_error (hashcat_ctx, "cuCtxSetCurrent(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "cuCtxSetCurrent(): %d", CU_err); + } + + return -1; + } + + return 0; +} + +int hc_cuMemAlloc (void *hashcat_ctx, CUdeviceptr *dptr, size_t bytesize) +{ + backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx; + + CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; + + const CUresult CU_err = cuda->cuMemAlloc (dptr, bytesize); + + if (CU_err != CUDA_SUCCESS) + { + const char *pStr = NULL; + + if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + { + event_log_error (hashcat_ctx, "cuMemAlloc(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "cuMemAlloc(): %d", CU_err); + } + + return -1; + } + + return 0; +} + +int hc_cuMemFree (void *hashcat_ctx, CUdeviceptr dptr) +{ + backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx; + + CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; + + const CUresult CU_err = cuda->cuMemFree (dptr); + + if (CU_err != CUDA_SUCCESS) + { + const char *pStr = NULL; + + if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + { + event_log_error (hashcat_ctx, "cuMemFree(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "cuMemFree(): %d", CU_err); + } + + return -1; + } + + return 0; +} + +int hc_cuMemcpyDtoHAsync (void *hashcat_ctx, void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream) +{ + backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx; + + CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; + + const CUresult CU_err = cuda->cuMemcpyDtoHAsync (dstHost, srcDevice, ByteCount, hStream); + + if (CU_err != CUDA_SUCCESS) + { + const char *pStr = NULL; + + if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + { + event_log_error (hashcat_ctx, "cuMemcpyDtoHAsync(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "cuMemcpyDtoHAsync(): %d", CU_err); + } + + return -1; + } + + return 0; +} + +int hc_cuMemcpyDtoDAsync (void *hashcat_ctx, CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream) +{ + backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx; + + CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; + + const CUresult CU_err = cuda->cuMemcpyDtoDAsync (dstDevice, srcDevice, ByteCount, hStream); + + if (CU_err != CUDA_SUCCESS) + { + const char *pStr = NULL; + + if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + { + event_log_error (hashcat_ctx, "cuMemcpyDtoDAsync(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "cuMemcpyDtoDAsync(): %d", CU_err); + } + + return -1; + } + + return 0; +} + +int hc_cuMemcpyHtoDAsync (void *hashcat_ctx, CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream) +{ + backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx; + + CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; + + const CUresult CU_err = cuda->cuMemcpyHtoDAsync (dstDevice, srcHost, ByteCount, hStream); + + if (CU_err != CUDA_SUCCESS) + { + const char *pStr = NULL; + + if 
(cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + { + event_log_error (hashcat_ctx, "cuMemcpyHtoDAsync(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "cuMemcpyHtoDAsync(): %d", CU_err); + } + + return -1; + } + + return 0; +} + +int hc_cuMemsetD32Async (void *hashcat_ctx, CUdeviceptr dstDevice, unsigned int ui, size_t N, CUstream hStream) +{ + backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx; + + CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; + + const CUresult CU_err = cuda->cuMemsetD32Async (dstDevice, ui, N, hStream); + + if (CU_err != CUDA_SUCCESS) + { + const char *pStr = NULL; + + if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + { + event_log_error (hashcat_ctx, "cuMemsetD32Async(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "cuMemsetD32Async(): %d", CU_err); + } + + return -1; + } + + return 0; +} + +int hc_cuMemsetD8Async (void *hashcat_ctx, CUdeviceptr dstDevice, unsigned char uc, size_t N, CUstream hStream) +{ + backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx; + + CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; + + const CUresult CU_err = cuda->cuMemsetD8Async (dstDevice, uc, N, hStream); + + if (CU_err != CUDA_SUCCESS) + { + const char *pStr = NULL; + + if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + { + event_log_error (hashcat_ctx, "cuMemsetD8Async(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "cuMemsetD8Async(): %d", CU_err); + } + + return -1; + } + + return 0; +} + +int hc_cuModuleGetFunction (void *hashcat_ctx, CUfunction *hfunc, CUmodule hmod, const char *name) +{ + backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx; + + CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; + + const CUresult CU_err = cuda->cuModuleGetFunction (hfunc, hmod, name); + + if (CU_err != CUDA_SUCCESS) + { + const char *pStr = NULL; + + if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + { + event_log_error (hashcat_ctx, "cuModuleGetFunction(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "cuModuleGetFunction(): %d", CU_err); + } + + return -1; + } + + return 0; +} + +int hc_cuModuleGetGlobal (void *hashcat_ctx, CUdeviceptr *dptr, size_t *bytes, CUmodule hmod, const char *name) +{ + backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx; + + CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; + + const CUresult CU_err = cuda->cuModuleGetGlobal (dptr, bytes, hmod, name); + + if (CU_err != CUDA_SUCCESS) + { + const char *pStr = NULL; + + if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + { + event_log_error (hashcat_ctx, "cuModuleGetGlobal(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "cuModuleGetGlobal(): %d", CU_err); + } + + return -1; + } + + return 0; +} + +int hc_cuMemGetInfo (void *hashcat_ctx, size_t *free, size_t *total) +{ + backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx; + + CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; + + const CUresult CU_err = cuda->cuMemGetInfo (free, total); + + if (CU_err != CUDA_SUCCESS) + { + const char *pStr = NULL; + + if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + { + event_log_error (hashcat_ctx, "cuMemGetInfo(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "cuMemGetInfo(): %d", CU_err); + } + + return -1; + } + + return 0; +} + +int hc_cuFuncGetAttribute (void *hashcat_ctx, int *pi, CUfunction_attribute attrib, CUfunction hfunc) +{ + backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) 
hashcat_ctx)->backend_ctx; + + CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; + + const CUresult CU_err = cuda->cuFuncGetAttribute (pi, attrib, hfunc); + + if (CU_err != CUDA_SUCCESS) + { + const char *pStr = NULL; + + if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + { + event_log_error (hashcat_ctx, "cuFuncGetAttribute(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "cuFuncGetAttribute(): %d", CU_err); + } + + return -1; + } + + return 0; +} + +int hc_cuFuncSetAttribute (void *hashcat_ctx, CUfunction hfunc, CUfunction_attribute attrib, int value) +{ + backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx; + + CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; + + const CUresult CU_err = cuda->cuFuncSetAttribute (hfunc, attrib, value); + + if (CU_err != CUDA_SUCCESS) + { + const char *pStr = NULL; + + if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + { + event_log_error (hashcat_ctx, "cuFuncSetAttribute(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "cuFuncSetAttribute(): %d", CU_err); + } + + return -1; + } + + return 0; +} + +int hc_cuStreamCreate (void *hashcat_ctx, CUstream *phStream, unsigned int Flags) +{ + backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx; + + CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; + + const CUresult CU_err = cuda->cuStreamCreate (phStream, Flags); + + if (CU_err != CUDA_SUCCESS) + { + const char *pStr = NULL; + + if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + { + event_log_error (hashcat_ctx, "cuStreamCreate(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "cuStreamCreate(): %d", CU_err); + } + + return -1; + } + + return 0; +} + +int hc_cuStreamDestroy (void *hashcat_ctx, CUstream hStream) +{ + backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx; + + CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; + + const CUresult CU_err = cuda->cuStreamDestroy (hStream); + + if (CU_err != CUDA_SUCCESS) + { + const char *pStr = NULL; + + if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + { + event_log_error (hashcat_ctx, "cuStreamDestroy(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "cuStreamDestroy(): %d", CU_err); + } + + return -1; + } + + return 0; +} + +int hc_cuStreamSynchronize (void *hashcat_ctx, CUstream hStream) +{ + backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx; + + CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; + + const CUresult CU_err = cuda->cuStreamSynchronize (hStream); + + if (CU_err != CUDA_SUCCESS) + { + const char *pStr = NULL; + + if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + { + event_log_error (hashcat_ctx, "cuStreamSynchronize(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "cuStreamSynchronize(): %d", CU_err); + } + + return -1; + } + + return 0; +} + +int hc_cuLaunchKernel (void *hashcat_ctx, CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra) +{ + backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx; + + CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; + + const CUresult CU_err = cuda->cuLaunchKernel (f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams, extra); + + if (CU_err != CUDA_SUCCESS) + { + const char *pStr = NULL; + + if (cuda->cuGetErrorString 
(CU_err, &pStr) == CUDA_SUCCESS) + { + event_log_error (hashcat_ctx, "cuLaunchKernel(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "cuLaunchKernel(): %d", CU_err); + } + + return -1; + } + + return 0; +} + +int hc_cuCtxSynchronize (void *hashcat_ctx) +{ + backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx; + + CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; + + const CUresult CU_err = cuda->cuCtxSynchronize (); + + if (CU_err != CUDA_SUCCESS) + { + const char *pStr = NULL; + + if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + { + event_log_error (hashcat_ctx, "cuCtxSynchronize(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "cuCtxSynchronize(): %d", CU_err); + } + + return -1; + } + + return 0; +} + +int hc_cuEventCreate (void *hashcat_ctx, CUevent *phEvent, unsigned int Flags) +{ + backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx; + + CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; + + const CUresult CU_err = cuda->cuEventCreate (phEvent, Flags); + + if (CU_err != CUDA_SUCCESS) + { + const char *pStr = NULL; + + if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + { + event_log_error (hashcat_ctx, "cuEventCreate(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "cuEventCreate(): %d", CU_err); + } + + return -1; + } + + return 0; +} + +int hc_cuEventDestroy (void *hashcat_ctx, CUevent hEvent) +{ + backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx; + + CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; + + const CUresult CU_err = cuda->cuEventDestroy (hEvent); + + if (CU_err != CUDA_SUCCESS) + { + const char *pStr = NULL; + + if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + { + event_log_error (hashcat_ctx, "cuEventDestroy(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "cuEventDestroy(): %d", CU_err); + } + + return -1; + } + + return 0; +} + +int hc_cuEventElapsedTime (void *hashcat_ctx, float *pMilliseconds, CUevent hStart, CUevent hEnd) +{ + backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx; + + CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; + + const CUresult CU_err = cuda->cuEventElapsedTime (pMilliseconds, hStart, hEnd); + + if (CU_err != CUDA_SUCCESS) + { + const char *pStr = NULL; + + if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + { + event_log_error (hashcat_ctx, "cuEventElapsedTime(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "cuEventElapsedTime(): %d", CU_err); + } + + return -1; + } + + return 0; +} + +int hc_cuEventQuery (void *hashcat_ctx, CUevent hEvent) +{ + backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx; + + CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; + + const CUresult CU_err = cuda->cuEventQuery (hEvent); + + if (CU_err != CUDA_SUCCESS) + { + const char *pStr = NULL; + + if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + { + event_log_error (hashcat_ctx, "cuEventQuery(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "cuEventQuery(): %d", CU_err); + } + + return -1; + } + + return 0; +} + +int hc_cuEventRecord (void *hashcat_ctx, CUevent hEvent, CUstream hStream) +{ + backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx; + + CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; + + const CUresult CU_err = cuda->cuEventRecord (hEvent, hStream); + + if (CU_err != CUDA_SUCCESS) + { + const char *pStr = NULL; + + if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + { + 
event_log_error (hashcat_ctx, "cuEventRecord(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "cuEventRecord(): %d", CU_err); + } + + return -1; + } + + return 0; +} + +int hc_cuEventSynchronize (void *hashcat_ctx, CUevent hEvent) +{ + backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx; + + CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; + + const CUresult CU_err = cuda->cuEventSynchronize (hEvent); + + if (CU_err != CUDA_SUCCESS) + { + const char *pStr = NULL; + + if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + { + event_log_error (hashcat_ctx, "cuEventSynchronize(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "cuEventSynchronize(): %d", CU_err); + } + + return -1; + } + + return 0; +} + +int hc_cuCtxSetCacheConfig (void *hashcat_ctx, CUfunc_cache config) +{ + backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx; + + CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; + + const CUresult CU_err = cuda->cuCtxSetCacheConfig (config); + + if (CU_err != CUDA_SUCCESS) + { + const char *pStr = NULL; + + if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + { + event_log_error (hashcat_ctx, "cuCtxSetCacheConfig(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "cuCtxSetCacheConfig(): %d", CU_err); + } + + return -1; + } + + return 0; +} + +int hc_cuCtxPushCurrent (void *hashcat_ctx, CUcontext ctx) +{ + backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx; + + CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; + + const CUresult CU_err = cuda->cuCtxPushCurrent (ctx); + + if (CU_err != CUDA_SUCCESS) + { + const char *pStr = NULL; + + if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + { + event_log_error (hashcat_ctx, "cuCtxPushCurrent(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "cuCtxPushCurrent(): %d", CU_err); + } + + return -1; + } + + return 0; +} + +int hc_cuCtxPopCurrent (void *hashcat_ctx, CUcontext *pctx) +{ + backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx; + + CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; + + const CUresult CU_err = cuda->cuCtxPopCurrent (pctx); + + if (CU_err != CUDA_SUCCESS) + { + const char *pStr = NULL; + + if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + { + event_log_error (hashcat_ctx, "cuCtxPopCurrent(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "cuCtxPopCurrent(): %d", CU_err); + } + + return -1; + } + + return 0; +} + +int hc_cuLinkCreate (void *hashcat_ctx, unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut) +{ + backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx; + + CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; + + const CUresult CU_err = cuda->cuLinkCreate (numOptions, options, optionValues, stateOut); + + if (CU_err != CUDA_SUCCESS) + { + const char *pStr = NULL; + + if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + { + event_log_error (hashcat_ctx, "cuLinkCreate(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "cuLinkCreate(): %d", CU_err); + } + + return -1; + } + + return 0; +} + +int hc_cuLinkAddData (void *hashcat_ctx, CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name, unsigned int numOptions, CUjit_option *options, void **optionValues) +{ + backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx; + + CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; + + const CUresult CU_err = cuda->cuLinkAddData (state, 
type, data, size, name, numOptions, options, optionValues);
+
+  if (CU_err != CUDA_SUCCESS)
+  {
+    const char *pStr = NULL;
+
+    if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS)
+    {
+      event_log_error (hashcat_ctx, "cuLinkAddData(): %s", pStr);
+    }
+    else
+    {
+      event_log_error (hashcat_ctx, "cuLinkAddData(): %d", CU_err);
+    }
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_cuLinkDestroy (void *hashcat_ctx, CUlinkState state)
+{
+  backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx;
+
+  CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda;
+
+  const CUresult CU_err = cuda->cuLinkDestroy (state);
+
+  if (CU_err != CUDA_SUCCESS)
+  {
+    const char *pStr = NULL;
+
+    if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS)
+    {
+      event_log_error (hashcat_ctx, "cuLinkDestroy(): %s", pStr);
+    }
+    else
+    {
+      event_log_error (hashcat_ctx, "cuLinkDestroy(): %d", CU_err);
+    }
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_cuLinkComplete (void *hashcat_ctx, CUlinkState state, void **cubinOut, size_t *sizeOut)
+{
+  backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx;
+
+  CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda;
+
+  const CUresult CU_err = cuda->cuLinkComplete (state, cubinOut, sizeOut);
+
+  if (CU_err != CUDA_SUCCESS)
+  {
+    const char *pStr = NULL;
+
+    if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS)
+    {
+      event_log_error (hashcat_ctx, "cuLinkComplete(): %s", pStr);
+    }
+    else
+    {
+      event_log_error (hashcat_ctx, "cuLinkComplete(): %d", CU_err);
+    }
+
+    return -1;
+  }
+
+  return 0;
+}
diff --git a/src/ext_nvrtc.c b/src/ext_nvrtc.c
index 91d68c0bd..ed4ce358c 100644
--- a/src/ext_nvrtc.c
+++ b/src/ext_nvrtc.c
@@ -5,8 +5,12 @@
 #include "common.h"
 #include "types.h"
+#include "memory.h"
+#include "event.h"
 #include "ext_nvrtc.h"
 
+#include "dynloader.h"
+
 int nvrtc_make_options_array_from_string (char *string, char **options)
 {
   char *saveptr = NULL;
 
@@ -25,3 +29,238 @@ int nvrtc_make_options_array_from_string (char *string, char **options)
 
   return cnt;
 }
+
+// NVRTC
+
+int nvrtc_init (void *hashcat_ctx)
+{
+  backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx;
+
+  NVRTC_PTR *nvrtc = (NVRTC_PTR *) backend_ctx->nvrtc;
+
+  memset (nvrtc, 0, sizeof (NVRTC_PTR));
+
+  #if defined (_WIN)
+  nvrtc->lib = hc_dlopen ("nvrtc.dll");
+
+  if (nvrtc->lib == NULL)
+  {
+    // super annoying: nvidia encodes the CUDA version in the nvrtc???.dll filename!
+    // however, the cuda version string comes from nvcuda.dll, which ships with the nvidia driver,
+    // and the driver version and the installed CUDA toolkit version can differ, so it cannot be used as a reference.
+    // brute force to the rescue
+
+    char dllname[100];
+
+    for (int major = 20; major >= 9; major--) // older than 3.x does not ship _v2 functions anyway
+                                              // older than 7.x does not support sm 5.x
+                                              // older than 8.x has no documentation archive online, so there is no way to check whether nvrtc supports whatever we need
+                                              // older than 9.x is just a theoretical limit, since we define 9.0 as the minimum required version
+    {
+      for (int minor = 20; minor >= 0; minor--)
+      {
+        snprintf (dllname, sizeof (dllname), "nvrtc64_%d%d.dll", major, minor);
+
+        nvrtc->lib = hc_dlopen (dllname);
+
+        if (nvrtc->lib) break;
+
+        snprintf (dllname, sizeof (dllname), "nvrtc64_%d%d_0.dll", major, minor);
+
+        nvrtc->lib = hc_dlopen (dllname);
+
+        if (nvrtc->lib) break;
+      }
+
+      if (nvrtc->lib) break;
+    }
+  }
+  #elif defined (__APPLE__)
+  nvrtc->lib = hc_dlopen ("nvrtc.dylib");
+  #elif defined (__CYGWIN__)
+  nvrtc->lib = hc_dlopen ("nvrtc.dll");
+  #else
+  nvrtc->lib = hc_dlopen ("libnvrtc.so");
+
+  if (nvrtc->lib == NULL) nvrtc->lib = hc_dlopen ("libnvrtc.so.1");
+  #endif
+
+  if (nvrtc->lib == NULL) return -1;
+
+  HC_LOAD_FUNC (nvrtc, nvrtcAddNameExpression, NVRTC_NVRTCADDNAMEEXPRESSION, NVRTC, 1);
+  HC_LOAD_FUNC (nvrtc, nvrtcCompileProgram, NVRTC_NVRTCCOMPILEPROGRAM, NVRTC, 1);
+  HC_LOAD_FUNC (nvrtc, nvrtcCreateProgram, NVRTC_NVRTCCREATEPROGRAM, NVRTC, 1);
+  HC_LOAD_FUNC (nvrtc, nvrtcDestroyProgram, NVRTC_NVRTCDESTROYPROGRAM, NVRTC, 1);
+  HC_LOAD_FUNC (nvrtc, nvrtcGetLoweredName, NVRTC_NVRTCGETLOWEREDNAME, NVRTC, 1);
+  HC_LOAD_FUNC (nvrtc, nvrtcGetPTX, NVRTC_NVRTCGETPTX, NVRTC, 1);
+  HC_LOAD_FUNC (nvrtc, nvrtcGetPTXSize, NVRTC_NVRTCGETPTXSIZE, NVRTC, 1);
+  HC_LOAD_FUNC (nvrtc, nvrtcGetProgramLog, NVRTC_NVRTCGETPROGRAMLOG, NVRTC, 1);
+  HC_LOAD_FUNC (nvrtc, nvrtcGetProgramLogSize, NVRTC_NVRTCGETPROGRAMLOGSIZE, NVRTC, 1);
+  HC_LOAD_FUNC (nvrtc, nvrtcGetErrorString, NVRTC_NVRTCGETERRORSTRING, NVRTC, 1);
+  HC_LOAD_FUNC (nvrtc, nvrtcVersion, NVRTC_NVRTCVERSION, NVRTC, 1);
+
+  return 0;
+}
+
+void nvrtc_close (void *hashcat_ctx)
+{
+  backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx;
+
+  NVRTC_PTR *nvrtc = (NVRTC_PTR *) backend_ctx->nvrtc;
+
+  if (nvrtc)
+  {
+    if (nvrtc->lib)
+    {
+      hc_dlclose (nvrtc->lib);
+    }
+
+    hcfree (backend_ctx->nvrtc);
+
+    backend_ctx->nvrtc = NULL;
+  }
+}
+
+int hc_nvrtcCreateProgram (void *hashcat_ctx, nvrtcProgram *prog, const char *src, const char *name, int numHeaders, const char * const *headers, const char * const *includeNames)
+{
+  backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx;
+
+  NVRTC_PTR *nvrtc = (NVRTC_PTR *) backend_ctx->nvrtc;
+
+  const nvrtcResult NVRTC_err = nvrtc->nvrtcCreateProgram (prog, src, name, numHeaders, headers, includeNames);
+
+  if (NVRTC_err != NVRTC_SUCCESS)
+  {
+    event_log_error (hashcat_ctx, "nvrtcCreateProgram(): %s", nvrtc->nvrtcGetErrorString (NVRTC_err));
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_nvrtcDestroyProgram (void *hashcat_ctx, nvrtcProgram *prog)
+{
+  backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx;
+
+  NVRTC_PTR *nvrtc = (NVRTC_PTR *) backend_ctx->nvrtc;
+
+  const nvrtcResult NVRTC_err = nvrtc->nvrtcDestroyProgram (prog);
+
+  if (NVRTC_err != NVRTC_SUCCESS)
+  {
+    event_log_error (hashcat_ctx, "nvrtcDestroyProgram(): %s", nvrtc->nvrtcGetErrorString (NVRTC_err));
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_nvrtcCompileProgram (void *hashcat_ctx, nvrtcProgram prog, int numOptions, const char * const *options)
+{
+  backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx;
+
+  NVRTC_PTR *nvrtc = (NVRTC_PTR *) backend_ctx->nvrtc;
+
+  const nvrtcResult NVRTC_err = nvrtc->nvrtcCompileProgram (prog, numOptions, options);
+
+  if (NVRTC_err != NVRTC_SUCCESS)
+  {
+    event_log_error (hashcat_ctx, "nvrtcCompileProgram(): %s", nvrtc->nvrtcGetErrorString (NVRTC_err));
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_nvrtcGetProgramLogSize (void *hashcat_ctx, nvrtcProgram prog, size_t *logSizeRet)
+{
+  backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx;
+
+  NVRTC_PTR *nvrtc = (NVRTC_PTR *) backend_ctx->nvrtc;
+
+  const nvrtcResult NVRTC_err = nvrtc->nvrtcGetProgramLogSize (prog, logSizeRet);
+
+  if (NVRTC_err != NVRTC_SUCCESS)
+  {
+    event_log_error (hashcat_ctx, "nvrtcGetProgramLogSize(): %s", nvrtc->nvrtcGetErrorString (NVRTC_err));
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_nvrtcGetProgramLog (void *hashcat_ctx, nvrtcProgram prog, char *log)
+{
+  backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx;
+
+  NVRTC_PTR *nvrtc = (NVRTC_PTR *) backend_ctx->nvrtc;
+
+  const nvrtcResult NVRTC_err = nvrtc->nvrtcGetProgramLog (prog, log);
+
+  if (NVRTC_err != NVRTC_SUCCESS)
+  {
+    event_log_error (hashcat_ctx, "nvrtcGetProgramLog(): %s", nvrtc->nvrtcGetErrorString (NVRTC_err));
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_nvrtcGetPTXSize (void *hashcat_ctx, nvrtcProgram prog, size_t *ptxSizeRet)
+{
+  backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx;
+
+  NVRTC_PTR *nvrtc = (NVRTC_PTR *) backend_ctx->nvrtc;
+
+  const nvrtcResult NVRTC_err = nvrtc->nvrtcGetPTXSize (prog, ptxSizeRet);
+
+  if (NVRTC_err != NVRTC_SUCCESS)
+  {
+    event_log_error (hashcat_ctx, "nvrtcGetPTXSize(): %s", nvrtc->nvrtcGetErrorString (NVRTC_err));
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_nvrtcGetPTX (void *hashcat_ctx, nvrtcProgram prog, char *ptx)
+{
+  backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx;
+
+  NVRTC_PTR *nvrtc = (NVRTC_PTR *) backend_ctx->nvrtc;
+
+  const nvrtcResult NVRTC_err = nvrtc->nvrtcGetPTX (prog, ptx);
+
+  if (NVRTC_err != NVRTC_SUCCESS)
+  {
+    event_log_error (hashcat_ctx, "nvrtcGetPTX(): %s", nvrtc->nvrtcGetErrorString (NVRTC_err));
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_nvrtcVersion (void *hashcat_ctx, int *major, int *minor)
+{
+  backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx;
+
+  NVRTC_PTR *nvrtc = (NVRTC_PTR *) backend_ctx->nvrtc;
+
+  const nvrtcResult NVRTC_err = nvrtc->nvrtcVersion (major, minor);
+
+  if (NVRTC_err != NVRTC_SUCCESS)
+  {
+    event_log_error (hashcat_ctx, "nvrtcVersion(): %s", nvrtc->nvrtcGetErrorString (NVRTC_err));
+
+    return -1;
+  }
+
+  return 0;
+}
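
The two diffs above follow one pattern throughout: the CUDA driver library is never linked at build time but loaded at runtime, each driver entry point is looked up by its exported (often _v2-suffixed) ABI name, and every CUresult is funneled through a single logging path that prefers cuGetErrorString() and normalizes failures to -1. The following standalone sketch is not hashcat code: it assumes a 64-bit Linux host with the NVIDIA driver installed, uses plain POSIX dlfcn instead of hashcat's hc_dlopen/hc_dlsym wrappers, and substitutes an ABI-compatible int for the CUresult enum so it compiles without cuda.h.

/*
 * Standalone sketch, not hashcat code. Assumes 64-bit Linux with the
 * NVIDIA driver present. Build with: cc demo.c -ldl
 */

#include <stdio.h>
#include <stddef.h>
#include <dlfcn.h>

typedef int                CUresult;    /* stand-in for the cuda.h enum (ABI assumption) */
typedef unsigned long long CUdeviceptr; /* 64-bit device pointer */

typedef CUresult (*cuInit_t)             (unsigned int);
typedef CUresult (*cuDriverGetVersion_t) (int *);
typedef CUresult (*cuMemAlloc_t)         (CUdeviceptr *, size_t);
typedef CUresult (*cuGetErrorString_t)   (CUresult, const char **);

static cuGetErrorString_t cuGetErrorString;

/* hc_cu*-style wrapper: prefer the driver's error string, fall back to the
   numeric code, normalize to -1/0 */
static int check (const char *fn, CUresult rc)
{
  if (rc == 0) return 0; /* CUDA_SUCCESS */

  const char *s = NULL;

  if (cuGetErrorString && cuGetErrorString (rc, &s) == 0)
  {
    fprintf (stderr, "%s(): %s\n", fn, s);
  }
  else
  {
    fprintf (stderr, "%s(): %d\n", fn, rc);
  }

  return -1;
}

int main (void)
{
  /* same fallback order as cuda_init() above */
  void *lib = dlopen ("libcuda.so", RTLD_NOW);

  if (lib == NULL) lib = dlopen ("libcuda.so.1", RTLD_NOW);
  if (lib == NULL) { fprintf (stderr, "cannot load libcuda\n"); return 1; }

  cuInit_t             cuInit             = (cuInit_t)             dlsym (lib, "cuInit");
  cuDriverGetVersion_t cuDriverGetVersion = (cuDriverGetVersion_t) dlsym (lib, "cuDriverGetVersion");

  /* the exported name carries the _v2 suffix, exactly like the
     HC_LOAD_FUNC_CUDA table maps cuMemAlloc -> cuMemAlloc_v2 */
  cuMemAlloc_t cuMemAlloc = (cuMemAlloc_t) dlsym (lib, "cuMemAlloc_v2");

  cuGetErrorString = (cuGetErrorString_t) dlsym (lib, "cuGetErrorString");

  /* mandatory symbols: fail hard when absent, like HC_LOAD_FUNC_CUDA with noerr == 1 */
  if (!cuInit || !cuDriverGetVersion || !cuMemAlloc) { dlclose (lib); return 1; }

  if (check ("cuInit", cuInit (0)) == 0)
  {
    int v = 0;

    if (check ("cuDriverGetVersion", cuDriverGetVersion (&v)) == 0)
    {
      /* the driver encodes the version as 1000 * major + 10 * minor */
      printf ("driver %d.%d\n", v / 1000, (v % 1000) / 10);
    }
  }

  /* cuMemAlloc is resolved but deliberately not called: a real allocation
     would first need a current context (cuCtxCreate) */

  dlclose (lib);

  return 0;
}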
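
On the NVRTC side the wrappers compose in a fixed order: create the program, compile it, fetch the compile log (size first, then buffer), and on success fetch the PTX the same way, which is why nvrtcGetProgramLogSize/nvrtcGetPTXSize exist as separate wrappers above. A caller-side sketch, again not hashcat code: it links libnvrtc directly through nvrtc.h (assuming an installed CUDA toolkit, built with -lnvrtc) instead of loading it via nvrtc_init, but the call sequence illustrates how the hc_nvrtc* wrappers are meant to be chained.

/*
 * Standalone sketch, not hashcat code. Assumes the CUDA toolkit headers
 * and libnvrtc are installed. Build with: cc demo.c -lnvrtc
 */

#include <stdio.h>
#include <stdlib.h>
#include <nvrtc.h>

int main (void)
{
  const char *src = "extern \"C\" __global__ void k (int *p) { *p = 42; }";

  nvrtcProgram prog;

  if (nvrtcCreateProgram (&prog, src, "k.cu", 0, NULL, NULL) != NVRTC_SUCCESS) return 1;

  const char *opts[] = { "--gpu-architecture=compute_50" };

  const nvrtcResult rc = nvrtcCompileProgram (prog, 1, opts);

  size_t log_size = 0;

  nvrtcGetProgramLogSize (prog, &log_size); /* size first, then buffer */

  if (log_size > 1)
  {
    char *log = (char *) malloc (log_size);

    nvrtcGetProgramLog (prog, log);

    fprintf (stderr, "%s", log);

    free (log);
  }

  if (rc == NVRTC_SUCCESS)
  {
    size_t ptx_size = 0;

    nvrtcGetPTXSize (prog, &ptx_size);

    char *ptx = (char *) malloc (ptx_size);

    nvrtcGetPTX (prog, ptx); /* PTX text, suitable for cuModuleLoadDataEx or cuLinkAddData */

    printf ("compiled %zu bytes of PTX\n", ptx_size);

    free (ptx);
  }

  nvrtcDestroyProgram (&prog);

  return rc == NVRTC_SUCCESS ? 0 : 1;
}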