CUDA Backend: moved functions to ext_cuda.c/ext_nvrtc.c and includes to ext_cuda.h/ext_nvrtc.h

2025-07-18 20:49:19 +00:00 · 2022-01-03 16:29:15 +01:00 · 2022-01-03 16:29:15 +01:00 · f8ceb8785e
commit f8ceb8785e
parent 2344cb0365
7 changed files with 2478 additions and 2454 deletions
--- a/docs/changes.txt
+++ b/docs/changes.txt
@ -39,6 +39,7 @@
 - Unit tests: Updated test.sh to set default device-type to CPU with Apple Intel and added -f (--force) option
 - OpenCL Backend: moved functions to ext_OpenCL.c and includes to ext_OpenCL.h
 - HIP Backend: moved functions to ext_hip.c/ext_hiprtc.c and includes to ext_hip.h/ext_hiprtc.h
+- CUDA Backend: moved functions to ext_cuda.c/ext_nvrtc.c and includes to ext_cuda.h/ext_nvrtc.h

 * changes v6.2.4 -> v6.2.5

--- a/include/backend.h
+++ b/include/backend.h
@ -23,61 +23,30 @@ static const char CL_VENDOR_MESA[]              = "Mesa";
 static const char CL_VENDOR_NV[]                = "NVIDIA Corporation";
 static const char CL_VENDOR_POCL[]              = "The pocl project";

-int  cuda_init    (hashcat_ctx_t *hashcat_ctx);
-void cuda_close   (hashcat_ctx_t *hashcat_ctx);
+int  backend_ctx_init                       (hashcat_ctx_t *hashcat_ctx);
+void backend_ctx_destroy                    (hashcat_ctx_t *hashcat_ctx);

-int  nvrtc_init   (hashcat_ctx_t *hashcat_ctx);
-void nvrtc_close  (hashcat_ctx_t *hashcat_ctx);
+int  backend_ctx_devices_init               (hashcat_ctx_t *hashcat_ctx, const int comptime);
+void backend_ctx_devices_destroy            (hashcat_ctx_t *hashcat_ctx);
+void backend_ctx_devices_sync_tuning        (hashcat_ctx_t *hashcat_ctx);
+void backend_ctx_devices_update_power       (hashcat_ctx_t *hashcat_ctx);
+void backend_ctx_devices_kernel_loops       (hashcat_ctx_t *hashcat_ctx);

-int hc_nvrtcCreateProgram        (hashcat_ctx_t *hashcat_ctx, nvrtcProgram *prog, const char *src, const char *name, int numHeaders, const char * const *headers, const char * const *includeNames);
-int hc_nvrtcDestroyProgram       (hashcat_ctx_t *hashcat_ctx, nvrtcProgram *prog);
-int hc_nvrtcCompileProgram       (hashcat_ctx_t *hashcat_ctx, nvrtcProgram prog, int numOptions, const char * const *options);
-int hc_nvrtcGetProgramLogSize    (hashcat_ctx_t *hashcat_ctx, nvrtcProgram prog, size_t *logSizeRet);
-int hc_nvrtcGetProgramLog        (hashcat_ctx_t *hashcat_ctx, nvrtcProgram prog, char *log);
-int hc_nvrtcGetPTXSize           (hashcat_ctx_t *hashcat_ctx, nvrtcProgram prog, size_t *ptxSizeRet);
-int hc_nvrtcGetPTX               (hashcat_ctx_t *hashcat_ctx, nvrtcProgram prog, char *ptx);
-int hc_nvrtcVersion              (hashcat_ctx_t *hashcat_ctx, int *major, int *minor);
+int  backend_session_begin                  (hashcat_ctx_t *hashcat_ctx);
+void backend_session_destroy                (hashcat_ctx_t *hashcat_ctx);
+void backend_session_reset                  (hashcat_ctx_t *hashcat_ctx);
+int  backend_session_update_combinator      (hashcat_ctx_t *hashcat_ctx);
+int  backend_session_update_mp              (hashcat_ctx_t *hashcat_ctx);
+int  backend_session_update_mp_rl           (hashcat_ctx_t *hashcat_ctx, const u32 css_cnt_l, const u32 css_cnt_r);

-int hc_cuCtxCreate               (hashcat_ctx_t *hashcat_ctx, CUcontext *pctx, unsigned int flags, CUdevice dev);
-int hc_cuCtxDestroy              (hashcat_ctx_t *hashcat_ctx, CUcontext ctx);
-int hc_cuCtxSetCurrent           (hashcat_ctx_t *hashcat_ctx, CUcontext ctx);
-int hc_cuCtxSetCacheConfig       (hashcat_ctx_t *hashcat_ctx, CUfunc_cache config);
-int hc_cuCtxSynchronize          (hashcat_ctx_t *hashcat_ctx);
-int hc_cuDeviceGetAttribute      (hashcat_ctx_t *hashcat_ctx, int *pi, CUdevice_attribute attrib, CUdevice dev);
-int hc_cuDeviceGetCount          (hashcat_ctx_t *hashcat_ctx, int *count);
-int hc_cuDeviceGet               (hashcat_ctx_t *hashcat_ctx, CUdevice *device, int ordinal);
-int hc_cuDeviceGetName           (hashcat_ctx_t *hashcat_ctx, char *name, int len, CUdevice dev);
-int hc_cuDeviceTotalMem          (hashcat_ctx_t *hashcat_ctx, size_t *bytes, CUdevice dev);
-int hc_cuDriverGetVersion        (hashcat_ctx_t *hashcat_ctx, int *driverVersion);
-int hc_cuEventCreate             (hashcat_ctx_t *hashcat_ctx, CUevent *phEvent, unsigned int Flags);
-int hc_cuEventDestroy            (hashcat_ctx_t *hashcat_ctx, CUevent hEvent);
-int hc_cuEventElapsedTime        (hashcat_ctx_t *hashcat_ctx, float *pMilliseconds, CUevent hStart, CUevent hEnd);
-int hc_cuEventQuery              (hashcat_ctx_t *hashcat_ctx, CUevent hEvent);
-int hc_cuEventRecord             (hashcat_ctx_t *hashcat_ctx, CUevent hEvent, CUstream hStream);
-int hc_cuEventSynchronize        (hashcat_ctx_t *hashcat_ctx, CUevent hEvent);
-int hc_cuFuncGetAttribute        (hashcat_ctx_t *hashcat_ctx, int *pi, CUfunction_attribute attrib, CUfunction hfunc);
-int hc_cuFuncSetAttribute        (hashcat_ctx_t *hashcat_ctx, CUfunction hfunc, CUfunction_attribute attrib, int value);
-int hc_cuInit                    (hashcat_ctx_t *hashcat_ctx, unsigned int Flags);
-int hc_cuLaunchKernel            (hashcat_ctx_t *hashcat_ctx, CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra);
-int hc_cuMemAlloc                (hashcat_ctx_t *hashcat_ctx, CUdeviceptr *dptr, size_t bytesize);
-int hc_cuMemcpyDtoDAsync         (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
-int hc_cuMemcpyDtoHAsync         (hashcat_ctx_t *hashcat_ctx, void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
-int hc_cuMemcpyHtoDAsync         (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream);
-int hc_cuMemFree                 (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dptr);
-int hc_cuMemsetD32Async          (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dstDevice, unsigned int ui, size_t N, CUstream hStream);
-int hc_cuMemsetD8Async           (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dstDevice, unsigned char uc, size_t N, CUstream hStream);
-int hc_cuModuleGetFunction       (hashcat_ctx_t *hashcat_ctx, CUfunction *hfunc, CUmodule hmod, const char *name);
-int hc_cuModuleLoadDataEx        (hashcat_ctx_t *hashcat_ctx, CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues);
-int hc_cuModuleUnload            (hashcat_ctx_t *hashcat_ctx, CUmodule hmod);
-int hc_cuStreamCreate            (hashcat_ctx_t *hashcat_ctx, CUstream *phStream, unsigned int Flags);
-int hc_cuStreamDestroy           (hashcat_ctx_t *hashcat_ctx, CUstream hStream);
-int hc_cuStreamSynchronize       (hashcat_ctx_t *hashcat_ctx, CUstream hStream);
-int hc_cuCtxPushCurrent          (hashcat_ctx_t *hashcat_ctx, CUcontext ctx);
-int hc_cuCtxPopCurrent           (hashcat_ctx_t *hashcat_ctx, CUcontext *pctx);
-int hc_cuLinkCreate              (hashcat_ctx_t *hashcat_ctx, unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut);
-int hc_cuLinkAddData             (hashcat_ctx_t *hashcat_ctx, CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name, unsigned int numOptions, CUjit_option *options, void **optionValues);
-int hc_cuLinkDestroy             (hashcat_ctx_t *hashcat_ctx, CUlinkState state);
-int hc_cuLinkComplete            (hashcat_ctx_t *hashcat_ctx, CUlinkState state, void **cubinOut, size_t *sizeOut);
+void generate_source_kernel_filename        (const bool slow_candidates, const u32 attack_exec, const u32 attack_kern, const u32 kern_type, const u32 opti_type, char *shared_dir, char *source_file);
+void generate_cached_kernel_filename        (const bool slow_candidates, const u32 attack_exec, const u32 attack_kern, const u32 kern_type, const u32 opti_type, char *cache_dir, const char *device_name_chksum, char *cached_file);
+void generate_source_kernel_shared_filename (char *shared_dir, char *source_file);
+void generate_cached_kernel_shared_filename (char *cache_dir, const char *device_name_chksum, char *cached_file);
+void generate_source_kernel_mp_filename     (const u32 opti_type, const u64 opts_type, char *shared_dir, char *source_file);
+void generate_cached_kernel_mp_filename     (const u32 opti_type, const u64 opts_type, char *cache_dir, const char *device_name_chksum, char *cached_file);
+void generate_source_kernel_amp_filename    (const u32 attack_kern, char *shared_dir, char *source_file);
+void generate_cached_kernel_amp_filename    (const u32 attack_kern, char *cache_dir, const char *device_name_chksum, char *cached_file);

 int gidd_to_pw_t                    (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u64 gidd, pw_t *pw);

@ -109,31 +78,6 @@ int run_kernel_decompress     (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *de
 int run_copy                        (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u64 pws_cnt);
 int run_cracker                     (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u64 pws_pos, const u64 pws_cnt);

-void generate_source_kernel_filename        (const bool slow_candidates, const u32 attack_exec, const u32 attack_kern, const u32 kern_type, const u32 opti_type, char *shared_dir, char *source_file);
-void generate_cached_kernel_filename        (const bool slow_candidates, const u32 attack_exec, const u32 attack_kern, const u32 kern_type, const u32 opti_type, char *cache_dir, const char *device_name_chksum, char *cached_file);
-void generate_source_kernel_shared_filename (char *shared_dir, char *source_file);
-void generate_cached_kernel_shared_filename (char *cache_dir, const char *device_name_chksum, char *cached_file);
-void generate_source_kernel_mp_filename     (const u32 opti_type, const u64 opts_type, char *shared_dir, char *source_file);
-void generate_cached_kernel_mp_filename     (const u32 opti_type, const u64 opts_type, char *cache_dir, const char *device_name_chksum, char *cached_file);
-void generate_source_kernel_amp_filename    (const u32 attack_kern, char *shared_dir, char *source_file);
-void generate_cached_kernel_amp_filename    (const u32 attack_kern, char *cache_dir, const char *device_name_chksum, char *cached_file);
-
-int  backend_ctx_init                  (hashcat_ctx_t *hashcat_ctx);
-void backend_ctx_destroy               (hashcat_ctx_t *hashcat_ctx);
-
-int  backend_ctx_devices_init          (hashcat_ctx_t *hashcat_ctx, const int comptime);
-void backend_ctx_devices_destroy       (hashcat_ctx_t *hashcat_ctx);
-void backend_ctx_devices_sync_tuning   (hashcat_ctx_t *hashcat_ctx);
-void backend_ctx_devices_update_power  (hashcat_ctx_t *hashcat_ctx);
-void backend_ctx_devices_kernel_loops  (hashcat_ctx_t *hashcat_ctx);
-
-int  backend_session_begin             (hashcat_ctx_t *hashcat_ctx);
-void backend_session_destroy           (hashcat_ctx_t *hashcat_ctx);
-void backend_session_reset             (hashcat_ctx_t *hashcat_ctx);
-int  backend_session_update_combinator (hashcat_ctx_t *hashcat_ctx);
-int  backend_session_update_mp         (hashcat_ctx_t *hashcat_ctx);
-int  backend_session_update_mp_rl      (hashcat_ctx_t *hashcat_ctx, const u32 css_cnt_l, const u32 css_cnt_r);
-
 void *hook12_thread (void *p);
 void *hook23_thread (void *p);

--- a/include/ext_cuda.h
+++ b/include/ext_cuda.h
@ -34,7 +34,8 @@ typedef struct CUmod_st *CUmodule;                        /**< CUDA module */
 typedef struct CUstream_st *CUstream;                     /**< CUDA stream */
 typedef struct CUlinkState_st *CUlinkState;

-typedef enum cudaError_enum {
+typedef enum cudaError_enum
+{
  /**
   * The API call returned with no errors. In the case of query calls, this
   * also means that the operation being queried is complete (see
@ -530,6 +531,7 @@ typedef enum cudaError_enum {
   * This indicates that an unknown internal error has occurred.
   */
  CUDA_ERROR_UNKNOWN                        = 999
+
 } CUresult;

 /**
@ -710,7 +712,8 @@ typedef enum CUjit_option_enum
 /**
 * Device properties
 */
-typedef enum CUdevice_attribute_enum {
+typedef enum CUdevice_attribute_enum
+{
  CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1,              /**< Maximum number of threads per block */
  CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2,                    /**< Maximum block dimension X */
  CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3,                    /**< Maximum block dimension Y */
@ -818,31 +821,37 @@ typedef enum CUdevice_attribute_enum {
  CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES = 100, /**< Device accesses pageable memory via the host's page tables. */
  CU_DEVICE_ATTRIBUTE_DIRECT_MANAGED_MEM_ACCESS_FROM_HOST = 101, /**< The host can directly access managed memory on the device without migration. */
  CU_DEVICE_ATTRIBUTE_MAX
+
 } CUdevice_attribute;

 /**
 * Function cache configurations
 */
-typedef enum CUfunc_cache_enum {
+typedef enum CUfunc_cache_enum
+{
  CU_FUNC_CACHE_PREFER_NONE    = 0x00, /**< no preference for shared memory or L1 (default) */
  CU_FUNC_CACHE_PREFER_SHARED  = 0x01, /**< prefer larger shared memory and smaller L1 cache */
  CU_FUNC_CACHE_PREFER_L1      = 0x02, /**< prefer larger L1 cache and smaller shared memory */
  CU_FUNC_CACHE_PREFER_EQUAL   = 0x03  /**< prefer equal sized L1 cache and shared memory */
+
 } CUfunc_cache;

 /**
 * Shared memory configurations
 */
-typedef enum CUsharedconfig_enum {
+typedef enum CUsharedconfig_enum
+{
  CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE    = 0x00, /**< set default shared memory bank size */
  CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE  = 0x01, /**< set shared memory bank width to four bytes */
  CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE = 0x02  /**< set shared memory bank width to eight bytes */
+
 } CUsharedconfig;

 /**
 * Function properties
 */
-typedef enum CUfunction_attribute_enum {
+typedef enum CUfunction_attribute_enum
+{
  /**
   * The maximum number of threads per block, beyond which a launch of the
   * function would fail. This number depends on both the function and the
@ -915,12 +924,14 @@ typedef enum CUfunction_attribute_enum {
  CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT = 9,

  CU_FUNC_ATTRIBUTE_MAX
+
 } CUfunction_attribute;

 /**
 * Context creation flags
 */
-typedef enum CUctx_flags_enum {
+typedef enum CUctx_flags_enum
+{
  CU_CTX_SCHED_AUTO          = 0x00, /**< Automatic scheduling */
  CU_CTX_SCHED_SPIN          = 0x01, /**< Set spin as default scheduling */
  CU_CTX_SCHED_YIELD         = 0x02, /**< Set yield as default scheduling */
@ -932,24 +943,29 @@ typedef enum CUctx_flags_enum {
  CU_CTX_MAP_HOST            = 0x08, /**< Support mapped pinned allocations */
  CU_CTX_LMEM_RESIZE_TO_MAX  = 0x10, /**< Keep local memory allocation after launch */
  CU_CTX_FLAGS_MASK          = 0x1f
+
 } CUctx_flags;

 /**
 * Stream creation flags
 */
-typedef enum CUstream_flags_enum {
+typedef enum CUstream_flags_enum
+{
  CU_STREAM_DEFAULT      = 0x0, /**< Default stream flag */
  CU_STREAM_NON_BLOCKING = 0x1  /**< Stream does not synchronize with stream 0 (the NULL stream) */
+
 } CUstream_flags;

 /**
 * Event creation flags
 */
-typedef enum CUevent_flags_enum {
+typedef enum CUevent_flags_enum
+{
  CU_EVENT_DEFAULT        = 0x0, /**< Default event flag */
  CU_EVENT_BLOCKING_SYNC  = 0x1, /**< Event uses blocking synchronization */
  CU_EVENT_DISABLE_TIMING = 0x2, /**< Event will not record timing data */
  CU_EVENT_INTERPROCESS   = 0x4  /**< Event is suitable for interprocess use. CU_EVENT_DISABLE_TIMING must be set */
+
 } CUevent_flags;

 typedef enum CUjitInputType_enum
@ -985,6 +1001,7 @@ typedef enum CUjitInputType_enum
  CU_JIT_INPUT_LIBRARY,

  CU_JIT_NUM_INPUT_TYPES
+
 } CUjitInputType;

 #ifdef _WIN32
@ -1119,4 +1136,50 @@ typedef struct hc_cuda_lib

 typedef hc_cuda_lib_t CUDA_PTR;

+int  cuda_init                 (void *hashcat_ctx);
+void cuda_close                (void *hashcat_ctx);
+
+int hc_cuCtxCreate             (void *hashcat_ctx, CUcontext *pctx, unsigned int flags, CUdevice dev);
+int hc_cuCtxDestroy            (void *hashcat_ctx, CUcontext ctx);
+int hc_cuCtxSetCurrent         (void *hashcat_ctx, CUcontext ctx);
+int hc_cuCtxSetCacheConfig     (void *hashcat_ctx, CUfunc_cache config);
+int hc_cuCtxSynchronize        (void *hashcat_ctx);
+int hc_cuDeviceGetAttribute    (void *hashcat_ctx, int *pi, CUdevice_attribute attrib, CUdevice dev);
+int hc_cuDeviceGetCount        (void *hashcat_ctx, int *count);
+int hc_cuDeviceGet             (void *hashcat_ctx, CUdevice *device, int ordinal);
+int hc_cuDeviceGetName         (void *hashcat_ctx, char *name, int len, CUdevice dev);
+int hc_cuDeviceTotalMem        (void *hashcat_ctx, size_t *bytes, CUdevice dev);
+int hc_cuDriverGetVersion      (void *hashcat_ctx, int *driverVersion);
+int hc_cuEventCreate           (void *hashcat_ctx, CUevent *phEvent, unsigned int Flags);
+int hc_cuEventDestroy          (void *hashcat_ctx, CUevent hEvent);
+int hc_cuEventElapsedTime      (void *hashcat_ctx, float *pMilliseconds, CUevent hStart, CUevent hEnd);
+int hc_cuEventQuery            (void *hashcat_ctx, CUevent hEvent);
+int hc_cuEventRecord           (void *hashcat_ctx, CUevent hEvent, CUstream hStream);
+int hc_cuEventSynchronize      (void *hashcat_ctx, CUevent hEvent);
+int hc_cuFuncGetAttribute      (void *hashcat_ctx, int *pi, CUfunction_attribute attrib, CUfunction hfunc);
+int hc_cuFuncSetAttribute      (void *hashcat_ctx, CUfunction hfunc, CUfunction_attribute attrib, int value);
+int hc_cuInit                  (void *hashcat_ctx, unsigned int Flags);
+int hc_cuLaunchKernel          (void *hashcat_ctx, CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra);
+int hc_cuMemAlloc              (void *hashcat_ctx, CUdeviceptr *dptr, size_t bytesize);
+int hc_cuMemcpyDtoDAsync       (void *hashcat_ctx, CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
+int hc_cuMemcpyDtoHAsync       (void *hashcat_ctx, void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
+int hc_cuMemcpyHtoDAsync       (void *hashcat_ctx, CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream);
+int hc_cuMemFree               (void *hashcat_ctx, CUdeviceptr dptr);
+int hc_cuMemGetInfo            (void *hashcat_ctx, size_t *free, size_t *total);
+int hc_cuMemsetD32Async        (void *hashcat_ctx, CUdeviceptr dstDevice, unsigned int ui, size_t N, CUstream hStream);
+int hc_cuMemsetD8Async         (void *hashcat_ctx, CUdeviceptr dstDevice, unsigned char uc, size_t N, CUstream hStream);
+int hc_cuModuleGetFunction     (void *hashcat_ctx, CUfunction *hfunc, CUmodule hmod, const char *name);
+int hc_cuModuleGetGlobal       (void *hashcat_ctx, CUdeviceptr *dptr, size_t *bytes, CUmodule hmod, const char *name);
+int hc_cuModuleLoadDataEx      (void *hashcat_ctx, CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues);
+int hc_cuModuleUnload          (void *hashcat_ctx, CUmodule hmod);
+int hc_cuStreamCreate          (void *hashcat_ctx, CUstream *phStream, unsigned int Flags);
+int hc_cuStreamDestroy         (void *hashcat_ctx, CUstream hStream);
+int hc_cuStreamSynchronize     (void *hashcat_ctx, CUstream hStream);
+int hc_cuCtxPushCurrent        (void *hashcat_ctx, CUcontext ctx);
+int hc_cuCtxPopCurrent         (void *hashcat_ctx, CUcontext *pctx);
+int hc_cuLinkCreate            (void *hashcat_ctx, unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut);
+int hc_cuLinkAddData           (void *hashcat_ctx, CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name, unsigned int numOptions, CUjit_option *options, void **optionValues);
+int hc_cuLinkDestroy           (void *hashcat_ctx, CUlinkState state);
+int hc_cuLinkComplete          (void *hashcat_ctx, CUlinkState state, void **cubinOut, size_t *sizeOut);
+
 #endif // _EXT_CUDA_H
--- a/include/ext_nvrtc.h
+++ b/include/ext_nvrtc.h
@ -84,4 +84,16 @@ typedef hc_nvrtc_lib_t NVRTC_PTR;

 int nvrtc_make_options_array_from_string (char *string, char **options);

+int  nvrtc_init                (void *hashcat_ctx);
+void nvrtc_close               (void *hashcat_ctx);
+
+int hc_nvrtcCreateProgram      (void *hashcat_ctx, nvrtcProgram *prog, const char *src, const char *name, int numHeaders, const char * const *headers, const char * const *includeNames);
+int hc_nvrtcDestroyProgram     (void *hashcat_ctx, nvrtcProgram *prog);
+int hc_nvrtcCompileProgram     (void *hashcat_ctx, nvrtcProgram prog, int numOptions, const char * const *options);
+int hc_nvrtcGetProgramLogSize  (void *hashcat_ctx, nvrtcProgram prog, size_t *logSizeRet);
+int hc_nvrtcGetProgramLog      (void *hashcat_ctx, nvrtcProgram prog, char *log);
+int hc_nvrtcGetPTXSize         (void *hashcat_ctx, nvrtcProgram prog, size_t *ptxSizeRet);
+int hc_nvrtcGetPTX             (void *hashcat_ctx, nvrtcProgram prog, char *ptx);
+int hc_nvrtcVersion            (void *hashcat_ctx, int *major, int *minor);
+
 #endif // _EXT_NVRTC_H
--- a/src/backend.c
+++ b/src/backend.c
--- a/src/ext_cuda.c
+++ b/src/ext_cuda.c
--- a/src/ext_nvrtc.c
+++ b/src/ext_nvrtc.c
@ -5,8 +5,12 @@

 #include "common.h"
 #include "types.h"
+#include "memory.h"
+#include "event.h"
 #include "ext_nvrtc.h"

+#include "dynloader.h"
+
 int nvrtc_make_options_array_from_string (char *string, char **options)
 {
  char *saveptr = NULL;
@ -25,3 +29,238 @@ int nvrtc_make_options_array_from_string (char *string, char **options)

  return cnt;
 }
+
+// NVRTC
+
+// Opens the platform-specific NVRTC shared library and resolves the NVRTC
+// entry points listed at the bottom into the NVRTC_PTR table that
+// backend_ctx->nvrtc points to. The table is zeroed first.
+// Returns -1 if no library could be opened, 0 once all loads are done.
+// NOTE(review): HC_LOAD_FUNC is defined outside this file (dynloader.h);
+// presumably it can also leave this function early when a symbol is missing - confirm.
+int nvrtc_init (void *hashcat_ctx)
+{
+  hashcat_ctx_t *ctx = (hashcat_ctx_t *) hashcat_ctx;
+
+  NVRTC_PTR *nvrtc = (NVRTC_PTR *) ctx->backend_ctx->nvrtc;
+
+  memset (nvrtc, 0, sizeof (*nvrtc));
+
+  #if defined (_WIN)
+  nvrtc->lib = hc_dlopen ("nvrtc.dll");
+
+  if (nvrtc->lib == NULL)
+  {
+    // The generic name was not found. NVIDIA encodes the CUDA version in the
+    // nvrtc DLL filename, and the version reported by nvcuda.dll belongs to the
+    // driver, which may differ from the installed CUDA toolkit, so it cannot be
+    // used to build the name. Instead, probe every plausible version, newest first.
+    //
+    // Why the major version stops at 9:
+    // - older than 3.x do not ship _v2 functions anyway
+    // - older than 7.x does not support sm 5.x
+    // - older than 8.x has no documentation archive online, so nvrtc support cannot be checked
+    // - 9.0 is defined as the minimum required version, so anything below is theoretical
+
+    char candidate[100];
+
+    // Both loops stop as soon as a library handle has been obtained.
+
+    for (int major = 20; (major >= 9) && (nvrtc->lib == NULL); major--)
+    {
+      for (int minor = 20; (minor >= 0) && (nvrtc->lib == NULL); minor--)
+      {
+        snprintf (candidate, sizeof (candidate), "nvrtc64_%d%d.dll", major, minor);
+
+        nvrtc->lib = hc_dlopen (candidate);
+
+        if (nvrtc->lib == NULL)
+        {
+          // same version, alternative naming scheme with a "_0" suffix
+
+          snprintf (candidate, sizeof (candidate), "nvrtc64_%d%d_0.dll", major, minor);
+
+          nvrtc->lib = hc_dlopen (candidate);
+        }
+      }
+    }
+  }
+  #elif defined (__APPLE__)
+  nvrtc->lib = hc_dlopen ("nvrtc.dylib");
+  #elif defined (__CYGWIN__)
+  nvrtc->lib = hc_dlopen ("nvrtc.dll");
+  #else
+  nvrtc->lib = hc_dlopen ("libnvrtc.so");
+
+  if (nvrtc->lib == NULL)
+  {
+    nvrtc->lib = hc_dlopen ("libnvrtc.so.1");
+  }
+  #endif
+
+  if (!nvrtc->lib) return -1;
+
+  HC_LOAD_FUNC (nvrtc, nvrtcAddNameExpression,  NVRTC_NVRTCADDNAMEEXPRESSION, NVRTC, 1);
+  HC_LOAD_FUNC (nvrtc, nvrtcCompileProgram,     NVRTC_NVRTCCOMPILEPROGRAM,    NVRTC, 1);
+  HC_LOAD_FUNC (nvrtc, nvrtcCreateProgram,      NVRTC_NVRTCCREATEPROGRAM,     NVRTC, 1);
+  HC_LOAD_FUNC (nvrtc, nvrtcDestroyProgram,     NVRTC_NVRTCDESTROYPROGRAM,    NVRTC, 1);
+  HC_LOAD_FUNC (nvrtc, nvrtcGetLoweredName,     NVRTC_NVRTCGETLOWEREDNAME,    NVRTC, 1);
+  HC_LOAD_FUNC (nvrtc, nvrtcGetPTX,             NVRTC_NVRTCGETPTX,            NVRTC, 1);
+  HC_LOAD_FUNC (nvrtc, nvrtcGetPTXSize,         NVRTC_NVRTCGETPTXSIZE,        NVRTC, 1);
+  HC_LOAD_FUNC (nvrtc, nvrtcGetProgramLog,      NVRTC_NVRTCGETPROGRAMLOG,     NVRTC, 1);
+  HC_LOAD_FUNC (nvrtc, nvrtcGetProgramLogSize,  NVRTC_NVRTCGETPROGRAMLOGSIZE, NVRTC, 1);
+  HC_LOAD_FUNC (nvrtc, nvrtcGetErrorString,     NVRTC_NVRTCGETERRORSTRING,    NVRTC, 1);
+  HC_LOAD_FUNC (nvrtc, nvrtcVersion,            NVRTC_NVRTCVERSION,           NVRTC, 1);
+
+  return 0;
+}
+
+// Releases the NVRTC table held in backend_ctx->nvrtc: closes the library
+// handle if one is open, frees the table and resets the pointer to NULL.
+// Does nothing when backend_ctx->nvrtc is already NULL.
+void nvrtc_close (void *hashcat_ctx)
+{
+  hashcat_ctx_t *ctx = (hashcat_ctx_t *) hashcat_ctx;
+
+  backend_ctx_t *backend_ctx = ctx->backend_ctx;
+
+  NVRTC_PTR *nvrtc = (NVRTC_PTR *) backend_ctx->nvrtc;
+
+  if (nvrtc == NULL) return;
+
+  if (nvrtc->lib)
+  {
+    hc_dlclose (nvrtc->lib);
+  }
+
+  hcfree (backend_ctx->nvrtc);
+
+  backend_ctx->nvrtc = NULL;
+}
+
+// Forwards to nvrtcCreateProgram() through the loaded NVRTC table.
+// Returns 0 on NVRTC_SUCCESS; otherwise logs the NVRTC error string and returns -1.
+int hc_nvrtcCreateProgram (void *hashcat_ctx, nvrtcProgram *prog, const char *src, const char *name, int numHeaders, const char * const *headers, const char * const *includeNames)
+{
+  hashcat_ctx_t *ctx = (hashcat_ctx_t *) hashcat_ctx;
+
+  NVRTC_PTR *nvrtc = (NVRTC_PTR *) ctx->backend_ctx->nvrtc;
+
+  const nvrtcResult rc = nvrtc->nvrtcCreateProgram (prog, src, name, numHeaders, headers, includeNames);
+
+  if (rc == NVRTC_SUCCESS) return 0;
+
+  event_log_error (hashcat_ctx, "nvrtcCreateProgram(): %s", nvrtc->nvrtcGetErrorString (rc));
+
+  return -1;
+}
+
+// Forwards to nvrtcDestroyProgram() through the loaded NVRTC table.
+// Returns 0 on NVRTC_SUCCESS; otherwise logs the NVRTC error string and returns -1.
+int hc_nvrtcDestroyProgram (void *hashcat_ctx, nvrtcProgram *prog)
+{
+  hashcat_ctx_t *ctx = (hashcat_ctx_t *) hashcat_ctx;
+
+  NVRTC_PTR *nvrtc = (NVRTC_PTR *) ctx->backend_ctx->nvrtc;
+
+  const nvrtcResult rc = nvrtc->nvrtcDestroyProgram (prog);
+
+  if (rc == NVRTC_SUCCESS) return 0;
+
+  event_log_error (hashcat_ctx, "nvrtcDestroyProgram(): %s", nvrtc->nvrtcGetErrorString (rc));
+
+  return -1;
+}
+
+// Forwards to nvrtcCompileProgram() through the loaded NVRTC table.
+// Returns 0 on NVRTC_SUCCESS; otherwise logs the NVRTC error string and returns -1.
+int hc_nvrtcCompileProgram (void *hashcat_ctx, nvrtcProgram prog, int numOptions, const char * const *options)
+{
+  hashcat_ctx_t *ctx = (hashcat_ctx_t *) hashcat_ctx;
+
+  NVRTC_PTR *nvrtc = (NVRTC_PTR *) ctx->backend_ctx->nvrtc;
+
+  const nvrtcResult rc = nvrtc->nvrtcCompileProgram (prog, numOptions, options);
+
+  if (rc == NVRTC_SUCCESS) return 0;
+
+  event_log_error (hashcat_ctx, "nvrtcCompileProgram(): %s", nvrtc->nvrtcGetErrorString (rc));
+
+  return -1;
+}
+
+// Forwards to nvrtcGetProgramLogSize() through the loaded NVRTC table.
+// Returns 0 on NVRTC_SUCCESS; otherwise logs the NVRTC error string and returns -1.
+int hc_nvrtcGetProgramLogSize (void *hashcat_ctx, nvrtcProgram prog, size_t *logSizeRet)
+{
+  hashcat_ctx_t *ctx = (hashcat_ctx_t *) hashcat_ctx;
+
+  NVRTC_PTR *nvrtc = (NVRTC_PTR *) ctx->backend_ctx->nvrtc;
+
+  const nvrtcResult rc = nvrtc->nvrtcGetProgramLogSize (prog, logSizeRet);
+
+  if (rc == NVRTC_SUCCESS) return 0;
+
+  event_log_error (hashcat_ctx, "nvrtcGetProgramLogSize(): %s", nvrtc->nvrtcGetErrorString (rc));
+
+  return -1;
+}
+
+// Forwards to nvrtcGetProgramLog() through the loaded NVRTC table.
+// Returns 0 on NVRTC_SUCCESS; otherwise logs the NVRTC error string and returns -1.
+int hc_nvrtcGetProgramLog (void *hashcat_ctx, nvrtcProgram prog, char *log)
+{
+  hashcat_ctx_t *ctx = (hashcat_ctx_t *) hashcat_ctx;
+
+  NVRTC_PTR *nvrtc = (NVRTC_PTR *) ctx->backend_ctx->nvrtc;
+
+  const nvrtcResult rc = nvrtc->nvrtcGetProgramLog (prog, log);
+
+  if (rc == NVRTC_SUCCESS) return 0;
+
+  event_log_error (hashcat_ctx, "nvrtcGetProgramLog(): %s", nvrtc->nvrtcGetErrorString (rc));
+
+  return -1;
+}
+
+// Forwards to nvrtcGetPTXSize() through the loaded NVRTC table.
+// Returns 0 on NVRTC_SUCCESS; otherwise logs the NVRTC error string and returns -1.
+int hc_nvrtcGetPTXSize (void *hashcat_ctx, nvrtcProgram prog, size_t *ptxSizeRet)
+{
+  hashcat_ctx_t *ctx = (hashcat_ctx_t *) hashcat_ctx;
+
+  NVRTC_PTR *nvrtc = (NVRTC_PTR *) ctx->backend_ctx->nvrtc;
+
+  const nvrtcResult rc = nvrtc->nvrtcGetPTXSize (prog, ptxSizeRet);
+
+  if (rc == NVRTC_SUCCESS) return 0;
+
+  event_log_error (hashcat_ctx, "nvrtcGetPTXSize(): %s", nvrtc->nvrtcGetErrorString (rc));
+
+  return -1;
+}
+
+// Forwards to nvrtcGetPTX() through the loaded NVRTC table.
+// Returns 0 on NVRTC_SUCCESS; otherwise logs the NVRTC error string and returns -1.
+int hc_nvrtcGetPTX (void *hashcat_ctx, nvrtcProgram prog, char *ptx)
+{
+  hashcat_ctx_t *ctx = (hashcat_ctx_t *) hashcat_ctx;
+
+  NVRTC_PTR *nvrtc = (NVRTC_PTR *) ctx->backend_ctx->nvrtc;
+
+  const nvrtcResult rc = nvrtc->nvrtcGetPTX (prog, ptx);
+
+  if (rc == NVRTC_SUCCESS) return 0;
+
+  event_log_error (hashcat_ctx, "nvrtcGetPTX(): %s", nvrtc->nvrtcGetErrorString (rc));
+
+  return -1;
+}
+
+// Forwards to nvrtcVersion() through the loaded NVRTC table.
+// Returns 0 on NVRTC_SUCCESS; otherwise logs the NVRTC error string and returns -1.
+int hc_nvrtcVersion (void *hashcat_ctx, int *major, int *minor)
+{
+  hashcat_ctx_t *ctx = (hashcat_ctx_t *) hashcat_ctx;
+
+  NVRTC_PTR *nvrtc = (NVRTC_PTR *) ctx->backend_ctx->nvrtc;
+
+  const nvrtcResult rc = nvrtc->nvrtcVersion (major, minor);
+
+  if (rc == NVRTC_SUCCESS) return 0;
+
+  event_log_error (hashcat_ctx, "nvrtcVersion(): %s", nvrtc->nvrtcGetErrorString (rc));
+
+  return -1;
+}