Cache cubin instead of PTX to decrease startup time

2025-07-18 20:49:19 +00:00 · 2020-01-29 15:56:36 +01:00 · 2020-01-29 15:56:36 +01:00 · 66ae5125ce
commit 66ae5125ce
parent cc4fd48ace
3 changed files with 247 additions and 13 deletions
--- a/include/backend.h
+++ b/include/backend.h
@ -75,6 +75,10 @@ int hc_cuStreamDestroy           (hashcat_ctx_t *hashcat_ctx, CUstream hStream);
 int hc_cuStreamSynchronize       (hashcat_ctx_t *hashcat_ctx, CUstream hStream);
 int hc_cuCtxPushCurrent          (hashcat_ctx_t *hashcat_ctx, CUcontext ctx);
 int hc_cuCtxPopCurrent           (hashcat_ctx_t *hashcat_ctx, CUcontext *pctx);
+int hc_cuLinkCreate              (hashcat_ctx_t *hashcat_ctx, unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut);
+int hc_cuLinkAddData             (hashcat_ctx_t *hashcat_ctx, CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name, unsigned int numOptions, CUjit_option *options, void **optionValues);
+int hc_cuLinkDestroy             (hashcat_ctx_t *hashcat_ctx, CUlinkState state);
+int hc_cuLinkComplete            (hashcat_ctx_t *hashcat_ctx, CUlinkState state, void **cubinOut, size_t *sizeOut);

 int hc_clBuildProgram            (hashcat_ctx_t *hashcat_ctx, cl_program program, cl_uint num_devices, const cl_device_id *device_list, const char *options, void (CL_CALLBACK *pfn_notify) (cl_program program, void *user_data), void *user_data);
 int hc_clCreateBuffer            (hashcat_ctx_t *hashcat_ctx, cl_context context, cl_mem_flags flags, size_t size, void *host_ptr, cl_mem *mem);
--- a/include/ext_cuda.h
+++ b/include/ext_cuda.h
@ -32,6 +32,7 @@ typedef struct CUevent_st *CUevent;                       /**< CUDA event */
 typedef struct CUfunc_st *CUfunction;                     /**< CUDA function */
 typedef struct CUmod_st *CUmodule;                        /**< CUDA module */
 typedef struct CUstream_st *CUstream;                     /**< CUDA stream */
+typedef struct CUlinkState_st *CUlinkState;

 typedef enum cudaError_enum {
    /**
@ -951,6 +952,41 @@ typedef enum CUevent_flags_enum {
    CU_EVENT_INTERPROCESS   = 0x4  /**< Event is suitable for interprocess use. CU_EVENT_DISABLE_TIMING must be set */
 } CUevent_flags;

+typedef enum CUjitInputType_enum
+{
+    /**
+     * Compiled device-class-specific device code\n
+     * Applicable options: none
+     */
+    CU_JIT_INPUT_CUBIN = 0,
+
+    /**
+     * PTX source code\n
+     * Applicable options: PTX compiler options
+     */
+    CU_JIT_INPUT_PTX,
+
+    /**
+     * Bundle of multiple cubins and/or PTX of some device code\n
+     * Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY
+     */
+    CU_JIT_INPUT_FATBINARY,
+
+    /**
+     * Host object with embedded device code\n
+     * Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY
+     */
+    CU_JIT_INPUT_OBJECT,
+
+    /**
+     * Archive of host objects with embedded device code\n
+     * Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY
+     */
+    CU_JIT_INPUT_LIBRARY,
+
+    CU_JIT_NUM_INPUT_TYPES /* last enumerator: its value equals the number of input types listed above */
+} CUjitInputType;
+
 #ifdef _WIN32
 #define CUDAAPI __stdcall
 #else
@ -1012,6 +1048,10 @@ typedef CUresult (CUDA_API_CALL *CUDA_CUSTREAMCREATE)           (CUstream *, uns
 typedef CUresult (CUDA_API_CALL *CUDA_CUSTREAMDESTROY)          (CUstream);
 typedef CUresult (CUDA_API_CALL *CUDA_CUSTREAMSYNCHRONIZE)      (CUstream);
 typedef CUresult (CUDA_API_CALL *CUDA_CUSTREAMWAITEVENT)        (CUstream, CUevent, unsigned int);
+typedef CUresult (CUDA_API_CALL *CUDA_CULINKCREATE)             (unsigned int, CUjit_option *, void **, CUlinkState *);
+typedef CUresult (CUDA_API_CALL *CUDA_CULINKADDDATA)            (CUlinkState, CUjitInputType, void *, size_t, const char *, unsigned int, CUjit_option *, void **);
+typedef CUresult (CUDA_API_CALL *CUDA_CULINKDESTROY)            (CUlinkState);
+typedef CUresult (CUDA_API_CALL *CUDA_CULINKCOMPLETE)           (CUlinkState, void **, size_t *);

 typedef struct hc_cuda_lib
 {
@ -1070,6 +1110,10 @@ typedef struct hc_cuda_lib
  CUDA_CUSTREAMDESTROY          cuStreamDestroy;
  CUDA_CUSTREAMSYNCHRONIZE      cuStreamSynchronize;
  CUDA_CUSTREAMWAITEVENT        cuStreamWaitEvent;
+  CUDA_CULINKCREATE             cuLinkCreate;
+  CUDA_CULINKADDDATA            cuLinkAddData;
+  CUDA_CULINKDESTROY            cuLinkDestroy;
+  CUDA_CULINKCOMPLETE           cuLinkComplete;

 } hc_cuda_lib_t;

--- a/src/backend.c
+++ b/src/backend.c
@ -998,6 +998,10 @@ int cuda_init (hashcat_ctx_t *hashcat_ctx)
  HC_LOAD_FUNC_CUDA (cuda, cuStreamDestroy,          cuStreamDestroy_v2,        CUDA_CUSTREAMDESTROY,           CUDA, 1);
  HC_LOAD_FUNC_CUDA (cuda, cuStreamSynchronize,      cuStreamSynchronize,       CUDA_CUSTREAMSYNCHRONIZE,       CUDA, 1);
  HC_LOAD_FUNC_CUDA (cuda, cuStreamWaitEvent,        cuStreamWaitEvent,         CUDA_CUSTREAMWAITEVENT,         CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuLinkCreate,             cuLinkCreate_v2,           CUDA_CULINKCREATE,              CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuLinkAddData,            cuLinkAddData_v2,          CUDA_CULINKADDDATA,             CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuLinkDestroy,            cuLinkDestroy,             CUDA_CULINKDESTROY,             CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuLinkComplete,           cuLinkComplete,            CUDA_CULINKCOMPLETE,            CUDA, 1);

  return 0;
 }
@ -2040,6 +2044,113 @@ int hc_cuCtxPopCurrent (hashcat_ctx_t *hashcat_ctx, CUcontext *pctx)
  return 0;
 }

+int hc_cuLinkCreate (hashcat_ctx_t *hashcat_ctx, unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut)
+{ // Wrapper around cuda->cuLinkCreate: returns 0 on CUDA_SUCCESS, otherwise logs the failure and returns -1.
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; // holds the cuLinkCreate and cuGetErrorString function pointers used below
+
+  const CUresult CU_err = cuda->cuLinkCreate (numOptions, options, optionValues, stateOut); // arguments are forwarded unchanged
+
+  if (CU_err != CUDA_SUCCESS)
+  {
+    const char *pStr = NULL;
+
+    if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) // prefer the textual error description when the lookup succeeds
+    {
+      event_log_error (hashcat_ctx, "cuLinkCreate(): %s", pStr);
+    }
+    else // lookup failed: fall back to logging the numeric CUresult value
+    {
+      event_log_error (hashcat_ctx, "cuLinkCreate(): %d", CU_err);
+    }
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_cuLinkAddData (hashcat_ctx_t *hashcat_ctx, CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name, unsigned int numOptions, CUjit_option *options, void **optionValues)
+{ // Wrapper around cuda->cuLinkAddData: returns 0 on CUDA_SUCCESS, otherwise logs the failure and returns -1.
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; // holds the cuLinkAddData and cuGetErrorString function pointers used below
+
+  const CUresult CU_err = cuda->cuLinkAddData (state, type, data, size, name, numOptions, options, optionValues); // arguments are forwarded unchanged
+
+  if (CU_err != CUDA_SUCCESS)
+  {
+    const char *pStr = NULL;
+
+    if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) // prefer the textual error description when the lookup succeeds
+    {
+      event_log_error (hashcat_ctx, "cuLinkAddData(): %s", pStr);
+    }
+    else // lookup failed: fall back to logging the numeric CUresult value
+    {
+      event_log_error (hashcat_ctx, "cuLinkAddData(): %d", CU_err);
+    }
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_cuLinkDestroy (hashcat_ctx_t *hashcat_ctx, CUlinkState state)
+{ // Wrapper around cuda->cuLinkDestroy: returns 0 on CUDA_SUCCESS, otherwise logs the failure and returns -1.
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; // holds the cuLinkDestroy and cuGetErrorString function pointers used below
+
+  const CUresult CU_err = cuda->cuLinkDestroy (state);
+
+  if (CU_err != CUDA_SUCCESS)
+  {
+    const char *pStr = NULL;
+
+    if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) // prefer the textual error description when the lookup succeeds
+    {
+      event_log_error (hashcat_ctx, "cuLinkDestroy(): %s", pStr);
+    }
+    else // lookup failed: fall back to logging the numeric CUresult value
+    {
+      event_log_error (hashcat_ctx, "cuLinkDestroy(): %d", CU_err);
+    }
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_cuLinkComplete (hashcat_ctx_t *hashcat_ctx, CUlinkState state, void **cubinOut, size_t *sizeOut)
+{ // Wrapper around cuda->cuLinkComplete: returns 0 on CUDA_SUCCESS, otherwise logs the failure and returns -1.
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; // holds the cuLinkComplete and cuGetErrorString function pointers used below
+
+  const CUresult CU_err = cuda->cuLinkComplete (state, cubinOut, sizeOut); // NOTE(review): the image written to *cubinOut is presumably owned by `state` and only valid until hc_cuLinkDestroy - confirm against the CUDA driver docs
+
+  if (CU_err != CUDA_SUCCESS)
+  {
+    const char *pStr = NULL;
+
+    if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) // prefer the textual error description when the lookup succeeds
+    {
+      event_log_error (hashcat_ctx, "cuLinkComplete(): %s", pStr);
+    }
+    else // lookup failed: fall back to logging the numeric CUresult value
+    {
+      event_log_error (hashcat_ctx, "cuLinkComplete(): %d", CU_err);
+    }
+
+    return -1;
+  }
+
+  return 0;
+}

 // OpenCL

@ -7438,18 +7549,41 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)

          if (hc_nvrtcDestroyProgram (hashcat_ctx, &program) == -1) return -1;

-          const int rc_cuModuleLoadDataEx = hc_cuModuleLoadDataExLog (hashcat_ctx, &device_param->cuda_module, binary);
+          CUlinkState state;

-          if (rc_cuModuleLoadDataEx == -1) return -1;
+          if (hc_cuLinkCreate (hashcat_ctx, 0, NULL, NULL, &state) == -1) return -1;
+
+          if (hc_cuLinkAddData (hashcat_ctx, state, CU_JIT_INPUT_PTX, binary, binary_size, "kernel", 0, NULL, NULL) == -1) return -1;
+
+          void *cubin = NULL;
+
+          size_t cubin_size = 0;
+
+          if (hc_cuLinkComplete (hashcat_ctx, state, &cubin, &cubin_size) == -1) return -1;
+
+          #ifdef DEBUG
+
+          if (hc_cuModuleLoadDataExLog (hashcat_ctx, &device_param->cuda_module, binary) == -1) return -1;

          if (cache_disable == false)
          {
-            const bool rc_write = write_kernel_binary (hashcat_ctx, cached_file, binary, binary_size);
-
-            if (rc_write == false) return -1;
+            if (write_kernel_binary (hashcat_ctx, cached_file, binary, binary_size) == false) return -1;
          }

+          #else
+
+          if (hc_cuModuleLoadDataExLog (hashcat_ctx, &device_param->cuda_module, cubin) == -1) return -1;
+
+          if (cache_disable == false)
+          {
+            if (write_kernel_binary (hashcat_ctx, cached_file, cubin, cubin_size) == false) return -1;
+          }
+
+          #endif
+
          hcfree (binary);
+
+          if (hc_cuLinkDestroy (hashcat_ctx, state) == -1) return -1;
        }

        if (device_param->is_opencl == true)
@ -7662,20 +7796,41 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)

            if (hc_nvrtcDestroyProgram (hashcat_ctx, &program) == -1) return -1;

-            // tbd: check for some useful options
+            CUlinkState state;

-            const int rc_cuModuleLoadDataEx = hc_cuModuleLoadDataExLog (hashcat_ctx, &device_param->cuda_module_mp, binary);
+            if (hc_cuLinkCreate (hashcat_ctx, 0, NULL, NULL, &state) == -1) return -1;

-            if (rc_cuModuleLoadDataEx == -1) return -1;
+            if (hc_cuLinkAddData (hashcat_ctx, state, CU_JIT_INPUT_PTX, binary, binary_size, "mp_kernel", 0, NULL, NULL) == -1) return -1;
+
+            void *cubin = NULL;
+
+            size_t cubin_size = 0;
+
+            if (hc_cuLinkComplete (hashcat_ctx, state, &cubin, &cubin_size) == -1) return -1;
+
+            #ifdef DEBUG
+
+            if (hc_cuModuleLoadDataExLog (hashcat_ctx, &device_param->cuda_module_mp, binary) == -1) return -1;

            if (cache_disable == false)
            {
-              const bool rc_write = write_kernel_binary (hashcat_ctx, cached_file, binary, binary_size);
-
-              if (rc_write == false) return -1;
+              if (write_kernel_binary (hashcat_ctx, cached_file, binary, binary_size) == false) return -1;
            }

+            #else
+
+            if (hc_cuModuleLoadDataExLog (hashcat_ctx, &device_param->cuda_module_mp, cubin) == -1) return -1;
+
+            if (cache_disable == false)
+            {
+              if (write_kernel_binary (hashcat_ctx, cached_file, cubin, cubin_size) == false) return -1;
+            }
+
+            #endif
+
            hcfree (binary);
+
+            if (hc_cuLinkDestroy (hashcat_ctx, state) == -1) return -1;
          }

          if (device_param->is_opencl == true)
@ -7836,7 +7991,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
          {
            nvrtcProgram program;

-            if (hc_nvrtcCreateProgram (hashcat_ctx, &program, kernel_sources[0], "mp_kernel", 0, NULL, NULL) == -1) return -1;
+            if (hc_nvrtcCreateProgram (hashcat_ctx, &program, kernel_sources[0], "amp_kernel", 0, NULL, NULL) == -1) return -1;

            char **nvrtc_options = (char **) hccalloc (4 + strlen (build_options_buf) + 1, sizeof (char *)); // ...

@ -7893,7 +8048,25 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)

            if (hc_nvrtcDestroyProgram (hashcat_ctx, &program) == -1) return -1;

-            // tbd: check for some useful options
+            CUlinkState state;
+
+            const int rc_cuLinkCreate = hc_cuLinkCreate (hashcat_ctx, 0, NULL, NULL, &state);
+
+            if (rc_cuLinkCreate == -1) return -1;
+
+            const int rc_cuLinkAddData = hc_cuLinkAddData (hashcat_ctx, state, CU_JIT_INPUT_PTX, binary, binary_size, "kernel_amp", 0, NULL, NULL);
+
+            if (rc_cuLinkAddData == -1) return -1;
+
+            void *cubin = NULL;
+
+            size_t cubin_size = 0;
+
+            const int rc_cuLinkComplete = hc_cuLinkComplete (hashcat_ctx, state, &cubin, &cubin_size);
+
+            if (rc_cuLinkComplete == -1) return -1;
+
+            #ifdef DEBUG

            if (hc_cuModuleLoadDataExLog (hashcat_ctx, &device_param->cuda_module_amp, binary) == -1) return -1;

@ -7902,7 +8075,20 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
              if (write_kernel_binary (hashcat_ctx, cached_file, binary, binary_size) == false) return -1;
            }

+            #else
+
+            if (hc_cuModuleLoadDataExLog (hashcat_ctx, &device_param->cuda_module_amp, cubin) == -1) return -1;
+
+            if (cache_disable == false)
+            {
+              if (write_kernel_binary (hashcat_ctx, cached_file, cubin, cubin_size) == false) return -1;
+            }
+
+            #endif
+
            hcfree (binary);
+
+            if (hc_cuLinkDestroy (hashcat_ctx, state) == -1) return -1;
          }

          if (device_param->is_opencl == true)