1
0
mirror of https://github.com/hashcat/hashcat.git synced 2025-01-08 23:01:14 +00:00

Cache cubin instead of PTX to decrease startup time

This commit is contained in:
Jens Steube 2020-01-29 15:56:36 +01:00
parent cc4fd48ace
commit 66ae5125ce
3 changed files with 247 additions and 13 deletions

View File

@ -75,6 +75,10 @@ int hc_cuStreamDestroy (hashcat_ctx_t *hashcat_ctx, CUstream hStream);
int hc_cuStreamSynchronize (hashcat_ctx_t *hashcat_ctx, CUstream hStream);
int hc_cuCtxPushCurrent (hashcat_ctx_t *hashcat_ctx, CUcontext ctx);
int hc_cuCtxPopCurrent (hashcat_ctx_t *hashcat_ctx, CUcontext *pctx);
// Wrappers around the CUDA driver's cuLink* entry points.
// Each forwards its arguments to the matching driver function and returns 0 on
// CUDA_SUCCESS; on any other result it logs through event_log_error() and returns -1.
int hc_cuLinkCreate (hashcat_ctx_t *hashcat_ctx, unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut);
int hc_cuLinkAddData (hashcat_ctx_t *hashcat_ctx, CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name, unsigned int numOptions, CUjit_option *options, void **optionValues);
int hc_cuLinkDestroy (hashcat_ctx_t *hashcat_ctx, CUlinkState state);
int hc_cuLinkComplete (hashcat_ctx_t *hashcat_ctx, CUlinkState state, void **cubinOut, size_t *sizeOut);
int hc_clBuildProgram (hashcat_ctx_t *hashcat_ctx, cl_program program, cl_uint num_devices, const cl_device_id *device_list, const char *options, void (CL_CALLBACK *pfn_notify) (cl_program program, void *user_data), void *user_data);
int hc_clCreateBuffer (hashcat_ctx_t *hashcat_ctx, cl_context context, cl_mem_flags flags, size_t size, void *host_ptr, cl_mem *mem);

View File

@ -32,6 +32,7 @@ typedef struct CUevent_st *CUevent; /**< CUDA event */
typedef struct CUfunc_st *CUfunction; /**< CUDA function */
typedef struct CUmod_st *CUmodule; /**< CUDA module */
typedef struct CUstream_st *CUstream; /**< CUDA stream */
typedef struct CUlinkState_st *CUlinkState; /**< Opaque link-state handle passed to the cuLink* calls */
typedef enum cudaError_enum {
/**
@ -951,6 +952,41 @@ typedef enum CUevent_flags_enum {
CU_EVENT_INTERPROCESS = 0x4 /**< Event is suitable for interprocess use. CU_EVENT_DISABLE_TIMING must be set */
} CUevent_flags;
/**
 * Kinds of input that can be handed to the device-code linker
 * (the `type` argument of cuLinkAddData).
 */
typedef enum CUjitInputType_enum
{
  CU_JIT_INPUT_CUBIN     = 0, /**< Compiled device-class-specific device code. Applicable options: none */
  CU_JIT_INPUT_PTX       = 1, /**< PTX source code. Applicable options: PTX compiler options */
  CU_JIT_INPUT_FATBINARY = 2, /**< Bundle of multiple cubins and/or PTX of some device code. Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY */
  CU_JIT_INPUT_OBJECT    = 3, /**< Host object with embedded device code. Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY */
  CU_JIT_INPUT_LIBRARY   = 4, /**< Archive of host objects with embedded device code. Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY */

  CU_JIT_NUM_INPUT_TYPES = 5  /**< Count of the input types listed above; not an input type itself */

} CUjitInputType;
#ifdef _WIN32
#define CUDAAPI __stdcall
#else
@ -1012,6 +1048,10 @@ typedef CUresult (CUDA_API_CALL *CUDA_CUSTREAMCREATE) (CUstream *, uns
typedef CUresult (CUDA_API_CALL *CUDA_CUSTREAMDESTROY) (CUstream);
typedef CUresult (CUDA_API_CALL *CUDA_CUSTREAMSYNCHRONIZE) (CUstream);
typedef CUresult (CUDA_API_CALL *CUDA_CUSTREAMWAITEVENT) (CUstream, CUevent, unsigned int);
/*
 * Function-pointer types for the driver's linker entry points.
 * Argument order, as used by the hc_cuLink* wrappers:
 *   CUDA_CULINKCREATE   (numOptions, options, optionValues, stateOut)
 *   CUDA_CULINKADDDATA  (state, type, data, size, name, numOptions, options, optionValues)
 *   CUDA_CULINKDESTROY  (state)
 *   CUDA_CULINKCOMPLETE (state, cubinOut, sizeOut)
 */
typedef CUresult (CUDA_API_CALL *CUDA_CULINKCREATE) (unsigned int, CUjit_option *, void **, CUlinkState *);
typedef CUresult (CUDA_API_CALL *CUDA_CULINKADDDATA) (CUlinkState, CUjitInputType, void *, size_t, const char *, unsigned int, CUjit_option *, void **);
typedef CUresult (CUDA_API_CALL *CUDA_CULINKDESTROY) (CUlinkState);
typedef CUresult (CUDA_API_CALL *CUDA_CULINKCOMPLETE) (CUlinkState, void **, size_t *);
typedef struct hc_cuda_lib
{
@ -1070,6 +1110,10 @@ typedef struct hc_cuda_lib
CUDA_CUSTREAMDESTROY cuStreamDestroy;
CUDA_CUSTREAMSYNCHRONIZE cuStreamSynchronize;
CUDA_CUSTREAMWAITEVENT cuStreamWaitEvent;
CUDA_CULINKCREATE cuLinkCreate;
CUDA_CULINKADDDATA cuLinkAddData;
CUDA_CULINKDESTROY cuLinkDestroy;
CUDA_CULINKCOMPLETE cuLinkComplete;
} hc_cuda_lib_t;

View File

@ -998,6 +998,10 @@ int cuda_init (hashcat_ctx_t *hashcat_ctx)
HC_LOAD_FUNC_CUDA (cuda, cuStreamDestroy, cuStreamDestroy_v2, CUDA_CUSTREAMDESTROY, CUDA, 1);
HC_LOAD_FUNC_CUDA (cuda, cuStreamSynchronize, cuStreamSynchronize, CUDA_CUSTREAMSYNCHRONIZE, CUDA, 1);
HC_LOAD_FUNC_CUDA (cuda, cuStreamWaitEvent, cuStreamWaitEvent, CUDA_CUSTREAMWAITEVENT, CUDA, 1);
HC_LOAD_FUNC_CUDA (cuda, cuLinkCreate, cuLinkCreate_v2, CUDA_CULINKCREATE, CUDA, 1);
HC_LOAD_FUNC_CUDA (cuda, cuLinkAddData, cuLinkAddData_v2, CUDA_CULINKADDDATA, CUDA, 1);
HC_LOAD_FUNC_CUDA (cuda, cuLinkDestroy, cuLinkDestroy, CUDA_CULINKDESTROY, CUDA, 1);
HC_LOAD_FUNC_CUDA (cuda, cuLinkComplete, cuLinkComplete, CUDA_CULINKCOMPLETE, CUDA, 1);
return 0;
}
@ -2040,6 +2044,113 @@ int hc_cuCtxPopCurrent (hashcat_ctx_t *hashcat_ctx, CUcontext *pctx)
return 0;
}
/**
 * Calls the cuLinkCreate function pointer stored in the backend's CUDA
 * function table, forwarding all arguments unchanged.
 *
 * @param hashcat_ctx   context providing backend_ctx->cuda and the error log
 * @param numOptions    forwarded to cuLinkCreate
 * @param options       forwarded to cuLinkCreate
 * @param optionValues  forwarded to cuLinkCreate
 * @param stateOut      forwarded to cuLinkCreate
 *
 * @return 0 if the driver reports CUDA_SUCCESS, otherwise -1 after logging
 *         either the driver's error string or, if that lookup fails, the
 *         numeric result code.
 */
int hc_cuLinkCreate (hashcat_ctx_t *hashcat_ctx, unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut)
{
  CUDA_PTR *cuda = (CUDA_PTR *) hashcat_ctx->backend_ctx->cuda;

  const CUresult rc = cuda->cuLinkCreate (numOptions, options, optionValues, stateOut);

  if (rc == CUDA_SUCCESS) return 0;

  // failure path: prefer the human-readable message, fall back to the raw code
  const char *err_text = NULL;

  if (cuda->cuGetErrorString (rc, &err_text) != CUDA_SUCCESS)
  {
    event_log_error (hashcat_ctx, "cuLinkCreate(): %d", rc);
  }
  else
  {
    event_log_error (hashcat_ctx, "cuLinkCreate(): %s", err_text);
  }

  return -1;
}
/**
 * Calls the cuLinkAddData function pointer stored in the backend's CUDA
 * function table, forwarding all arguments unchanged.
 *
 * @param hashcat_ctx   context providing backend_ctx->cuda and the error log
 * @param state         forwarded to cuLinkAddData
 * @param type          forwarded to cuLinkAddData (a CUjitInputType value)
 * @param data          forwarded to cuLinkAddData
 * @param size          forwarded to cuLinkAddData
 * @param name          forwarded to cuLinkAddData
 * @param numOptions    forwarded to cuLinkAddData
 * @param options       forwarded to cuLinkAddData
 * @param optionValues  forwarded to cuLinkAddData
 *
 * @return 0 if the driver reports CUDA_SUCCESS, otherwise -1 after logging
 *         either the driver's error string or, if that lookup fails, the
 *         numeric result code.
 */
int hc_cuLinkAddData (hashcat_ctx_t *hashcat_ctx, CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name, unsigned int numOptions, CUjit_option *options, void **optionValues)
{
  CUDA_PTR *cuda = (CUDA_PTR *) hashcat_ctx->backend_ctx->cuda;

  const CUresult rc = cuda->cuLinkAddData (state, type, data, size, name, numOptions, options, optionValues);

  if (rc == CUDA_SUCCESS) return 0;

  // failure path: prefer the human-readable message, fall back to the raw code
  const char *err_text = NULL;

  if (cuda->cuGetErrorString (rc, &err_text) != CUDA_SUCCESS)
  {
    event_log_error (hashcat_ctx, "cuLinkAddData(): %d", rc);
  }
  else
  {
    event_log_error (hashcat_ctx, "cuLinkAddData(): %s", err_text);
  }

  return -1;
}
/**
 * Calls the cuLinkDestroy function pointer stored in the backend's CUDA
 * function table for the given link state.
 *
 * @param hashcat_ctx  context providing backend_ctx->cuda and the error log
 * @param state        forwarded to cuLinkDestroy
 *
 * @return 0 if the driver reports CUDA_SUCCESS, otherwise -1 after logging
 *         either the driver's error string or, if that lookup fails, the
 *         numeric result code.
 */
int hc_cuLinkDestroy (hashcat_ctx_t *hashcat_ctx, CUlinkState state)
{
  CUDA_PTR *cuda = (CUDA_PTR *) hashcat_ctx->backend_ctx->cuda;

  const CUresult rc = cuda->cuLinkDestroy (state);

  if (rc == CUDA_SUCCESS) return 0;

  // failure path: prefer the human-readable message, fall back to the raw code
  const char *err_text = NULL;

  if (cuda->cuGetErrorString (rc, &err_text) != CUDA_SUCCESS)
  {
    event_log_error (hashcat_ctx, "cuLinkDestroy(): %d", rc);
  }
  else
  {
    event_log_error (hashcat_ctx, "cuLinkDestroy(): %s", err_text);
  }

  return -1;
}
/**
 * Calls the cuLinkComplete function pointer stored in the backend's CUDA
 * function table, forwarding all arguments unchanged.
 *
 * @param hashcat_ctx  context providing backend_ctx->cuda and the error log
 * @param state        forwarded to cuLinkComplete
 * @param cubinOut     forwarded to cuLinkComplete
 * @param sizeOut      forwarded to cuLinkComplete
 *
 * @return 0 if the driver reports CUDA_SUCCESS, otherwise -1 after logging
 *         either the driver's error string or, if that lookup fails, the
 *         numeric result code.
 */
int hc_cuLinkComplete (hashcat_ctx_t *hashcat_ctx, CUlinkState state, void **cubinOut, size_t *sizeOut)
{
  CUDA_PTR *cuda = (CUDA_PTR *) hashcat_ctx->backend_ctx->cuda;

  const CUresult rc = cuda->cuLinkComplete (state, cubinOut, sizeOut);

  if (rc == CUDA_SUCCESS) return 0;

  // failure path: prefer the human-readable message, fall back to the raw code
  const char *err_text = NULL;

  if (cuda->cuGetErrorString (rc, &err_text) != CUDA_SUCCESS)
  {
    event_log_error (hashcat_ctx, "cuLinkComplete(): %d", rc);
  }
  else
  {
    event_log_error (hashcat_ctx, "cuLinkComplete(): %s", err_text);
  }

  return -1;
}
// OpenCL
@ -7438,18 +7549,41 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
if (hc_nvrtcDestroyProgram (hashcat_ctx, &program) == -1) return -1;
const int rc_cuModuleLoadDataEx = hc_cuModuleLoadDataExLog (hashcat_ctx, &device_param->cuda_module, binary);
CUlinkState state;
if (rc_cuModuleLoadDataEx == -1) return -1;
if (hc_cuLinkCreate (hashcat_ctx, 0, NULL, NULL, &state) == -1) return -1;
if (hc_cuLinkAddData (hashcat_ctx, state, CU_JIT_INPUT_PTX, binary, binary_size, "kernel", 0, NULL, NULL) == -1) return -1;
void *cubin = NULL;
size_t cubin_size = 0;
if (hc_cuLinkComplete (hashcat_ctx, state, &cubin, &cubin_size) == -1) return -1;
#ifdef DEBUG
if (hc_cuModuleLoadDataExLog (hashcat_ctx, &device_param->cuda_module, binary) == -1) return -1;
if (cache_disable == false)
{
const bool rc_write = write_kernel_binary (hashcat_ctx, cached_file, binary, binary_size);
if (rc_write == false) return -1;
if (write_kernel_binary (hashcat_ctx, cached_file, binary, binary_size) == false) return -1;
}
#else
if (hc_cuModuleLoadDataExLog (hashcat_ctx, &device_param->cuda_module, cubin) == -1) return -1;
if (cache_disable == false)
{
if (write_kernel_binary (hashcat_ctx, cached_file, cubin, cubin_size) == false) return -1;
}
#endif
hcfree (binary);
if (hc_cuLinkDestroy (hashcat_ctx, state) == -1) return -1;
}
if (device_param->is_opencl == true)
@ -7662,20 +7796,41 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
if (hc_nvrtcDestroyProgram (hashcat_ctx, &program) == -1) return -1;
// tbd: check for some useful options
CUlinkState state;
const int rc_cuModuleLoadDataEx = hc_cuModuleLoadDataExLog (hashcat_ctx, &device_param->cuda_module_mp, binary);
if (hc_cuLinkCreate (hashcat_ctx, 0, NULL, NULL, &state) == -1) return -1;
if (rc_cuModuleLoadDataEx == -1) return -1;
if (hc_cuLinkAddData (hashcat_ctx, state, CU_JIT_INPUT_PTX, binary, binary_size, "mp_kernel", 0, NULL, NULL) == -1) return -1;
void *cubin = NULL;
size_t cubin_size = 0;
if (hc_cuLinkComplete (hashcat_ctx, state, &cubin, &cubin_size) == -1) return -1;
#ifdef DEBUG
if (hc_cuModuleLoadDataExLog (hashcat_ctx, &device_param->cuda_module_mp, binary) == -1) return -1;
if (cache_disable == false)
{
const bool rc_write = write_kernel_binary (hashcat_ctx, cached_file, binary, binary_size);
if (rc_write == false) return -1;
if (write_kernel_binary (hashcat_ctx, cached_file, binary, binary_size) == false) return -1;
}
#else
if (hc_cuModuleLoadDataExLog (hashcat_ctx, &device_param->cuda_module_mp, cubin) == -1) return -1;
if (cache_disable == false)
{
if (write_kernel_binary (hashcat_ctx, cached_file, cubin, cubin_size) == false) return -1;
}
#endif
hcfree (binary);
if (hc_cuLinkDestroy (hashcat_ctx, state) == -1) return -1;
}
if (device_param->is_opencl == true)
@ -7836,7 +7991,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
{
nvrtcProgram program;
if (hc_nvrtcCreateProgram (hashcat_ctx, &program, kernel_sources[0], "mp_kernel", 0, NULL, NULL) == -1) return -1;
if (hc_nvrtcCreateProgram (hashcat_ctx, &program, kernel_sources[0], "amp_kernel", 0, NULL, NULL) == -1) return -1;
char **nvrtc_options = (char **) hccalloc (4 + strlen (build_options_buf) + 1, sizeof (char *)); // ...
@ -7893,7 +8048,25 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
if (hc_nvrtcDestroyProgram (hashcat_ctx, &program) == -1) return -1;
// tbd: check for some useful options
CUlinkState state;
const int rc_cuLinkCreate = hc_cuLinkCreate (hashcat_ctx, 0, NULL, NULL, &state);
if (rc_cuLinkCreate == -1) return -1;
const int rc_cuLinkAddData = hc_cuLinkAddData (hashcat_ctx, state, CU_JIT_INPUT_PTX, binary, binary_size, "kernel_amp", 0, NULL, NULL);
if (rc_cuLinkAddData == -1) return -1;
void *cubin = NULL;
size_t cubin_size = 0;
const int rc_cuLinkComplete = hc_cuLinkComplete (hashcat_ctx, state, &cubin, &cubin_size);
if (rc_cuLinkComplete == -1) return -1;
#ifdef DEBUG
if (hc_cuModuleLoadDataExLog (hashcat_ctx, &device_param->cuda_module_amp, binary) == -1) return -1;
@ -7902,7 +8075,20 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
if (write_kernel_binary (hashcat_ctx, cached_file, binary, binary_size) == false) return -1;
}
#else
if (hc_cuModuleLoadDataExLog (hashcat_ctx, &device_param->cuda_module_amp, cubin) == -1) return -1;
if (cache_disable == false)
{
if (write_kernel_binary (hashcat_ctx, cached_file, cubin, cubin_size) == false) return -1;
}
#endif
hcfree (binary);
if (hc_cuLinkDestroy (hashcat_ctx, state) == -1) return -1;
}
if (device_param->is_opencl == true)