diff --git a/include/backend.h b/include/backend.h index a3d997ca8..957ac229d 100644 --- a/include/backend.h +++ b/include/backend.h @@ -69,8 +69,11 @@ int hc_cuInit (hashcat_ctx_t *hashcat_ctx, unsigned int Flags int hc_cuLaunchKernel (hashcat_ctx_t *hashcat_ctx, CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra); int hc_cuMemAlloc (hashcat_ctx_t *hashcat_ctx, CUdeviceptr *dptr, size_t bytesize); int hc_cuMemcpyDtoD (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount); +int hc_cuMemcpyDtoDAsync (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); int hc_cuMemcpyDtoH (hashcat_ctx_t *hashcat_ctx, void *dstHost, CUdeviceptr srcDevice, size_t ByteCount); +int hc_cuMemcpyDtoHAsync (hashcat_ctx_t *hashcat_ctx, void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); int hc_cuMemcpyHtoD (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount); +int hc_cuMemcpyHtoDAsync (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream); int hc_cuMemFree (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dptr); int hc_cuModuleGetFunction (hashcat_ctx_t *hashcat_ctx, CUfunction *hfunc, CUmodule hmod, const char *name); int hc_cuModuleLoadDataEx (hashcat_ctx_t *hashcat_ctx, CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues); diff --git a/include/ext_cuda.h b/include/ext_cuda.h index 0e3619fc4..8048c729f 100644 --- a/include/ext_cuda.h +++ b/include/ext_cuda.h @@ -1029,8 +1029,11 @@ typedef CUresult (CUDA_API_CALL *CUDA_CULAUNCHKERNEL) (CUfunction, uns typedef CUresult (CUDA_API_CALL *CUDA_CUMEMALLOC) (CUdeviceptr *, size_t); typedef CUresult (CUDA_API_CALL *CUDA_CUMEMALLOCHOST) (void **, size_t); typedef CUresult (CUDA_API_CALL *CUDA_CUMEMCPYDTOD) (CUdeviceptr, CUdeviceptr, size_t); +typedef CUresult (CUDA_API_CALL *CUDA_CUMEMCPYDTODASYNC) (CUdeviceptr, CUdeviceptr, size_t, CUstream); typedef CUresult (CUDA_API_CALL *CUDA_CUMEMCPYDTOH) (void *, CUdeviceptr, size_t); +typedef CUresult (CUDA_API_CALL *CUDA_CUMEMCPYDTOHASYNC) (void *, CUdeviceptr, size_t, CUstream); typedef CUresult (CUDA_API_CALL *CUDA_CUMEMCPYHTOD) (CUdeviceptr, const void *, size_t); +typedef CUresult (CUDA_API_CALL *CUDA_CUMEMCPYHTODASYNC) (CUdeviceptr, const void *, size_t, CUstream); typedef CUresult (CUDA_API_CALL *CUDA_CUMEMFREE) (CUdeviceptr); typedef CUresult (CUDA_API_CALL *CUDA_CUMEMFREEHOST) (void *); typedef CUresult (CUDA_API_CALL *CUDA_CUMEMGETINFO) (size_t *, size_t *); @@ -1091,8 +1094,11 @@ typedef struct hc_cuda_lib CUDA_CUMEMALLOC cuMemAlloc; CUDA_CUMEMALLOCHOST cuMemAllocHost; CUDA_CUMEMCPYDTOD cuMemcpyDtoD; + CUDA_CUMEMCPYDTODASYNC cuMemcpyDtoDAsync; CUDA_CUMEMCPYDTOH cuMemcpyDtoH; + CUDA_CUMEMCPYDTOHASYNC cuMemcpyDtoHAsync; CUDA_CUMEMCPYHTOD cuMemcpyHtoD; + CUDA_CUMEMCPYHTODASYNC cuMemcpyHtoDAsync; CUDA_CUMEMFREE cuMemFree; CUDA_CUMEMFREEHOST cuMemFreeHost; CUDA_CUMEMGETINFO cuMemGetInfo; diff --git a/src/backend.c b/src/backend.c index cc7d53e2b..0277b2d5f 100644 --- a/src/backend.c +++ b/src/backend.c @@ -1255,8 +1255,11 @@ int cuda_init (hashcat_ctx_t *hashcat_ctx) HC_LOAD_FUNC_CUDA (cuda, cuMemAlloc, cuMemAlloc_v2, CUDA_CUMEMALLOC, CUDA, 1); HC_LOAD_FUNC_CUDA (cuda, cuMemAllocHost, cuMemAllocHost_v2, CUDA_CUMEMALLOCHOST, CUDA, 1); HC_LOAD_FUNC_CUDA (cuda, cuMemcpyDtoD, cuMemcpyDtoD_v2, CUDA_CUMEMCPYDTOD, CUDA, 1); + HC_LOAD_FUNC_CUDA (cuda, cuMemcpyDtoDAsync, cuMemcpyDtoDAsync_v2, CUDA_CUMEMCPYDTODASYNC, CUDA, 1); HC_LOAD_FUNC_CUDA (cuda, cuMemcpyDtoH, cuMemcpyDtoH_v2, CUDA_CUMEMCPYDTOH, CUDA, 1); + HC_LOAD_FUNC_CUDA (cuda, cuMemcpyDtoHAsync, cuMemcpyDtoHAsync_v2, CUDA_CUMEMCPYDTOHASYNC, CUDA, 1); HC_LOAD_FUNC_CUDA (cuda, cuMemcpyHtoD, cuMemcpyHtoD_v2, CUDA_CUMEMCPYHTOD, CUDA, 1); + HC_LOAD_FUNC_CUDA (cuda, cuMemcpyHtoDAsync, cuMemcpyHtoDAsync_v2, CUDA_CUMEMCPYHTODASYNC, CUDA, 1); HC_LOAD_FUNC_CUDA (cuda, cuMemFree, cuMemFree_v2, CUDA_CUMEMFREE, CUDA, 1); HC_LOAD_FUNC_CUDA (cuda, cuMemFreeHost, cuMemFreeHost, CUDA_CUMEMFREEHOST, CUDA, 1); HC_LOAD_FUNC_CUDA (cuda, cuMemGetInfo, cuMemGetInfo_v2, CUDA_CUMEMGETINFO, CUDA, 1); @@ -1708,6 +1711,33 @@ int hc_cuMemcpyDtoH (hashcat_ctx_t *hashcat_ctx, void *dstHost, CUdeviceptr srcD return 0; } +int hc_cuMemcpyDtoHAsync (hashcat_ctx_t *hashcat_ctx, void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream) +{ + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; + + CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; + + const CUresult CU_err = cuda->cuMemcpyDtoHAsync (dstHost, srcDevice, ByteCount, hStream); + + if (CU_err != CUDA_SUCCESS) + { + const char *pStr = NULL; + + if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + { + event_log_error (hashcat_ctx, "cuMemcpyDtoHAsync(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "cuMemcpyDtoHAsync(): %d", CU_err); + } + + return -1; + } + + return 0; +} + int hc_cuMemcpyDtoD (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; @@ -1735,6 +1765,33 @@ int hc_cuMemcpyDtoD (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dstDevice, CUdevice return 0; } +int hc_cuMemcpyDtoDAsync (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream) +{ + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; + + CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; + + const CUresult CU_err = cuda->cuMemcpyDtoDAsync (dstDevice, srcDevice, ByteCount, hStream); + + if (CU_err != CUDA_SUCCESS) + { + const char *pStr = NULL; + + if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + { + event_log_error (hashcat_ctx, "cuMemcpyDtoDAsync(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "cuMemcpyDtoDAsync(): %d", CU_err); + } + + return -1; + } + + return 0; +} + int hc_cuMemcpyHtoD (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; @@ -1762,6 +1819,33 @@ int hc_cuMemcpyHtoD (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dstDevice, const vo return 0; } +int hc_cuMemcpyHtoDAsync (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream) +{ + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; + + CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; + + const CUresult CU_err = cuda->cuMemcpyHtoDAsync (dstDevice, srcHost, ByteCount, hStream); + + if (CU_err != CUDA_SUCCESS) + { + const char *pStr = NULL; + + if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + { + event_log_error (hashcat_ctx, "cuMemcpyHtoDAsync(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "cuMemcpyHtoDAsync(): %d", CU_err); + } + + return -1; + } + + return 0; +} + int hc_cuModuleGetFunction (hashcat_ctx_t *hashcat_ctx, CUfunction *hfunc, CUmodule hmod, const char *name) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; @@ -4878,7 +4962,7 @@ int choose_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, if (device_param->is_cuda == true) { - if (run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_tmps, device_param->size_tmps) == -1) return -1; + if (run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_tmps, device_param->size_tmps) == -1) return -1; } if (device_param->is_hip == true) @@ -4896,7 +4980,7 @@ int choose_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, { if (device_param->is_cuda == true) { - if (run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_hooks, pws_cnt * hashconfig->hook_size) == -1) return -1; + if (run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_hooks, pws_cnt * hashconfig->hook_size) == -1) return -1; } if (device_param->is_hip == true) @@ -5009,6 +5093,7 @@ int run_cuda_kernel_memset (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *devic { const u64 num16d = size / 16; const u64 num16m = size % 16; + u32 tmp[4]; if (num16d) { @@ -5023,24 +5108,20 @@ int run_cuda_kernel_memset (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *devic CUfunction function = device_param->cuda_function_memset; if (hc_cuLaunchKernel (hashcat_ctx, function, num_elements, 1, 1, kernel_threads, 1, 1, 0, device_param->cuda_stream, device_param->kernel_params_memset, NULL) == -1) return -1; - - if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1; } if (num16m) { - u32 tmp[4]; - tmp[0] = value; tmp[1] = value; tmp[2] = value; tmp[3] = value; - // Apparently are allowed to do this: https://devtalk.nvidia.com/default/topic/761515/how-to-copy-to-device-memory-with-offset-/ - - if (hc_cuMemcpyHtoD (hashcat_ctx, buf + (num16d * 16), tmp, num16m) == -1) return -1; + if (hc_cuMemcpyHtoDAsync (hashcat_ctx, buf + (num16d * 16), tmp, num16m, device_param->cuda_stream) == -1) return -1; } + if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1; + return 0; } @@ -5048,6 +5129,7 @@ int run_cuda_kernel_bzero (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device { const u64 num16d = size / 16; const u64 num16m = size % 16; + u32 tmp[4]; if (num16d) { @@ -5061,22 +5143,20 @@ int run_cuda_kernel_bzero (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device CUfunction function = device_param->cuda_function_bzero; if (hc_cuLaunchKernel (hashcat_ctx, function, num_elements, 1, 1, kernel_threads, 1, 1, 0, device_param->cuda_stream, device_param->kernel_params_bzero, NULL) == -1) return -1; - - if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1; } if (num16m) { - u32 tmp[4]; - tmp[0] = 0; tmp[1] = 0; tmp[2] = 0; tmp[3] = 0; - if (hc_cuMemcpyHtoD (hashcat_ctx, buf + (num16d * 16), tmp, num16m) == -1) return -1; + if (hc_cuMemcpyHtoDAsync (hashcat_ctx, buf + (num16d * 16), tmp, num16m, device_param->cuda_stream) == -1) return -1; } + if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1; + return 0; }