mirror of
https://github.com/hashcat/hashcat.git
synced 2025-01-08 23:01:14 +00:00
Add async CUDA memcpy functions: hc_cuMemcpyDtoDAsync(), hc_cuMemcpyDtoHAsync() and hc_cuMemcpyHtoDAsync(). Implement partially async CUDA memset and bzero kernels.
This commit is contained in:
parent
f07ff6f03d
commit
4263cafdcf
@ -69,8 +69,11 @@ int hc_cuInit (hashcat_ctx_t *hashcat_ctx, unsigned int Flags
|
||||
int hc_cuLaunchKernel (hashcat_ctx_t *hashcat_ctx, CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra);
|
||||
int hc_cuMemAlloc (hashcat_ctx_t *hashcat_ctx, CUdeviceptr *dptr, size_t bytesize);
|
||||
int hc_cuMemcpyDtoD (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount);
|
||||
int hc_cuMemcpyDtoDAsync (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
|
||||
int hc_cuMemcpyDtoH (hashcat_ctx_t *hashcat_ctx, void *dstHost, CUdeviceptr srcDevice, size_t ByteCount);
|
||||
int hc_cuMemcpyDtoHAsync (hashcat_ctx_t *hashcat_ctx, void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
|
||||
int hc_cuMemcpyHtoD (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount);
|
||||
int hc_cuMemcpyHtoDAsync (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream);
|
||||
int hc_cuMemFree (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dptr);
|
||||
int hc_cuModuleGetFunction (hashcat_ctx_t *hashcat_ctx, CUfunction *hfunc, CUmodule hmod, const char *name);
|
||||
int hc_cuModuleLoadDataEx (hashcat_ctx_t *hashcat_ctx, CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues);
|
||||
|
@ -1029,8 +1029,11 @@ typedef CUresult (CUDA_API_CALL *CUDA_CULAUNCHKERNEL) (CUfunction, uns
|
||||
typedef CUresult (CUDA_API_CALL *CUDA_CUMEMALLOC) (CUdeviceptr *, size_t);
|
||||
typedef CUresult (CUDA_API_CALL *CUDA_CUMEMALLOCHOST) (void **, size_t);
|
||||
typedef CUresult (CUDA_API_CALL *CUDA_CUMEMCPYDTOD) (CUdeviceptr, CUdeviceptr, size_t);
|
||||
typedef CUresult (CUDA_API_CALL *CUDA_CUMEMCPYDTODASYNC) (CUdeviceptr, CUdeviceptr, size_t, CUstream);
|
||||
typedef CUresult (CUDA_API_CALL *CUDA_CUMEMCPYDTOH) (void *, CUdeviceptr, size_t);
|
||||
typedef CUresult (CUDA_API_CALL *CUDA_CUMEMCPYDTOHASYNC) (void *, CUdeviceptr, size_t, CUstream);
|
||||
typedef CUresult (CUDA_API_CALL *CUDA_CUMEMCPYHTOD) (CUdeviceptr, const void *, size_t);
|
||||
typedef CUresult (CUDA_API_CALL *CUDA_CUMEMCPYHTODASYNC) (CUdeviceptr, const void *, size_t, CUstream);
|
||||
typedef CUresult (CUDA_API_CALL *CUDA_CUMEMFREE) (CUdeviceptr);
|
||||
typedef CUresult (CUDA_API_CALL *CUDA_CUMEMFREEHOST) (void *);
|
||||
typedef CUresult (CUDA_API_CALL *CUDA_CUMEMGETINFO) (size_t *, size_t *);
|
||||
@ -1091,8 +1094,11 @@ typedef struct hc_cuda_lib
|
||||
CUDA_CUMEMALLOC cuMemAlloc;
|
||||
CUDA_CUMEMALLOCHOST cuMemAllocHost;
|
||||
CUDA_CUMEMCPYDTOD cuMemcpyDtoD;
|
||||
CUDA_CUMEMCPYDTODASYNC cuMemcpyDtoDAsync;
|
||||
CUDA_CUMEMCPYDTOH cuMemcpyDtoH;
|
||||
CUDA_CUMEMCPYDTOHASYNC cuMemcpyDtoHAsync;
|
||||
CUDA_CUMEMCPYHTOD cuMemcpyHtoD;
|
||||
CUDA_CUMEMCPYHTODASYNC cuMemcpyHtoDAsync;
|
||||
CUDA_CUMEMFREE cuMemFree;
|
||||
CUDA_CUMEMFREEHOST cuMemFreeHost;
|
||||
CUDA_CUMEMGETINFO cuMemGetInfo;
|
||||
|
108
src/backend.c
108
src/backend.c
@ -1255,8 +1255,11 @@ int cuda_init (hashcat_ctx_t *hashcat_ctx)
|
||||
HC_LOAD_FUNC_CUDA (cuda, cuMemAlloc, cuMemAlloc_v2, CUDA_CUMEMALLOC, CUDA, 1);
|
||||
HC_LOAD_FUNC_CUDA (cuda, cuMemAllocHost, cuMemAllocHost_v2, CUDA_CUMEMALLOCHOST, CUDA, 1);
|
||||
HC_LOAD_FUNC_CUDA (cuda, cuMemcpyDtoD, cuMemcpyDtoD_v2, CUDA_CUMEMCPYDTOD, CUDA, 1);
|
||||
HC_LOAD_FUNC_CUDA (cuda, cuMemcpyDtoDAsync, cuMemcpyDtoDAsync_v2, CUDA_CUMEMCPYDTODASYNC, CUDA, 1);
|
||||
HC_LOAD_FUNC_CUDA (cuda, cuMemcpyDtoH, cuMemcpyDtoH_v2, CUDA_CUMEMCPYDTOH, CUDA, 1);
|
||||
HC_LOAD_FUNC_CUDA (cuda, cuMemcpyDtoHAsync, cuMemcpyDtoHAsync_v2, CUDA_CUMEMCPYDTOHASYNC, CUDA, 1);
|
||||
HC_LOAD_FUNC_CUDA (cuda, cuMemcpyHtoD, cuMemcpyHtoD_v2, CUDA_CUMEMCPYHTOD, CUDA, 1);
|
||||
HC_LOAD_FUNC_CUDA (cuda, cuMemcpyHtoDAsync, cuMemcpyHtoDAsync_v2, CUDA_CUMEMCPYHTODASYNC, CUDA, 1);
|
||||
HC_LOAD_FUNC_CUDA (cuda, cuMemFree, cuMemFree_v2, CUDA_CUMEMFREE, CUDA, 1);
|
||||
HC_LOAD_FUNC_CUDA (cuda, cuMemFreeHost, cuMemFreeHost, CUDA_CUMEMFREEHOST, CUDA, 1);
|
||||
HC_LOAD_FUNC_CUDA (cuda, cuMemGetInfo, cuMemGetInfo_v2, CUDA_CUMEMGETINFO, CUDA, 1);
|
||||
@ -1708,6 +1711,33 @@ int hc_cuMemcpyDtoH (hashcat_ctx_t *hashcat_ctx, void *dstHost, CUdeviceptr srcD
|
||||
return 0;
|
||||
}
|
||||
|
||||
int hc_cuMemcpyDtoHAsync (hashcat_ctx_t *hashcat_ctx, void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream)
|
||||
{
|
||||
backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
|
||||
|
||||
CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda;
|
||||
|
||||
const CUresult CU_err = cuda->cuMemcpyDtoHAsync (dstHost, srcDevice, ByteCount, hStream);
|
||||
|
||||
if (CU_err != CUDA_SUCCESS)
|
||||
{
|
||||
const char *pStr = NULL;
|
||||
|
||||
if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS)
|
||||
{
|
||||
event_log_error (hashcat_ctx, "cuMemcpyDtoHAsync(): %s", pStr);
|
||||
}
|
||||
else
|
||||
{
|
||||
event_log_error (hashcat_ctx, "cuMemcpyDtoHAsync(): %d", CU_err);
|
||||
}
|
||||
|
||||
return -1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int hc_cuMemcpyDtoD (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount)
|
||||
{
|
||||
backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
|
||||
@ -1735,6 +1765,33 @@ int hc_cuMemcpyDtoD (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dstDevice, CUdevice
|
||||
return 0;
|
||||
}
|
||||
|
||||
int hc_cuMemcpyDtoDAsync (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream)
|
||||
{
|
||||
backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
|
||||
|
||||
CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda;
|
||||
|
||||
const CUresult CU_err = cuda->cuMemcpyDtoDAsync (dstDevice, srcDevice, ByteCount, hStream);
|
||||
|
||||
if (CU_err != CUDA_SUCCESS)
|
||||
{
|
||||
const char *pStr = NULL;
|
||||
|
||||
if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS)
|
||||
{
|
||||
event_log_error (hashcat_ctx, "cuMemcpyDtoDAsync(): %s", pStr);
|
||||
}
|
||||
else
|
||||
{
|
||||
event_log_error (hashcat_ctx, "cuMemcpyDtoDAsync(): %d", CU_err);
|
||||
}
|
||||
|
||||
return -1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int hc_cuMemcpyHtoD (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount)
|
||||
{
|
||||
backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
|
||||
@ -1762,6 +1819,33 @@ int hc_cuMemcpyHtoD (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dstDevice, const vo
|
||||
return 0;
|
||||
}
|
||||
|
||||
int hc_cuMemcpyHtoDAsync (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream)
|
||||
{
|
||||
backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
|
||||
|
||||
CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda;
|
||||
|
||||
const CUresult CU_err = cuda->cuMemcpyHtoDAsync (dstDevice, srcHost, ByteCount, hStream);
|
||||
|
||||
if (CU_err != CUDA_SUCCESS)
|
||||
{
|
||||
const char *pStr = NULL;
|
||||
|
||||
if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS)
|
||||
{
|
||||
event_log_error (hashcat_ctx, "cuMemcpyHtoDAsync(): %s", pStr);
|
||||
}
|
||||
else
|
||||
{
|
||||
event_log_error (hashcat_ctx, "cuMemcpyHtoDAsync(): %d", CU_err);
|
||||
}
|
||||
|
||||
return -1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int hc_cuModuleGetFunction (hashcat_ctx_t *hashcat_ctx, CUfunction *hfunc, CUmodule hmod, const char *name)
|
||||
{
|
||||
backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
|
||||
@ -4878,7 +4962,7 @@ int choose_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param,
|
||||
|
||||
if (device_param->is_cuda == true)
|
||||
{
|
||||
if (run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_tmps, device_param->size_tmps) == -1) return -1;
|
||||
if (run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_tmps, device_param->size_tmps) == -1) return -1;
|
||||
}
|
||||
|
||||
if (device_param->is_hip == true)
|
||||
@ -4896,7 +4980,7 @@ int choose_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param,
|
||||
{
|
||||
if (device_param->is_cuda == true)
|
||||
{
|
||||
if (run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_hooks, pws_cnt * hashconfig->hook_size) == -1) return -1;
|
||||
if (run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_hooks, pws_cnt * hashconfig->hook_size) == -1) return -1;
|
||||
}
|
||||
|
||||
if (device_param->is_hip == true)
|
||||
@ -5009,6 +5093,7 @@ int run_cuda_kernel_memset (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *devic
|
||||
{
|
||||
const u64 num16d = size / 16;
|
||||
const u64 num16m = size % 16;
|
||||
u32 tmp[4];
|
||||
|
||||
if (num16d)
|
||||
{
|
||||
@ -5023,24 +5108,20 @@ int run_cuda_kernel_memset (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *devic
|
||||
CUfunction function = device_param->cuda_function_memset;
|
||||
|
||||
if (hc_cuLaunchKernel (hashcat_ctx, function, num_elements, 1, 1, kernel_threads, 1, 1, 0, device_param->cuda_stream, device_param->kernel_params_memset, NULL) == -1) return -1;
|
||||
|
||||
if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1;
|
||||
}
|
||||
|
||||
if (num16m)
|
||||
{
|
||||
u32 tmp[4];
|
||||
|
||||
tmp[0] = value;
|
||||
tmp[1] = value;
|
||||
tmp[2] = value;
|
||||
tmp[3] = value;
|
||||
|
||||
// Apparently are allowed to do this: https://devtalk.nvidia.com/default/topic/761515/how-to-copy-to-device-memory-with-offset-/
|
||||
|
||||
if (hc_cuMemcpyHtoD (hashcat_ctx, buf + (num16d * 16), tmp, num16m) == -1) return -1;
|
||||
if (hc_cuMemcpyHtoDAsync (hashcat_ctx, buf + (num16d * 16), tmp, num16m, device_param->cuda_stream) == -1) return -1;
|
||||
}
|
||||
|
||||
if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -5048,6 +5129,7 @@ int run_cuda_kernel_bzero (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device
|
||||
{
|
||||
const u64 num16d = size / 16;
|
||||
const u64 num16m = size % 16;
|
||||
u32 tmp[4];
|
||||
|
||||
if (num16d)
|
||||
{
|
||||
@ -5061,22 +5143,20 @@ int run_cuda_kernel_bzero (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device
|
||||
CUfunction function = device_param->cuda_function_bzero;
|
||||
|
||||
if (hc_cuLaunchKernel (hashcat_ctx, function, num_elements, 1, 1, kernel_threads, 1, 1, 0, device_param->cuda_stream, device_param->kernel_params_bzero, NULL) == -1) return -1;
|
||||
|
||||
if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1;
|
||||
}
|
||||
|
||||
if (num16m)
|
||||
{
|
||||
u32 tmp[4];
|
||||
|
||||
tmp[0] = 0;
|
||||
tmp[1] = 0;
|
||||
tmp[2] = 0;
|
||||
tmp[3] = 0;
|
||||
|
||||
if (hc_cuMemcpyHtoD (hashcat_ctx, buf + (num16d * 16), tmp, num16m) == -1) return -1;
|
||||
if (hc_cuMemcpyHtoDAsync (hashcat_ctx, buf + (num16d * 16), tmp, num16m, device_param->cuda_stream) == -1) return -1;
|
||||
}
|
||||
|
||||
if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user