Add async CUDA memcpy functions: hc_cuMemcpyDtoDAsync(), hc_cuMemcpyDtoHAsync() and hc_cuMemcpyHtoDAsync(). Implement partially async CUDA memset and bzero kernels.

pull/2879/head
Jukka Ojanen 3 years ago
parent f07ff6f03d
commit 4263cafdcf

@@ -69,8 +69,11 @@ int hc_cuInit (hashcat_ctx_t *hashcat_ctx, unsigned int Flags
int hc_cuLaunchKernel (hashcat_ctx_t *hashcat_ctx, CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra);
int hc_cuMemAlloc (hashcat_ctx_t *hashcat_ctx, CUdeviceptr *dptr, size_t bytesize);
int hc_cuMemcpyDtoD (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount);
int hc_cuMemcpyDtoDAsync (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
int hc_cuMemcpyDtoH (hashcat_ctx_t *hashcat_ctx, void *dstHost, CUdeviceptr srcDevice, size_t ByteCount);
int hc_cuMemcpyDtoHAsync (hashcat_ctx_t *hashcat_ctx, void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
int hc_cuMemcpyHtoD (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount);
int hc_cuMemcpyHtoDAsync (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream);
int hc_cuMemFree (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dptr);
int hc_cuModuleGetFunction (hashcat_ctx_t *hashcat_ctx, CUfunction *hfunc, CUmodule hmod, const char *name);
int hc_cuModuleLoadDataEx (hashcat_ctx_t *hashcat_ctx, CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues);

@@ -1029,8 +1029,11 @@ typedef CUresult (CUDA_API_CALL *CUDA_CULAUNCHKERNEL) (CUfunction, uns
typedef CUresult (CUDA_API_CALL *CUDA_CUMEMALLOC) (CUdeviceptr *, size_t);
typedef CUresult (CUDA_API_CALL *CUDA_CUMEMALLOCHOST) (void **, size_t);
typedef CUresult (CUDA_API_CALL *CUDA_CUMEMCPYDTOD) (CUdeviceptr, CUdeviceptr, size_t);
typedef CUresult (CUDA_API_CALL *CUDA_CUMEMCPYDTODASYNC) (CUdeviceptr, CUdeviceptr, size_t, CUstream);
typedef CUresult (CUDA_API_CALL *CUDA_CUMEMCPYDTOH) (void *, CUdeviceptr, size_t);
typedef CUresult (CUDA_API_CALL *CUDA_CUMEMCPYDTOHASYNC) (void *, CUdeviceptr, size_t, CUstream);
typedef CUresult (CUDA_API_CALL *CUDA_CUMEMCPYHTOD) (CUdeviceptr, const void *, size_t);
typedef CUresult (CUDA_API_CALL *CUDA_CUMEMCPYHTODASYNC) (CUdeviceptr, const void *, size_t, CUstream);
typedef CUresult (CUDA_API_CALL *CUDA_CUMEMFREE) (CUdeviceptr);
typedef CUresult (CUDA_API_CALL *CUDA_CUMEMFREEHOST) (void *);
typedef CUresult (CUDA_API_CALL *CUDA_CUMEMGETINFO) (size_t *, size_t *);
@@ -1091,8 +1094,11 @@ typedef struct hc_cuda_lib
CUDA_CUMEMALLOC cuMemAlloc;
CUDA_CUMEMALLOCHOST cuMemAllocHost;
CUDA_CUMEMCPYDTOD cuMemcpyDtoD;
CUDA_CUMEMCPYDTODASYNC cuMemcpyDtoDAsync;
CUDA_CUMEMCPYDTOH cuMemcpyDtoH;
CUDA_CUMEMCPYDTOHASYNC cuMemcpyDtoHAsync;
CUDA_CUMEMCPYHTOD cuMemcpyHtoD;
CUDA_CUMEMCPYHTODASYNC cuMemcpyHtoDAsync;
CUDA_CUMEMFREE cuMemFree;
CUDA_CUMEMFREEHOST cuMemFreeHost;
CUDA_CUMEMGETINFO cuMemGetInfo;

@@ -1255,8 +1255,11 @@ int cuda_init (hashcat_ctx_t *hashcat_ctx)
HC_LOAD_FUNC_CUDA (cuda, cuMemAlloc, cuMemAlloc_v2, CUDA_CUMEMALLOC, CUDA, 1);
HC_LOAD_FUNC_CUDA (cuda, cuMemAllocHost, cuMemAllocHost_v2, CUDA_CUMEMALLOCHOST, CUDA, 1);
HC_LOAD_FUNC_CUDA (cuda, cuMemcpyDtoD, cuMemcpyDtoD_v2, CUDA_CUMEMCPYDTOD, CUDA, 1);
HC_LOAD_FUNC_CUDA (cuda, cuMemcpyDtoDAsync, cuMemcpyDtoDAsync_v2, CUDA_CUMEMCPYDTODASYNC, CUDA, 1);
HC_LOAD_FUNC_CUDA (cuda, cuMemcpyDtoH, cuMemcpyDtoH_v2, CUDA_CUMEMCPYDTOH, CUDA, 1);
HC_LOAD_FUNC_CUDA (cuda, cuMemcpyDtoHAsync, cuMemcpyDtoHAsync_v2, CUDA_CUMEMCPYDTOHASYNC, CUDA, 1);
HC_LOAD_FUNC_CUDA (cuda, cuMemcpyHtoD, cuMemcpyHtoD_v2, CUDA_CUMEMCPYHTOD, CUDA, 1);
HC_LOAD_FUNC_CUDA (cuda, cuMemcpyHtoDAsync, cuMemcpyHtoDAsync_v2, CUDA_CUMEMCPYHTODASYNC, CUDA, 1);
HC_LOAD_FUNC_CUDA (cuda, cuMemFree, cuMemFree_v2, CUDA_CUMEMFREE, CUDA, 1);
HC_LOAD_FUNC_CUDA (cuda, cuMemFreeHost, cuMemFreeHost, CUDA_CUMEMFREEHOST, CUDA, 1);
HC_LOAD_FUNC_CUDA (cuda, cuMemGetInfo, cuMemGetInfo_v2, CUDA_CUMEMGETINFO, CUDA, 1);
@@ -1708,6 +1711,33 @@ int hc_cuMemcpyDtoH (hashcat_ctx_t *hashcat_ctx, void *dstHost, CUdeviceptr srcD
return 0;
}
int hc_cuMemcpyDtoHAsync (hashcat_ctx_t *hashcat_ctx, void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream)
{
backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda;
const CUresult CU_err = cuda->cuMemcpyDtoHAsync (dstHost, srcDevice, ByteCount, hStream);
if (CU_err != CUDA_SUCCESS)
{
const char *pStr = NULL;
if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS)
{
event_log_error (hashcat_ctx, "cuMemcpyDtoHAsync(): %s", pStr);
}
else
{
event_log_error (hashcat_ctx, "cuMemcpyDtoHAsync(): %d", CU_err);
}
return -1;
}
return 0;
}
int hc_cuMemcpyDtoD (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount)
{
backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
@@ -1735,6 +1765,33 @@ int hc_cuMemcpyDtoD (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dstDevice, CUdevice
return 0;
}
int hc_cuMemcpyDtoDAsync (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream)
{
backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda;
const CUresult CU_err = cuda->cuMemcpyDtoDAsync (dstDevice, srcDevice, ByteCount, hStream);
if (CU_err != CUDA_SUCCESS)
{
const char *pStr = NULL;
if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS)
{
event_log_error (hashcat_ctx, "cuMemcpyDtoDAsync(): %s", pStr);
}
else
{
event_log_error (hashcat_ctx, "cuMemcpyDtoDAsync(): %d", CU_err);
}
return -1;
}
return 0;
}
int hc_cuMemcpyHtoD (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount)
{
backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
@@ -1762,6 +1819,33 @@ int hc_cuMemcpyHtoD (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dstDevice, const vo
return 0;
}
int hc_cuMemcpyHtoDAsync (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream)
{
backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda;
const CUresult CU_err = cuda->cuMemcpyHtoDAsync (dstDevice, srcHost, ByteCount, hStream);
if (CU_err != CUDA_SUCCESS)
{
const char *pStr = NULL;
if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS)
{
event_log_error (hashcat_ctx, "cuMemcpyHtoDAsync(): %s", pStr);
}
else
{
event_log_error (hashcat_ctx, "cuMemcpyHtoDAsync(): %d", CU_err);
}
return -1;
}
return 0;
}
int hc_cuModuleGetFunction (hashcat_ctx_t *hashcat_ctx, CUfunction *hfunc, CUmodule hmod, const char *name)
{
backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
@@ -4878,7 +4962,7 @@ int choose_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param,
if (device_param->is_cuda == true)
{
if (run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_tmps, device_param->size_tmps) == -1) return -1;
}
if (device_param->is_hip == true)
@@ -4896,7 +4980,7 @@ int choose_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param,
{
if (device_param->is_cuda == true)
{
if (run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_hooks, pws_cnt * hashconfig->hook_size) == -1) return -1;
}
if (device_param->is_hip == true)
@@ -5009,6 +5093,7 @@ int run_cuda_kernel_memset (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *devic
{
const u64 num16d = size / 16;
const u64 num16m = size % 16;
u32 tmp[4];
if (num16d)
{
@@ -5023,24 +5108,20 @@ int run_cuda_kernel_memset (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *devic
CUfunction function = device_param->cuda_function_memset;
if (hc_cuLaunchKernel (hashcat_ctx, function, num_elements, 1, 1, kernel_threads, 1, 1, 0, device_param->cuda_stream, device_param->kernel_params_memset, NULL) == -1) return -1;
if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1;
}
if (num16m)
{
u32 tmp[4];
tmp[0] = value;
tmp[1] = value;
tmp[2] = value;
tmp[3] = value;
// Apparently are allowed to do this: https://devtalk.nvidia.com/default/topic/761515/how-to-copy-to-device-memory-with-offset-/
if (hc_cuMemcpyHtoD (hashcat_ctx, buf + (num16d * 16), tmp, num16m) == -1) return -1;
if (hc_cuMemcpyHtoDAsync (hashcat_ctx, buf + (num16d * 16), tmp, num16m, device_param->cuda_stream) == -1) return -1;
}
if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1;
return 0;
}
@@ -5048,6 +5129,7 @@ int run_cuda_kernel_bzero (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device
{
const u64 num16d = size / 16;
const u64 num16m = size % 16;
u32 tmp[4];
if (num16d)
{
@@ -5061,22 +5143,20 @@ int run_cuda_kernel_bzero (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device
CUfunction function = device_param->cuda_function_bzero;
if (hc_cuLaunchKernel (hashcat_ctx, function, num_elements, 1, 1, kernel_threads, 1, 1, 0, device_param->cuda_stream, device_param->kernel_params_bzero, NULL) == -1) return -1;
if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1;
}
if (num16m)
{
u32 tmp[4];
tmp[0] = 0;
tmp[1] = 0;
tmp[2] = 0;
tmp[3] = 0;
if (hc_cuMemcpyHtoD (hashcat_ctx, buf + (num16d * 16), tmp, num16m) == -1) return -1;
if (hc_cuMemcpyHtoDAsync (hashcat_ctx, buf + (num16d * 16), tmp, num16m, device_param->cuda_stream) == -1) return -1;
}
if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1;
return 0;
}
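
The memset and bzero changes above keep the kernel launch and the tail copy on the same stream and synchronize only once at the end of the function; note that tmp is hoisted to function scope so the stack buffer is still alive when the now-asynchronous copy actually runs. A standalone sketch of the same idea against the raw CUDA driver API follows; the function name and signature are made up for illustration:

// Sketch of the "partially async" fill pattern: the 16-byte-aligned bulk
// would be handled by a memset/bzero kernel launched on `stream` (not shown);
// the leftover tail bytes are written with an async host->device copy at an
// offset into the same allocation, and one sync covers both operations.
#include <cuda.h>
#include <string.h>

static CUresult fill_tail_async (CUdeviceptr buf, size_t size, unsigned char value, CUstream stream)
{
  const size_t num16d = size / 16;   // full 16-byte chunks, done by the kernel
  const size_t num16m = size % 16;   // leftover bytes, done here

  unsigned char tmp[16];

  memset (tmp, value, sizeof (tmp));

  if (num16m)
  {
    // offsetting a CUdeviceptr like this is valid; see the devtalk link in the diff
    const CUresult err = cuMemcpyHtoDAsync (buf + (num16d * 16), tmp, num16m, stream);

    if (err != CUDA_SUCCESS) return err;
  }

  // tmp lives on the stack, so it must stay in scope until the stream drains
  return cuStreamSynchronize (stream);
}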
