OpenCL Runtime: Allow the kernel to access post-48k shared memory region on CUDA. Requires both module and kernel preparation

pull/2306/head
Jens Steube 4 years ago
parent 1fc37c25f9
commit aef53f7e10

@ -356,6 +356,10 @@ CONSTANT_VK u32a c_pbox[18] =
L ^= P[17]; \
}
#ifdef DYNAMIC_LOCAL
extern __shared__ u32 lm[];
#endif
DECLSPEC void expand_key (u32 *E, u32 *W, const int len)
{
u8 *E_ptr = (u8 *) E;
@ -456,6 +460,12 @@ KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m03200_init (KERN_ATTR_TMPS
P[i] = c_pbox[i];
}
#ifdef DYNAMIC_LOCAL
u32 *S0 = lm + (lid * 1024) + 0;
u32 *S1 = lm + (lid * 1024) + 256;
u32 *S2 = lm + (lid * 1024) + 512;
u32 *S3 = lm + (lid * 1024) + 768;
#else
LOCAL_VK u32 S0_all[FIXED_LOCAL_SIZE][256];
LOCAL_VK u32 S1_all[FIXED_LOCAL_SIZE][256];
LOCAL_VK u32 S2_all[FIXED_LOCAL_SIZE][256];
@ -465,6 +475,7 @@ KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m03200_init (KERN_ATTR_TMPS
LOCAL_AS u32 *S1 = S1_all[lid];
LOCAL_AS u32 *S2 = S2_all[lid];
LOCAL_AS u32 *S3 = S3_all[lid];
#endif
for (u32 i = 0; i < 256; i++)
{
@ -614,6 +625,12 @@ KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m03200_loop (KERN_ATTR_TMPS
P[i] = tmps[gid].P[i];
}
#ifdef DYNAMIC_LOCAL
u32 *S0 = lm + (lid * 1024) + 0;
u32 *S1 = lm + (lid * 1024) + 256;
u32 *S2 = lm + (lid * 1024) + 512;
u32 *S3 = lm + (lid * 1024) + 768;
#else
LOCAL_VK u32 S0_all[FIXED_LOCAL_SIZE][256];
LOCAL_VK u32 S1_all[FIXED_LOCAL_SIZE][256];
LOCAL_VK u32 S2_all[FIXED_LOCAL_SIZE][256];
@ -623,6 +640,7 @@ KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m03200_loop (KERN_ATTR_TMPS
LOCAL_AS u32 *S1 = S1_all[lid];
LOCAL_AS u32 *S2 = S2_all[lid];
LOCAL_AS u32 *S3 = S3_all[lid];
#endif
for (u32 i = 0; i < 256; i++)
{
@ -799,6 +817,12 @@ KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m03200_comp (KERN_ATTR_TMPS
P[i] = tmps[gid].P[i];
}
#ifdef DYNAMIC_LOCAL
u32 *S0 = lm + (lid * 1024) + 0;
u32 *S1 = lm + (lid * 1024) + 256;
u32 *S2 = lm + (lid * 1024) + 512;
u32 *S3 = lm + (lid * 1024) + 768;
#else
LOCAL_VK u32 S0_all[FIXED_LOCAL_SIZE][256];
LOCAL_VK u32 S1_all[FIXED_LOCAL_SIZE][256];
LOCAL_VK u32 S2_all[FIXED_LOCAL_SIZE][256];
@ -808,6 +832,7 @@ KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m03200_comp (KERN_ATTR_TMPS
LOCAL_AS u32 *S1 = S1_all[lid];
LOCAL_AS u32 *S2 = S2_all[lid];
LOCAL_AS u32 *S3 = S3_all[lid];
#endif
for (u32 i = 0; i < 256; i++)
{

@ -115,6 +115,7 @@
- OpenCL Runtime: Workaround JiT compiler error on ROCm 2.3 driver if the 'inline' keyword is used in function declaration
- OpenCL Runtime: Workaround memory allocation error on AMD driver on Windows leading to CL_MEM_OBJECT_ALLOCATION_FAILURE
- OpenCL Runtime: Workaround ROCm OpenCL driver problem trying to write temporary file into readonly folder by setting TMPDIR
- OpenCL Runtime: Allow the kernel to access post-48k shared memory region on CUDA. Requires both module and kernel preparation
- Startup Checks: Improved the pidfile check: Do not just check for existing PID but also check executable filename
- Startup Checks: Prevent the user to modify options which are overwritten automatically in benchmark mode
- Startup Screen: Add extra warning when using --force

@ -1132,6 +1132,27 @@ typedef struct hc_device_param
u64 kernel_local_mem_size_aux3;
u64 kernel_local_mem_size_aux4;
u64 kernel_dynamic_local_mem_size1;
u64 kernel_dynamic_local_mem_size12;
u64 kernel_dynamic_local_mem_size2;
u64 kernel_dynamic_local_mem_size23;
u64 kernel_dynamic_local_mem_size3;
u64 kernel_dynamic_local_mem_size4;
u64 kernel_dynamic_local_mem_size_init2;
u64 kernel_dynamic_local_mem_size_loop2;
u64 kernel_dynamic_local_mem_size_mp;
u64 kernel_dynamic_local_mem_size_mp_l;
u64 kernel_dynamic_local_mem_size_mp_r;
u64 kernel_dynamic_local_mem_size_amp;
u64 kernel_dynamic_local_mem_size_tm;
u64 kernel_dynamic_local_mem_size_memset;
u64 kernel_dynamic_local_mem_size_atinit;
u64 kernel_dynamic_local_mem_size_decompress;
u64 kernel_dynamic_local_mem_size_aux1;
u64 kernel_dynamic_local_mem_size_aux2;
u64 kernel_dynamic_local_mem_size_aux3;
u64 kernel_dynamic_local_mem_size_aux4;
u32 kernel_accel;
u32 kernel_accel_prev;
u32 kernel_accel_min;

File diff suppressed because it is too large Load Diff

@ -91,6 +91,8 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY
if (device_param->opencl_device_type & CL_DEVICE_TYPE_CPU)
{
fixed_local_size = 1;
hc_asprintf (&jit_build_options, "-D FIXED_LOCAL_SIZE=%u", fixed_local_size);
}
else
{
@ -120,15 +122,37 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY
{
fixed_local_size = (device_param->device_local_mem_size - overhead) / 4096;
}
if (device_param->is_cuda == true)
{
hc_asprintf (&jit_build_options, "-D FIXED_LOCAL_SIZE=%u -D DYNAMIC_LOCAL", fixed_local_size);
}
else
{
hc_asprintf (&jit_build_options, "-D FIXED_LOCAL_SIZE=%u", fixed_local_size);
}
}
else
{
fixed_local_size = (device_param->device_local_mem_size - overhead) / 4096;
if (device_param->is_cuda == true)
{
// using kernel_dynamic_local_mem_size_memset is a bit hackish.
// we had to brute-force this value out of an already loaded CUDA function.
// there's no official way to query for this value.
fixed_local_size = device_param->kernel_dynamic_local_mem_size_memset / 4096;
hc_asprintf (&jit_build_options, "-D FIXED_LOCAL_SIZE=%u -D DYNAMIC_LOCAL", fixed_local_size);
}
else
{
fixed_local_size = (device_param->device_local_mem_size - overhead) / 4096;
hc_asprintf (&jit_build_options, "-D FIXED_LOCAL_SIZE=%u", fixed_local_size);
}
}
}
hc_asprintf (&jit_build_options, "-D FIXED_LOCAL_SIZE=%u", fixed_local_size);
return jit_build_options;
}

Loading…
Cancel
Save