OpenCL Runtime: Unlocked maximum thread count for NVIDIA GPU

2025-07-28 09:28:18 +00:00 · 2020-01-24 13:24:19 +01:00 · 2020-01-24 13:24:19 +01:00 · 041a777025
commit 041a777025
parent ccacc508cb
2 changed files with 20 additions and 3 deletions
--- a/docs/changes.txt
+++ b/docs/changes.txt
@@ -108,7 +108,7 @@
 - OpenCL Runtime: Improve ROCm detection and make sure to not confuse with recent AMDGPU drivers
 - OpenCL Runtime: Not using amd_bytealign (amd_bitalign is fine) on AMDGPU driver drastically reduces JiT segfaults
 - OpenCL Runtime: Reenabled support for Intel GPU OpenCL runtime (Beignet and NEO) because a workaround was found (force -cl-std=CL2.0)
- OpenCL Runtime: Unlocked maximum thread count
+- OpenCL Runtime: Unlocked maximum thread count for NVIDIA GPU
 - OpenCL Runtime: Update unstable mode warnings for Apple and AMDGPU drivers
 - OpenCL Runtime: Workaround JiT compiler error on AMDGPU driver compiling WPA-EAPOL-PBKDF2 OpenCL kernel
 - OpenCL Runtime: Workaround JiT compiler error on ROCm 2.3 driver if the 'inline' keyword is used in function declaration
--- a/src/backend.c
+++ b/src/backend.c
@@ -6722,14 +6722,31 @@ static u32 get_kernel_threads (const hc_device_param_t *device_param)

  kernel_threads_max = MIN (kernel_threads_max, device_maxworkgroup_size);

-  // for CPU we just do 1 ...
-
  if (device_param->opencl_device_type & CL_DEVICE_TYPE_CPU)
  {
+    // for all CPU we just do 1 ...
+
    const u32 cpu_prefered_thread_count = 1;

    kernel_threads_max = MIN (kernel_threads_max, cpu_prefered_thread_count);
  }
+  else if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU)
+  {
+    // for GPU we need to distinguish by vendor
+
+    if (device_param->opencl_device_vendor_id == VENDOR_ID_INTEL_SDK)
+    {
+      const u32 gpu_prefered_thread_count = 8;
+
+      kernel_threads_max = MIN (kernel_threads_max, gpu_prefered_thread_count);
+    }
+    else if (device_param->opencl_device_vendor_id == VENDOR_ID_AMD)
+    {
+      const u32 gpu_prefered_thread_count = 64;
+
+      kernel_threads_max = MIN (kernel_threads_max, gpu_prefered_thread_count);
+    }
+  }

  // this is intentional! at this point, kernel_threads_min can be higher than kernel_threads_max.
  // in this case we actually want kernel_threads_min selected.