From 76e388ea47d1193c540b4d9889ad6f217a297511 Mon Sep 17 00:00:00 2001
From: Jens Steube <jens.steube@gmail.com>
Date: Wed, 18 Aug 2021 16:10:51 +0200
Subject: [PATCH] Memory Management: Check in OpenCL that
 CL_DEVICE_HOST_UNIFIED_MEMORY is true and if so, then half of the available
 memory NEO driver: Activate the NEO driver (Intel GPU) after it passed
 several tests in hashcat using the latest NEO driver version

---
 docs/changes.txt |  2 ++
 include/types.h  |  1 +
 src/backend.c    | 39 +++++++++++++++++++++++++++++----------
 3 files changed, 32 insertions(+), 10 deletions(-)

diff --git a/docs/changes.txt b/docs/changes.txt
index 9e6862f00..979eb5c9b 100644
--- a/docs/changes.txt
+++ b/docs/changes.txt
@@ -65,6 +65,8 @@
 - HIP Kernels: Dependency on hip/hip runtime.h has been removed to enable easier integration of the HIP backend under Windows
 - Kernel cache: Add kernel threads for hash calculation, which will later be used in the file name of the kernel cache
 - Memory Management: Refactored the code responsible for limiting kernel accel with the goal to avoid low host memory situations
+- Memory Management: Check in OpenCL that CL_DEVICE_HOST_UNIFIED_MEMORY is true and if so, then half of the available memory
+- NEO driver: Activate the NEO driver (Intel GPU) after it passed several tests in hashcat using the latest NEO driver version
 - OpenCL Runtime: Workaround for Intel OpenCL runtime: segmentation fault when compiling hc_enc_next() / hc_enc_next_global()
 - RC4 Kernels: Use improved native thread derivation for RC4-based hash modes 7500, 13100, 18200, 25400
 - Shared Memory: Calculate the dynamic memory size of the kernel based on CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN
diff --git a/include/types.h b/include/types.h
index e180da4e8..0a9ac6f92 100644
--- a/include/types.h
+++ b/include/types.h
@@ -1111,6 +1111,7 @@ typedef struct hc_device_param
   u64     device_maxmem_alloc;
   u64     device_global_mem;
   u64     device_available_mem;
+  int     device_host_unified_memory;
   u32     device_maxclock_frequency;
   size_t  device_maxworkgroup_size;
   u64     device_local_mem_size;
diff --git a/src/backend.c b/src/backend.c
index 8c1da3e38..65336dd5e 100644
--- a/src/backend.c
+++ b/src/backend.c
@@ -8618,6 +8618,18 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
 
         device_param->device_processors = device_processors;
 
+        // device_host_unified_memory
+
+        cl_bool device_host_unified_memory = false;
+
+        if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof (device_host_unified_memory), &device_host_unified_memory, NULL) == -1)
+        {
+          device_param->skipped = true;
+          continue;
+        }
+
+        device_param->device_host_unified_memory = (device_host_unified_memory == CL_TRUE) ? 1 : 0;
+
         // device_global_mem
 
         cl_ulong device_global_mem = 0;
@@ -8644,6 +8656,13 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
 
         device_param->device_maxmem_alloc = device_maxmem_alloc;
 
+        if (device_param->device_host_unified_memory == 1)
+        {
+          // so, we actually have only half the memory because we need the same buffers on host side
+
+          device_param->device_maxmem_alloc /= 2;
+        }
+
         // note we'll limit to 2gb, otherwise this causes all kinds of weird errors because of possible integer overflows in opencl runtimes
         // testwise disabling that
         //device_param->device_maxmem_alloc = MIN (device_maxmem_alloc, 0x7fffffff);
@@ -8914,14 +8933,11 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
 
         lowercase ((u8 *) opencl_device_version_lower, strlen (opencl_device_version_lower));
 
-        if ((strstr (opencl_device_version_lower, "neo "))
-         || (strstr (opencl_device_version_lower, " neo"))
-         || (strstr (opencl_device_version_lower, "beignet "))
+        if ((strstr (opencl_device_version_lower, "beignet "))
          || (strstr (opencl_device_version_lower, " beignet"))
          || (strstr (opencl_device_version_lower, "mesa "))
          || (strstr (opencl_device_version_lower, " mesa")))
         {
-          // NEO:     https://github.com/hashcat/hashcat/issues/2342
           // BEIGNET: https://github.com/hashcat/hashcat/issues/2243
           // MESA:    https://github.com/hashcat/hashcat/issues/2269
 
@@ -9694,11 +9710,7 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
 
       device_param->device_available_mem = device_param->device_global_mem - MAX_ALLOC_CHECKS_SIZE;
 
-      #if defined (_WIN)
-      if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) && (device_param->opencl_platform_vendor_id == VENDOR_ID_NV))
-      #else
-      if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) && ((device_param->opencl_platform_vendor_id == VENDOR_ID_NV) || (device_param->opencl_platform_vendor_id == VENDOR_ID_AMD)))
-      #endif
+      if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU)
       {
         // OK, so the problem here is the following:
         // There's just CL_DEVICE_GLOBAL_MEM_SIZE to ask OpenCL about the total memory on the device,
@@ -9771,6 +9783,13 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
       hc_clReleaseCommandQueue (hashcat_ctx, command_queue);
 
       hc_clReleaseContext (hashcat_ctx, context);
+
+      if (device_param->device_host_unified_memory == 1)
+      {
+        // so, we actually have only half the memory because we need the same buffers on host side
+
+        device_param->device_available_mem /= 2;
+      }
     }
   }
 
@@ -14614,7 +14633,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
 
       u64 EXTRA_SPACE = (1024ULL * 1024ULL) * kernel_accel_max;
 
-      EXTRA_SPACE = MAX (EXTRA_SPACE, (  64ULL * 1024ULL * 1024ULL));
+      EXTRA_SPACE = MAX (EXTRA_SPACE, ( 256ULL * 1024ULL * 1024ULL));
       EXTRA_SPACE = MIN (EXTRA_SPACE, (1024ULL * 1024ULL * 1024ULL));
 
       if ((size_pws   + EXTRA_SPACE) > device_param->device_maxmem_alloc) memory_limit_hit = 1;