Fixed race condition resulting in out of memory error on startup if multiple hashcat instances are started at the same time

2025-06-30 20:02:41 +00:00 · 2020-08-14 09:04:52 +02:00 · 2020-08-14 09:04:52 +02:00 · e21463da4b
commit e21463da4b
parent 6d5e1d3e5d
2 changed files with 307 additions and 254 deletions
--- a/docs/changes.txt
+++ b/docs/changes.txt
@ -12,6 +12,13 @@

 - Fixed too early execution of some module functions which could make use of non-final values opts_type and opti_type
 - Fixed internal access on module option attribute OPTS_TYPE_SUGGEST_KG with the result that it was unused
+- Fixed race condition resulting in out of memory error on startup if multiple hashcat instances are started at the same time
+
+##
+## Improvements
+##
+
+- Startup time: Improved the startup time by avoiding some time intensive operations for skipped devices

 * changes v6.1.0 -> v6.1.1

--- a/src/backend.c
+++ b/src/backend.c
@ -5540,7 +5540,7 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
        device_param->skipped = true;
      }

-      // some attributes have to be hardcoded because they are used for instance in the build options
+      // some attributes have to be hardcoded values because they are used for instance in the build options

      device_param->device_local_mem_type     = CL_LOCAL;
      device_param->opencl_device_type        = CL_DEVICE_TYPE_GPU;
@ -5616,11 +5616,7 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
        cuda_devices_active++;
      }

-      CUcontext cuda_context;
-
-      if (hc_cuCtxCreate (hashcat_ctx, &cuda_context, CU_CTX_SCHED_BLOCKING_SYNC, device_param->cuda_device) == -1) return -1;
-
-      if (hc_cuCtxSetCurrent (hashcat_ctx, cuda_context) == -1) return -1;
+      // instruction set

      // bcrypt optimization?
      //const int rc_cuCtxSetCacheConfig = hc_cuCtxSetCacheConfig (hashcat_ctx, CU_FUNC_CACHE_PREFER_SHARED);
@ -5638,47 +5634,14 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
      device_param->has_mov64 = (sm >= 10) ? true : false;
      device_param->has_prmt  = (sm >= 20) ? true : false;

-      /*
-      #define RUN_INSTRUCTION_CHECKS()                                                                                                                                                                                                                      \
-        device_param->has_add   = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"add.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }");                                                              \
-        device_param->has_addc  = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"addc.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }");                                                             \
-        device_param->has_sub   = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"sub.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }");                                                              \
-        device_param->has_subc  = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"subc.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }");                                                             \
-        device_param->has_bfe   = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"bfe.u32 %0, 0, 0, 0;\" : \"=r\"(r)); }");                                                              \
-        device_param->has_lop3  = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"lop3.b32 %0, 0, 0, 0, 0;\" : \"=r\"(r)); }");                                                          \
-        device_param->has_mov64 = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned long long r; unsigned int a; unsigned int b; asm volatile (\"mov.b64 %0, {%1, %2};\" : \"=l\"(r) : \"r\"(a), \"r\"(b)); }");  \
-        device_param->has_prmt  = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"prmt.b32 %0, 0, 0, 0;\" : \"=r\"(r)); }");                                                             \
-
-      if (backend_devices_idx > 0)
-      {
-        hc_device_param_t *device_param_prev = &devices_param[backend_devices_idx - 1];
-
-        if (is_same_device_type (device_param, device_param_prev) == true)
-        {
-          device_param->has_add   = device_param_prev->has_add;
-          device_param->has_addc  = device_param_prev->has_addc;
-          device_param->has_sub   = device_param_prev->has_sub;
-          device_param->has_subc  = device_param_prev->has_subc;
-          device_param->has_bfe   = device_param_prev->has_bfe;
-          device_param->has_lop3  = device_param_prev->has_lop3;
-          device_param->has_mov64 = device_param_prev->has_mov64;
-          device_param->has_prmt  = device_param_prev->has_prmt;
-        }
-        else
-        {
-          RUN_INSTRUCTION_CHECKS();
-        }
-      }
-      else
-      {
-        RUN_INSTRUCTION_CHECKS();
-      }
-
-      #undef RUN_INSTRUCTION_CHECKS
-      */
-
      // device_available_mem

+      CUcontext cuda_context;
+
+      if (hc_cuCtxCreate (hashcat_ctx, &cuda_context, CU_CTX_SCHED_BLOCKING_SYNC, device_param->cuda_device) == -1) return -1;
+
+      if (hc_cuCtxSetCurrent (hashcat_ctx, cuda_context) == -1) return -1;
+
      size_t free  = 0;
      size_t total = 0;

@ -6269,6 +6232,25 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
          }
        }

+        // instruction set
+
+        // fixed values work only for NVIDIA devices
+        // for AMD devices the values are determined dynamically, see the time intensive section below
+
+        if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) && (device_param->opencl_platform_vendor_id == VENDOR_ID_NV))
+        {
+          const int sm = (device_param->sm_major * 10) + device_param->sm_minor;
+
+          device_param->has_add   = (sm >= 12) ? true : false;
+          device_param->has_addc  = (sm >= 12) ? true : false;
+          device_param->has_sub   = (sm >= 12) ? true : false;
+          device_param->has_subc  = (sm >= 12) ? true : false;
+          device_param->has_bfe   = (sm >= 20) ? true : false;
+          device_param->has_lop3  = (sm >= 50) ? true : false;
+          device_param->has_mov64 = (sm >= 10) ? true : false;
+          device_param->has_prmt  = (sm >= 20) ? true : false;
+        }
+
        // common driver check

        if (device_param->skipped == false)
@ -6432,6 +6414,130 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)

          opencl_devices_active++;
        }
+      }
+    }
+  }
+
+  backend_ctx->opencl_devices_cnt     = opencl_devices_cnt;
+  backend_ctx->opencl_devices_active  = opencl_devices_active;
+
+  // all devices combined go into backend_* variables
+
+  backend_ctx->backend_devices_cnt    = cuda_devices_cnt    + opencl_devices_cnt;
+  backend_ctx->backend_devices_active = cuda_devices_active + opencl_devices_active;
+
+  // find duplicate devices
+
+  //if ((cuda_devices_cnt > 0) && (opencl_devices_cnt > 0))
+  //{
+    // using force here enables both devices, which is the worst possible outcome
+    // many users force by default, so this is not a good idea
+
+    //if (user_options->force == false)
+    //{
+    backend_ctx_find_alias_devices (hashcat_ctx);
+    //{
+  //}
+
+  if (backend_ctx->backend_devices_active == 0)
+  {
+    event_log_error (hashcat_ctx, "No devices found/left.");
+
+    return -1;
+  }
+
+  // now we can calculate the number of parallel running hook threads based on
+  // the number of cpu cores and the number of active compute devices
+  // unless overwritten by the user
+
+  if (user_options->hook_threads == HOOK_THREADS)
+  {
+    const u32 processor_count = hc_get_processor_count ();
+
+    const u32 processor_count_cu = CEILDIV (processor_count, backend_ctx->backend_devices_active); // should never reach 0
+
+    user_options->hook_threads = processor_count_cu;
+  }
+
+  // additional check to see if the user has chosen a device that is not within the range of available devices (i.e. larger than devices_cnt)
+
+  if (backend_ctx->backend_devices_filter != (u64) -1)
+  {
+    const u64 backend_devices_cnt_mask = ~(((u64) -1 >> backend_ctx->backend_devices_cnt) << backend_ctx->backend_devices_cnt);
+
+    if (backend_ctx->backend_devices_filter > backend_devices_cnt_mask)
+    {
+      event_log_error (hashcat_ctx, "An invalid device was specified using the --backend-devices parameter.");
+      event_log_error (hashcat_ctx, "The specified device was higher than the number of available devices (%u).", backend_ctx->backend_devices_cnt);
+
+      return -1;
+    }
+  }
+
+  // time or resource intensive operations which we do not run if the corresponding device was skipped by the user
+
+  if (backend_ctx->cuda)
+  {
+    // instruction test for cuda devices was replaced with fixed values (see above)
+
+    /*
+    CUcontext cuda_context;
+
+    if (hc_cuCtxCreate (hashcat_ctx, &cuda_context, CU_CTX_SCHED_BLOCKING_SYNC, device_param->cuda_device) == -1) return -1;
+
+    if (hc_cuCtxSetCurrent (hashcat_ctx, cuda_context) == -1) return -1;
+
+    #define RUN_INSTRUCTION_CHECKS()                                                                                                                                                                                                                      \
+      device_param->has_add   = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"add.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }");                                                              \
+      device_param->has_addc  = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"addc.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }");                                                             \
+      device_param->has_sub   = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"sub.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }");                                                              \
+      device_param->has_subc  = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"subc.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }");                                                             \
+      device_param->has_bfe   = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"bfe.u32 %0, 0, 0, 0;\" : \"=r\"(r)); }");                                                              \
+      device_param->has_lop3  = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"lop3.b32 %0, 0, 0, 0, 0;\" : \"=r\"(r)); }");                                                          \
+      device_param->has_mov64 = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned long long r; unsigned int a; unsigned int b; asm volatile (\"mov.b64 %0, {%1, %2};\" : \"=l\"(r) : \"r\"(a), \"r\"(b)); }");  \
+      device_param->has_prmt  = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"prmt.b32 %0, 0, 0, 0;\" : \"=r\"(r)); }");                                                             \
+
+    if (backend_devices_idx > 0)
+    {
+      hc_device_param_t *device_param_prev = &devices_param[backend_devices_idx - 1];
+
+      if (is_same_device_type (device_param, device_param_prev) == true)
+      {
+        device_param->has_add   = device_param_prev->has_add;
+        device_param->has_addc  = device_param_prev->has_addc;
+        device_param->has_sub   = device_param_prev->has_sub;
+        device_param->has_subc  = device_param_prev->has_subc;
+        device_param->has_bfe   = device_param_prev->has_bfe;
+        device_param->has_lop3  = device_param_prev->has_lop3;
+        device_param->has_mov64 = device_param_prev->has_mov64;
+        device_param->has_prmt  = device_param_prev->has_prmt;
+      }
+      else
+      {
+        RUN_INSTRUCTION_CHECKS();
+      }
+    }
+    else
+    {
+      RUN_INSTRUCTION_CHECKS();
+    }
+
+    #undef RUN_INSTRUCTION_CHECKS
+
+    if (hc_cuCtxDestroy (hashcat_ctx, cuda_context) == -1) return -1;
+
+    */
+  }
+
+  if (backend_ctx->ocl)
+  {
+    for (int backend_devices_cnt = 0; backend_devices_cnt < backend_ctx->backend_devices_cnt; backend_devices_cnt++)
+    {
+      hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_cnt];
+
+      if (device_param->is_opencl == false) continue;
+
+      if (device_param->skipped == true) continue;

      /**
       * create context for each device
@ -6459,6 +6565,8 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)

      if (hc_clCreateCommandQueue (hashcat_ctx, context, device_param->opencl_device, 0, &command_queue) == -1) return -1;

+      // instruction set
+
      if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) && (device_param->opencl_platform_vendor_id == VENDOR_ID_AMD))
      {
        #define RUN_INSTRUCTION_CHECKS()
@ -6507,16 +6615,7 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)

      if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) && (device_param->opencl_platform_vendor_id == VENDOR_ID_NV))
      {
-          const int sm = (device_param->sm_major * 10) + device_param->sm_minor;
-
-          device_param->has_add   = (sm >= 12) ? true : false;
-          device_param->has_addc  = (sm >= 12) ? true : false;
-          device_param->has_sub   = (sm >= 12) ? true : false;
-          device_param->has_subc  = (sm >= 12) ? true : false;
-          device_param->has_bfe   = (sm >= 20) ? true : false;
-          device_param->has_lop3  = (sm >= 50) ? true : false;
-          device_param->has_mov64 = (sm >= 10) ? true : false;
-          device_param->has_prmt  = (sm >= 20) ? true : false;
+        // replaced with fixed values, see the non time intensive section above

        /*
        #define RUN_INSTRUCTION_CHECKS()                                                                                                                                                                                                          \
@ -6558,7 +6657,10 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
        */
      }

-        // device_available_mem
+      // available device memory
+      // This test causes a GPU memory usage spike.
+      // In case there are multiple hashcat instances starting at the same time this will cause GPU out of memory errors which otherwise would not exist.
+      // We will simply not run it if that device was skipped by the user.

      #define MAX_ALLOC_CHECKS_CNT  8192
      #define MAX_ALLOC_CHECKS_SIZE (64 * 1024 * 1024)
@ -6618,6 +6720,7 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
        }

        device_param->device_available_mem = MAX_ALLOC_CHECKS_SIZE;
+
        if (c > 0)
        {
          device_param->device_available_mem *= c;
@ -6643,63 +6746,6 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
      hc_clReleaseContext (hashcat_ctx, context);
    }
  }
-  }
-
-  backend_ctx->opencl_devices_cnt     = opencl_devices_cnt;
-  backend_ctx->opencl_devices_active  = opencl_devices_active;
-
-  // all devices combined go into backend_* variables
-
-  backend_ctx->backend_devices_cnt    = cuda_devices_cnt    + opencl_devices_cnt;
-  backend_ctx->backend_devices_active = cuda_devices_active + opencl_devices_active;
-
-  // find duplicate devices
-
-  //if ((cuda_devices_cnt > 0) && (opencl_devices_cnt > 0))
-  //{
-    // using force here enables both devices, which is the worst possible outcome
-    // many users force by default, so this is not a good idea
-
-    //if (user_options->force == false)
-    //{
-    backend_ctx_find_alias_devices (hashcat_ctx);
-    //{
-  //}
-
-  if (backend_ctx->backend_devices_active == 0)
-  {
-    event_log_error (hashcat_ctx, "No devices found/left.");
-
-    return -1;
-  }
-
-  // now we can calculate the number of parallel running hook threads based on
-  // the number cpu cores and the number of active compute devices
-  // unless overwritten by the user
-
-  if (user_options->hook_threads == HOOK_THREADS)
-  {
-    const u32 processor_count = hc_get_processor_count ();
-
-    const u32 processor_count_cu = CEILDIV (processor_count, backend_ctx->backend_devices_active); // should never reach 0
-
-    user_options->hook_threads = processor_count_cu;
-  }
-
-  // additional check to see if the user has chosen a device that is not within the range of available devices (i.e. larger than devices_cnt)
-
-  if (backend_ctx->backend_devices_filter != (u64) -1)
-  {
-    const u64 backend_devices_cnt_mask = ~(((u64) -1 >> backend_ctx->backend_devices_cnt) << backend_ctx->backend_devices_cnt);
-
-    if (backend_ctx->backend_devices_filter > backend_devices_cnt_mask)
-    {
-      event_log_error (hashcat_ctx, "An invalid device was specified using the --backend-devices parameter.");
-      event_log_error (hashcat_ctx, "The specified device was higher than the number of available devices (%u).", backend_ctx->backend_devices_cnt);
-
-      return -1;
-    }
-  }

  backend_ctx->target_msec  = TARGET_MSEC_PROFILE[user_options->workload_profile - 1];