diff --git a/include/backend.h b/include/backend.h
index 057edb87f..e2ea51fb7 100644
--- a/include/backend.h
+++ b/include/backend.h
@@ -42,6 +42,7 @@ int hc_nvrtcGetPTX               (hashcat_ctx_t *hashcat_ctx, nvrtcProgram prog,
 int hc_cuCtxCreate               (hashcat_ctx_t *hashcat_ctx, CUcontext *pctx, unsigned int flags, CUdevice dev);
 int hc_cuCtxDestroy              (hashcat_ctx_t *hashcat_ctx, CUcontext ctx);
 int hc_cuCtxSetCurrent           (hashcat_ctx_t *hashcat_ctx, CUcontext ctx);
+int hc_cuCtxSetCacheConfig       (hashcat_ctx_t *hashcat_ctx, CUfunc_cache config);
 int hc_cuCtxSynchronize          (hashcat_ctx_t *hashcat_ctx);
 int hc_cuDeviceGetAttribute      (hashcat_ctx_t *hashcat_ctx, int *pi, CUdevice_attribute attrib, CUdevice dev);
 int hc_cuDeviceGetCount          (hashcat_ctx_t *hashcat_ctx, int *count);
@@ -56,6 +57,7 @@ int hc_cuEventQuery              (hashcat_ctx_t *hashcat_ctx, CUevent hEvent);
 int hc_cuEventRecord             (hashcat_ctx_t *hashcat_ctx, CUevent hEvent, CUstream hStream);
 int hc_cuEventSynchronize        (hashcat_ctx_t *hashcat_ctx, CUevent hEvent);
 int hc_cuFuncGetAttribute        (hashcat_ctx_t *hashcat_ctx, int *pi, CUfunction_attribute attrib, CUfunction hfunc);
+int hc_cuFuncSetAttribute        (hashcat_ctx_t *hashcat_ctx, CUfunction hfunc, CUfunction_attribute attrib, int value);
 int hc_cuInit                    (hashcat_ctx_t *hashcat_ctx, unsigned int Flags);
 int hc_cuLaunchKernel            (hashcat_ctx_t *hashcat_ctx, CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra);
 int hc_cuMemAlloc                (hashcat_ctx_t *hashcat_ctx, CUdeviceptr *dptr, size_t bytesize);
diff --git a/include/ext_cuda.h b/include/ext_cuda.h
index eb8967f09..49257acbb 100644
--- a/include/ext_cuda.h
+++ b/include/ext_cuda.h
@@ -1024,6 +1024,7 @@ typedef struct hc_cuda_lib
   CUDA_CUCTXGETSHAREDMEMCONFIG  cuCtxGetSharedMemConfig;
   CUDA_CUCTXPOPCURRENT          cuCtxPopCurrent;
   CUDA_CUCTXPUSHCURRENT         cuCtxPushCurrent;
+  CUDA_CUCTXSETCACHECONFIG      cuCtxSetCacheConfig;
   CUDA_CUCTXSETCURRENT          cuCtxSetCurrent;
   CUDA_CUCTXSETSHAREDMEMCONFIG  cuCtxSetSharedMemConfig;
   CUDA_CUCTXSYNCHRONIZE         cuCtxSynchronize;
diff --git a/src/backend.c b/src/backend.c
index ed88b191f..97c9d6b6e 100644
--- a/src/backend.c
+++ b/src/backend.c
@@ -886,6 +886,7 @@ int cuda_init (hashcat_ctx_t *hashcat_ctx)
   HC_LOAD_FUNC (cuda, cuCtxGetSharedMemConfig,  CUDA_CUCTXGETSHAREDMEMCONFIG,   CUDA, 1);
   HC_LOAD_FUNC (cuda, cuCtxPopCurrent,          CUDA_CUCTXPOPCURRENT,           CUDA, 1);
   HC_LOAD_FUNC (cuda, cuCtxPushCurrent,         CUDA_CUCTXPUSHCURRENT,          CUDA, 1);
+  HC_LOAD_FUNC (cuda, cuCtxSetCacheConfig,      CUDA_CUCTXSETCACHECONFIG,       CUDA, 1);
   HC_LOAD_FUNC (cuda, cuCtxSetCurrent,          CUDA_CUCTXSETCURRENT,           CUDA, 1);
   HC_LOAD_FUNC (cuda, cuCtxSetSharedMemConfig,  CUDA_CUCTXSETSHAREDMEMCONFIG,   CUDA, 1);
   HC_LOAD_FUNC (cuda, cuCtxSynchronize,         CUDA_CUCTXSYNCHRONIZE,          CUDA, 1);
@@ -1467,6 +1468,33 @@ int hc_cuFuncGetAttribute (hashcat_ctx_t *hashcat_ctx, int *pi, CUfunction_attri
   return 0;
 }
 
+int hc_cuFuncSetAttribute (hashcat_ctx_t *hashcat_ctx, CUfunction hfunc, CUfunction_attribute attrib, int value)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  CUDA_PTR *cuda = backend_ctx->cuda;
+
+  const CUresult CU_err = cuda->cuFuncSetAttribute (hfunc, attrib, value);
+
+  if (CU_err != CUDA_SUCCESS)
+  {
+    const char *pStr = NULL;
+
+    if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS)
+    {
+      event_log_error (hashcat_ctx, "cuFuncSetAttribute(): %s", pStr);
+    }
+    else
+    {
+      event_log_error (hashcat_ctx, "cuFuncSetAttribute(): %d", CU_err);
+    }
+
+    return -1;
+  }
+
+  return 0;
+}
+
 int hc_cuStreamCreate (hashcat_ctx_t *hashcat_ctx, CUstream *phStream, unsigned int Flags)
 {
   backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
@@ -1764,6 +1792,35 @@ int hc_cuEventSynchronize (hashcat_ctx_t *hashcat_ctx, CUevent hEvent)
   return 0;
 }
 
+int hc_cuCtxSetCacheConfig (hashcat_ctx_t *hashcat_ctx, CUfunc_cache config)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  CUDA_PTR *cuda = backend_ctx->cuda;
+
+  const CUresult CU_err = cuda->cuCtxSetCacheConfig (config);
+
+  if (CU_err != CUDA_SUCCESS)
+  {
+    const char *pStr = NULL;
+
+    if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS)
+    {
+      event_log_error (hashcat_ctx, "cuCtxSetCacheConfig(): %s", pStr);
+    }
+    else
+    {
+      event_log_error (hashcat_ctx, "cuCtxSetCacheConfig(): %d", CU_err);
+    }
+
+    return -1;
+  }
+
+  return 0;
+}
+
+
+
 // OpenCL
 
 int ocl_init (hashcat_ctx_t *hashcat_ctx)
@@ -5398,6 +5455,11 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
 
       if (rc_cuCtxSetCurrent == -1) return -1;
 
+      // bcrypt optimization?
+      //const int rc_cuCtxSetCacheConfig = hc_cuCtxSetCacheConfig (hashcat_ctx, CU_FUNC_CACHE_PREFER_SHARED);
+      //
+      //if (rc_cuCtxSetCacheConfig == -1) return -1;
+
       const bool has_bfe = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"bfe.u32 %0, 0, 0, 0;\" : \"=r\"(r)); }");
 
       device_param->has_bfe = has_bfe;