From 46f737c5afd371777f434411258101ff61b63723 Mon Sep 17 00:00:00 2001
From: Jens Steube
Date: Fri, 10 May 2019 13:22:26 +0200
Subject: [PATCH] Use real constant memory on CUDA

Kernel side (CUDA only): when a kernel is built with ATTACK_EXEC == 11,
inc_platform.cl declares a module-level constant array generic_constant
(8192 x u32, 32k) and defines rules_buf (ATTACK_KERN == 0) or
words_buf_s / words_buf_r (ATTACK_KERN == 3) as casts of that array.
The matching kernel parameters are renamed with a g_ prefix on CUDA so
the macros do not collide with the parameter names; in the remaining
cases the macros simply forward to the g_ parameters.

Host side: hc_cuModuleGetGlobal() wraps cuModuleGetGlobal() with the
same error reporting as the other hc_cu* wrappers. When attack_exec is
ATTACK_EXEC_INSIDE_KERNEL, backend_session_begin() resolves
cuda_d_rules_c, cuda_d_bfs_c and cuda_d_tm_c to the address of
generic_constant instead of allocating them; otherwise they are still
allocated with hc_cuMemAlloc(). ATTACK_EXEC and ATTACK_KERN are passed
to the kernel build as -D options.

backend_session_destroy() releases these three buffers only when they
were obtained from hc_cuMemAlloc() (attack_exec is not
ATTACK_EXEC_INSIDE_KERNEL). An address returned by cuModuleGetGlobal
is not a cuMemAlloc allocation and is not passed to cuMemFree, while
the cuMemAlloc path no longer leaks.
---
 OpenCL/inc_common.h    | 55 +++++++++++++++++++++++++++++++++--
 OpenCL/inc_platform.cl | 20 +++++++++++++
 src/backend.c          | 65 ++++++++++++++++++++++++++++++++++++------
 3 files changed, 130 insertions(+), 10 deletions(-)

diff --git a/OpenCL/inc_common.h b/OpenCL/inc_common.h
index bdcb16d38..e323d0e0a 100644
--- a/OpenCL/inc_common.h
+++ b/OpenCL/inc_common.h
@@ -26,6 +26,44 @@
  * - P19: Type of the esalt_bufs structure with additional data, or void.
  */
 
+#ifdef IS_CUDA
+#define KERN_ATTR(p2,p4,p5,p6,p19) \
+  MAYBE_UNUSED GLOBAL_AS pw_t *pws, \
+  MAYBE_UNUSED p2 const kernel_rule_t *g_rules_buf, \
+  MAYBE_UNUSED GLOBAL_AS const pw_t *combs_buf, \
+  MAYBE_UNUSED p4, \
+  MAYBE_UNUSED GLOBAL_AS p5 *tmps, \
+  MAYBE_UNUSED GLOBAL_AS p6 *hooks, \
+  MAYBE_UNUSED GLOBAL_AS const u32 *bitmaps_buf_s1_a, \
+  MAYBE_UNUSED GLOBAL_AS const u32 *bitmaps_buf_s1_b, \
+  MAYBE_UNUSED GLOBAL_AS const u32 *bitmaps_buf_s1_c, \
+  MAYBE_UNUSED GLOBAL_AS const u32 *bitmaps_buf_s1_d, \
+  MAYBE_UNUSED GLOBAL_AS const u32 *bitmaps_buf_s2_a, \
+  MAYBE_UNUSED GLOBAL_AS const u32 *bitmaps_buf_s2_b, \
+  MAYBE_UNUSED GLOBAL_AS const u32 *bitmaps_buf_s2_c, \
+  MAYBE_UNUSED GLOBAL_AS const u32 *bitmaps_buf_s2_d, \
+  MAYBE_UNUSED GLOBAL_AS plain_t *plains_buf, \
+  MAYBE_UNUSED GLOBAL_AS const digest_t *digests_buf, \
+  MAYBE_UNUSED GLOBAL_AS u32 *hashes_shown, \
+  MAYBE_UNUSED GLOBAL_AS const salt_t *salt_bufs, \
+  MAYBE_UNUSED GLOBAL_AS const p19 *esalt_bufs, \
+  MAYBE_UNUSED GLOBAL_AS u32 *d_return_buf, \
+  MAYBE_UNUSED GLOBAL_AS void *d_extra0_buf, \
+  MAYBE_UNUSED GLOBAL_AS void *d_extra1_buf, \
+  MAYBE_UNUSED GLOBAL_AS void *d_extra2_buf, \
+  MAYBE_UNUSED GLOBAL_AS void *d_extra3_buf, \
+  MAYBE_UNUSED const u32 bitmap_mask, \
+  MAYBE_UNUSED const u32 bitmap_shift1, \
+  MAYBE_UNUSED const u32 bitmap_shift2, \
+  MAYBE_UNUSED const u32 salt_pos, \
+  MAYBE_UNUSED const u32 loop_pos, \
+  MAYBE_UNUSED const u32 loop_cnt, \
+  MAYBE_UNUSED const u32 il_cnt, \
+  MAYBE_UNUSED const u32 digests_cnt, \
+  MAYBE_UNUSED const u32 digests_offset, \
+  MAYBE_UNUSED const u32 combs_mode, \
+  MAYBE_UNUSED const u64 gid_max
+#else
 #define KERN_ATTR(p2,p4,p5,p6,p19) \
   MAYBE_UNUSED GLOBAL_AS pw_t *pws, \
   MAYBE_UNUSED p2 const kernel_rule_t *rules_buf, \
@@ -62,7 +100,7 @@
   MAYBE_UNUSED const u32 digests_offset, \
   MAYBE_UNUSED const u32 combs_mode, \
   MAYBE_UNUSED const u64 gid_max
-
+#endif
 /*
  * Shortcut macros for usage in the actual kernels
  *
@@ -71,8 +109,20 @@
  * do not use rules or tmps, etc.
  */
 
+#ifdef IS_CUDA
+#define KERN_ATTR_BASIC() KERN_ATTR (GLOBAL_AS, GLOBAL_AS const bf_t *bfs_buf, void, void, void)
+#define KERN_ATTR_BITSLICE() KERN_ATTR (GLOBAL_AS, CONSTANT_AS const bs_word_t *g_words_buf_s, void, void, void)
+#define KERN_ATTR_ESALT(e) KERN_ATTR (GLOBAL_AS, GLOBAL_AS const bf_t *bfs_buf, void, void, e)
+#define KERN_ATTR_RULES() KERN_ATTR (CONSTANT_AS, GLOBAL_AS const bf_t *bfs_buf, void, void, void)
+#define KERN_ATTR_RULES_ESALT(e) KERN_ATTR (CONSTANT_AS, GLOBAL_AS const bf_t *bfs_buf, void, void, e)
+#define KERN_ATTR_TMPS(t) KERN_ATTR (GLOBAL_AS, GLOBAL_AS const bf_t *bfs_buf, t, void, void)
+#define KERN_ATTR_TMPS_ESALT(t,e) KERN_ATTR (GLOBAL_AS, GLOBAL_AS const bf_t *bfs_buf, t, void, e)
+#define KERN_ATTR_TMPS_HOOKS(t,h) KERN_ATTR (GLOBAL_AS, GLOBAL_AS const bf_t *bfs_buf, t, h, void)
+#define KERN_ATTR_VECTOR() KERN_ATTR (GLOBAL_AS, CONSTANT_AS const u32x *g_words_buf_r, void, void, void)
+#define KERN_ATTR_VECTOR_ESALT(e) KERN_ATTR (GLOBAL_AS, CONSTANT_AS const u32x *g_words_buf_r, void, void, e)
+#else
 #define KERN_ATTR_BASIC() KERN_ATTR (GLOBAL_AS, GLOBAL_AS const bf_t *bfs_buf, void, void, void)
-#define KERN_ATTR_BITSLICE() KERN_ATTR (GLOBAL_AS, CONSTANT_AS const bs_word_t *words_buf_r, void, void, void)
+#define KERN_ATTR_BITSLICE() KERN_ATTR (GLOBAL_AS, CONSTANT_AS const bs_word_t *words_buf_s, void, void, void)
 #define KERN_ATTR_ESALT(e) KERN_ATTR (GLOBAL_AS, GLOBAL_AS const bf_t *bfs_buf, void, void, e)
 #define KERN_ATTR_RULES() KERN_ATTR (CONSTANT_AS, GLOBAL_AS const bf_t *bfs_buf, void, void, void)
 #define KERN_ATTR_RULES_ESALT(e) KERN_ATTR (CONSTANT_AS, GLOBAL_AS const bf_t *bfs_buf, void, void, e)
@@ -81,6 +131,7 @@
 #define KERN_ATTR_TMPS_HOOKS(t,h) KERN_ATTR (GLOBAL_AS, GLOBAL_AS const bf_t *bfs_buf, t, h, void)
 #define KERN_ATTR_VECTOR() KERN_ATTR (GLOBAL_AS, CONSTANT_AS const u32x *words_buf_r, void, void, void)
 #define KERN_ATTR_VECTOR_ESALT(e) KERN_ATTR (GLOBAL_AS, CONSTANT_AS const u32x *words_buf_r, void, void, e)
+#endif
 
 // union based packing
 
diff --git a/OpenCL/inc_platform.cl b/OpenCL/inc_platform.cl
index e5924dd13..3606804b4 100644
--- a/OpenCL/inc_platform.cl
+++ b/OpenCL/inc_platform.cl
@@ -13,6 +13,26 @@
 
 #ifdef IS_CUDA
 
+#if ATTACK_EXEC == 11
+
+CONSTANT_VK u32 generic_constant[8192]; // 32k
+
+#if ATTACK_KERN == 0
+#define rules_buf ((const kernel_rule_t *) generic_constant)
+#define words_buf_s g_words_buf_s
+#define words_buf_r g_words_buf_r
+#elif ATTACK_KERN == 1
+#define rules_buf g_rules_buf
+#define words_buf_s g_words_buf_s
+#define words_buf_r g_words_buf_r
+#elif ATTACK_KERN == 3
+#define rules_buf g_rules_buf
+#define words_buf_s ((const bs_word_t *) generic_constant)
+#define words_buf_r ((const u32x *) generic_constant)
+#endif
+
+#endif
+
 DECLSPEC u32 atomic_dec (u32 *p)
 {
   return atomicSub (p, 1);
diff --git a/src/backend.c b/src/backend.c
index 9857dff0c..ddbaf6456 100644
--- a/src/backend.c
+++ b/src/backend.c
@@ -1490,6 +1490,34 @@ int hc_cuModuleGetFunction (hashcat_ctx_t *hashcat_ctx, CUfunction *hfunc, CUmod
   return 0;
 }
 
+int hc_cuModuleGetGlobal (hashcat_ctx_t *hashcat_ctx, CUdeviceptr *dptr, size_t *bytes, CUmodule hmod, const char *name)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  CUDA_PTR *cuda = backend_ctx->cuda;
+
+  const CUresult CU_err = cuda->cuModuleGetGlobal (dptr, bytes, hmod, name);
+
+  if (CU_err != CUDA_SUCCESS)
+  {
+    const char *pStr = NULL;
+
+    if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS)
+    {
+      event_log_error (hashcat_ctx, "cuModuleGetGlobal(): %s", pStr);
+    }
+    else
+    {
+      event_log_error (hashcat_ctx, "cuModuleGetGlobal(): %d", CU_err);
+    }
+
+    return -1;
+  }
+
+  return 0;
+}
+
+
 int hc_cuFuncGetAttribute (hashcat_ctx_t *hashcat_ctx, int *pi, CUfunction_attribute attrib, CUfunction hfunc)
 {
   backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
@@ -7346,9 +7374,9 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
     // we don't have sm_* on vendors not NV but it doesn't matter
 
     #if defined (DEBUG)
-    build_options_len += snprintf (build_options_buf + build_options_len, build_options_sz - build_options_len, "-D LOCAL_MEM_TYPE=%u -D VENDOR_ID=%u -D CUDA_ARCH=%u -D HAS_VPERM=%u -D HAS_VADD3=%u -D HAS_VBFE=%u -D HAS_BFE=%u -D HAS_LOP3=%u -D HAS_MOV64=%u -D HAS_PRMT=%u -D VECT_SIZE=%u -D DEVICE_TYPE=%u -D DGST_R0=%u -D DGST_R1=%u -D DGST_R2=%u -D DGST_R3=%u -D DGST_ELEM=%u -D KERN_TYPE=%u -D _unroll ", device_param->device_local_mem_type, device_param->opencl_platform_vendor_id, (device_param->sm_major * 100) + (device_param->sm_minor * 10), device_param->has_vperm, device_param->has_vadd3, device_param->has_vbfe, device_param->has_bfe, device_param->has_lop3, device_param->has_mov64, device_param->has_prmt, device_param->vector_width, (u32) device_param->opencl_device_type, hashconfig->dgst_pos0, hashconfig->dgst_pos1, hashconfig->dgst_pos2, hashconfig->dgst_pos3, hashconfig->dgst_size / 4, kern_type);
+    build_options_len += snprintf (build_options_buf + build_options_len, build_options_sz - build_options_len, "-D LOCAL_MEM_TYPE=%u -D VENDOR_ID=%u -D CUDA_ARCH=%u -D HAS_VPERM=%u -D HAS_VADD3=%u -D HAS_VBFE=%u -D HAS_BFE=%u -D HAS_LOP3=%u -D HAS_MOV64=%u -D HAS_PRMT=%u -D VECT_SIZE=%u -D DEVICE_TYPE=%u -D DGST_R0=%u -D DGST_R1=%u -D DGST_R2=%u -D DGST_R3=%u -D DGST_ELEM=%u -D KERN_TYPE=%u -D ATTACK_EXEC=%u -D ATTACK_KERN=%u -D _unroll ", device_param->device_local_mem_type, device_param->opencl_platform_vendor_id, (device_param->sm_major * 100) + (device_param->sm_minor * 10), device_param->has_vperm, device_param->has_vadd3, device_param->has_vbfe, device_param->has_bfe, device_param->has_lop3, device_param->has_mov64, device_param->has_prmt, device_param->vector_width, (u32) device_param->opencl_device_type, hashconfig->dgst_pos0, hashconfig->dgst_pos1, hashconfig->dgst_pos2, hashconfig->dgst_pos3, hashconfig->dgst_size / 4, kern_type, hashconfig->attack_exec, user_options_extra->attack_kern);
     #else
-    build_options_len += snprintf (build_options_buf + build_options_len, build_options_sz - build_options_len, "-D LOCAL_MEM_TYPE=%u -D VENDOR_ID=%u -D CUDA_ARCH=%u -D HAS_VPERM=%u -D HAS_VADD3=%u -D HAS_VBFE=%u -D HAS_BFE=%u -D HAS_LOP3=%u -D HAS_MOV64=%u -D HAS_PRMT=%u -D VECT_SIZE=%u -D DEVICE_TYPE=%u -D DGST_R0=%u -D DGST_R1=%u -D DGST_R2=%u -D DGST_R3=%u -D DGST_ELEM=%u -D KERN_TYPE=%u -D _unroll -w ", device_param->device_local_mem_type, device_param->opencl_platform_vendor_id, (device_param->sm_major * 100) + (device_param->sm_minor * 10), device_param->has_vperm, device_param->has_vadd3, device_param->has_vbfe, device_param->has_bfe, device_param->has_lop3, device_param->has_mov64, device_param->has_prmt, device_param->vector_width, (u32) device_param->opencl_device_type, hashconfig->dgst_pos0, hashconfig->dgst_pos1, hashconfig->dgst_pos2, hashconfig->dgst_pos3, hashconfig->dgst_size / 4, kern_type);
+    build_options_len += snprintf (build_options_buf + build_options_len, build_options_sz - build_options_len, "-D LOCAL_MEM_TYPE=%u -D VENDOR_ID=%u -D CUDA_ARCH=%u -D HAS_VPERM=%u -D HAS_VADD3=%u -D HAS_VBFE=%u -D HAS_BFE=%u -D HAS_LOP3=%u -D HAS_MOV64=%u -D HAS_PRMT=%u -D VECT_SIZE=%u -D DEVICE_TYPE=%u -D DGST_R0=%u -D DGST_R1=%u -D DGST_R2=%u -D DGST_R3=%u -D DGST_ELEM=%u -D KERN_TYPE=%u -D ATTACK_EXEC=%u -D ATTACK_KERN=%u -D _unroll -w ", device_param->device_local_mem_type, device_param->opencl_platform_vendor_id, (device_param->sm_major * 100) + (device_param->sm_minor * 10), device_param->has_vperm, device_param->has_vadd3, device_param->has_vbfe, device_param->has_bfe, device_param->has_lop3, device_param->has_mov64, device_param->has_prmt, device_param->vector_width, (u32) device_param->opencl_device_type, hashconfig->dgst_pos0, hashconfig->dgst_pos1, hashconfig->dgst_pos2, hashconfig->dgst_pos3, hashconfig->dgst_size / 4, kern_type, hashconfig->attack_exec, user_options_extra->attack_kern);
     #endif
 
     build_options_buf[build_options_len] = 0;
@@ -8276,7 +8304,17 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
         if (user_options_extra->attack_kern == ATTACK_KERN_STRAIGHT)
         {
           CU_rc = hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_rules, size_rules); if (CU_rc == -1) return -1;
-          CU_rc = hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_rules_c, size_rules_c); if (CU_rc == -1) return -1;
+
+          if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
+          {
+            size_t dummy;
+
+            CU_rc = hc_cuModuleGetGlobal (hashcat_ctx, &device_param->cuda_d_rules_c, &dummy, device_param->cuda_module, "generic_constant"); if (CU_rc == -1) return -1;
+          }
+          else
+          {
+            CU_rc = hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_rules_c, size_rules_c); if (CU_rc == -1) return -1;
+          }
 
           CU_rc = hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_rules, straight_ctx->kernel_rules_buf, size_rules); if (CU_rc == -1) return -1;
         }
@@ -8290,10 +8328,21 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
         else if (user_options_extra->attack_kern == ATTACK_KERN_BF)
         {
           CU_rc = hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_bfs, size_bfs); if (CU_rc == -1) return -1;
-          CU_rc = hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_bfs_c, size_bfs); if (CU_rc == -1) return -1;
-          CU_rc = hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_tm_c, size_tm); if (CU_rc == -1) return -1;
           CU_rc = hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_root_css_buf, size_root_css); if (CU_rc == -1) return -1;
           CU_rc = hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_markov_css_buf, size_markov_css); if (CU_rc == -1) return -1;
+
+          if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
+          {
+            size_t dummy;
+
+            CU_rc = hc_cuModuleGetGlobal (hashcat_ctx, &device_param->cuda_d_bfs_c, &dummy, device_param->cuda_module, "generic_constant"); if (CU_rc == -1) return -1;
+            CU_rc = hc_cuModuleGetGlobal (hashcat_ctx, &device_param->cuda_d_tm_c, &dummy, device_param->cuda_module, "generic_constant"); if (CU_rc == -1) return -1;
+          }
+          else
+          {
+            CU_rc = hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_bfs_c, size_bfs); if (CU_rc == -1) return -1;
+            CU_rc = hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_tm_c, size_tm); if (CU_rc == -1) return -1;
+          }
         }
       }
 
@@ -10665,11 +10714,11 @@ void backend_session_destroy (hashcat_ctx_t *hashcat_ctx)
       if (device_param->cuda_d_pws_comp_buf) hc_cuMemFree (hashcat_ctx, device_param->cuda_d_pws_comp_buf);
       if (device_param->cuda_d_pws_idx) hc_cuMemFree (hashcat_ctx, device_param->cuda_d_pws_idx);
       if (device_param->cuda_d_rules) hc_cuMemFree (hashcat_ctx, device_param->cuda_d_rules);
-      if (device_param->cuda_d_rules_c) hc_cuMemFree (hashcat_ctx, device_param->cuda_d_rules_c);
+      if (device_param->cuda_d_rules_c && hashcat_ctx->hashconfig->attack_exec != ATTACK_EXEC_INSIDE_KERNEL) hc_cuMemFree (hashcat_ctx, device_param->cuda_d_rules_c); // skip when it is the module's generic_constant
       if (device_param->cuda_d_combs) hc_cuMemFree (hashcat_ctx, device_param->cuda_d_combs);
       if (device_param->cuda_d_combs_c) hc_cuMemFree (hashcat_ctx, device_param->cuda_d_combs_c);
       if (device_param->cuda_d_bfs) hc_cuMemFree (hashcat_ctx, device_param->cuda_d_bfs);
-      if (device_param->cuda_d_bfs_c) hc_cuMemFree (hashcat_ctx, device_param->cuda_d_bfs_c);
+      if (device_param->cuda_d_bfs_c && hashcat_ctx->hashconfig->attack_exec != ATTACK_EXEC_INSIDE_KERNEL) hc_cuMemFree (hashcat_ctx, device_param->cuda_d_bfs_c); // skip when it is the module's generic_constant
       if (device_param->cuda_d_bitmap_s1_a) hc_cuMemFree (hashcat_ctx, device_param->cuda_d_bitmap_s1_a);
       if (device_param->cuda_d_bitmap_s1_b) hc_cuMemFree (hashcat_ctx, device_param->cuda_d_bitmap_s1_b);
       if (device_param->cuda_d_bitmap_s1_c) hc_cuMemFree (hashcat_ctx, device_param->cuda_d_bitmap_s1_c);
@@ -10692,7 +10741,7 @@ void backend_session_destroy (hashcat_ctx_t *hashcat_ctx)
       if (device_param->cuda_d_extra3_buf) hc_cuMemFree (hashcat_ctx, device_param->cuda_d_extra3_buf);
       if (device_param->cuda_d_root_css_buf) hc_cuMemFree (hashcat_ctx, device_param->cuda_d_root_css_buf);
       if (device_param->cuda_d_markov_css_buf) hc_cuMemFree (hashcat_ctx, device_param->cuda_d_markov_css_buf);
-      if (device_param->cuda_d_tm_c) hc_cuMemFree (hashcat_ctx, device_param->cuda_d_tm_c);
+      if (device_param->cuda_d_tm_c && hashcat_ctx->hashconfig->attack_exec != ATTACK_EXEC_INSIDE_KERNEL) hc_cuMemFree (hashcat_ctx, device_param->cuda_d_tm_c); // skip when it is the module's generic_constant
       if (device_param->cuda_d_st_digests_buf) hc_cuMemFree (hashcat_ctx, device_param->cuda_d_st_digests_buf);
       if (device_param->cuda_d_st_salts_buf) hc_cuMemFree (hashcat_ctx, device_param->cuda_d_st_salts_buf);
       if (device_param->cuda_d_st_esalts_buf) hc_cuMemFree (hashcat_ctx, device_param->cuda_d_st_esalts_buf);