SCRYPT Kernels: Add more optimized values for some new NV/AMD GPUs and new semi-automated derivation process description
Blowfish Kernels: Backport optimizations reducing bank conflicts from bcrypt to Password Safe v2 and Open Document Format (ODF) 1.1
commit 25f1c12e3c (parent 532a154542)
@ -19,6 +19,7 @@
##

- AMD GPUs: Add inline assembly code for md5crypt/sha256crypt, PDF 1.7, 7-Zip, RAR3, Samsung Android and Windows Phone 8+
- AMD GPUs: On Apple OpenCL platform, we ask for the preferred kernel thread size rather than hard-coding 32
- Blake Kernels: Optimize BLAKE2B_ROUND() 64-bit rotates, giving a 5% performance increase
- Blowfish Kernels: Backport optimizations reducing bank conflicts from bcrypt to Password Safe v2 and Open Document Format (ODF) 1.1
- Brain Session: Add hashconfig-specific opti_type and opts_type parameters to the hashcat session computation to cover features like -O and -M

@ -31,7 +32,10 @@
##

- ADL: Updated support for the AMD Display Library to 15.0, updated datatypes and added support for OverDrive 7 and 8 based GPUs
- AMD Driver: Updated requirement for the AMD Linux driver to ROCm 4.4 or later because of the new HIP Interface
- AMD Driver: Updated requirement for the AMD Windows driver to Adrenalin 21.2.1 or later because of the new ADL library
- Commandline: Throw an error if the separator character given by the user with the -p option is not exactly 1 byte
- ECC secp256k1: Removed the inline assembly code for AMD GPUs because the latest JIT compilers optimize it with the same efficiency
- HIP Kernels: Got rid of the hip/hip_runtime.h dependency to enable easier integration of the HIP backend on Windows
- Kernel Cache: Add the kernel thread count to the hash computation that is later used in the kernel cache filename
- SCRYPT Kernels: Add more optimized values for some new NV/AMD GPUs
@ -279,7 +279,14 @@ GeForce_RTX_3090 ALIAS_nv_sm50_or_higher
##

Device_738c ALIAS_AMD_MI100

AMD_Radeon_(TM)_RX_480_Graphics ALIAS_AMD_RX480

Vega_10_XL/XT_[Radeon_RX_Vega_56/64] ALIAS_AMD_Vega64
AMD_Radeon_Vega_64 ALIAS_AMD_Vega64

Device_73bf ALIAS_AMD_RX6900XT
AMD_Radeon_RX_6900_XT ALIAS_AMD_RX6900XT

#############
## ENTRIES ##
@ -486,22 +493,41 @@ DEVICE_TYPE_GPU * 14500 1 A
##
## Find the ideal -n value, then store it here along with the proper compute device name.
## Formatting guidelines are available at the top of this document.
##
## -------------------------------------------------
##
## You can also skip the theoretical derivation and semi-automate the process on the real hardware (I prefer this approach); a consolidated sketch of steps 2-4 follows this list:
##
## 1. For example, to find the value for 8900, first create a valid hash for 8900 as follows:
##
## $ ./hashcat --example-hashes -m 8900 | grep Example.Hash | grep -v Format | cut -b 25- > tmp.hash.8900
##
## 2. Now let it iterate through all -n values up to a certain point. Here I use 200, but in general the upper bound should be at least twice the device's multiprocessor count. If in doubt, just leave it at 200; the run only takes a little longer.
##
## $ export i=1; while [ $i -ne 201 ]; do echo $i; ./hashcat --quiet tmp.hash.8900 --keep-guessing --self-test-disable --markov-disable --restore-disable --outfile-autohex-disable --wordlist-autohex-disable --potfile-disable --logfile-disable --hwmon-disable --status --status-timer 1 --runtime 28 --machine-readable --optimized-kernel-enable --workload-profile 3 --hash-type 8900 --attack-mode 3 ?b?b?b?b?b?b?b --backend-devices 1 --force -n $i; i=$(($i+1)); done | tee x
##
## 3. Determine the highest measured H/s speed, but do not simply take the single highest value. Instead, use the speed that looks most stable, usually one of the values near the beginning of that listing:
##
## $ grep "$(printf 'STATUS\t3')" x | cut -f4 -d$'\t' | sort -n | tail
##
## 4. To map the chosen speed back to its -n value, search for that speed in the "x" file and scroll up a little within the block where you found it; the -n value is the single number printed just before the start of that block. If several blocks show the same speed, choose the lowest -n.
##
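## As a further shortcut, steps 2 and 3 can be folded into one loop that prints each candidate -n next to the best speed it reached. This is only a convenience sketch reusing the exact hashcat flags from step 2 plus standard grep/cut/sort; adjust the hash file and --hash-type for other modes:
##
## $ for n in $(seq 1 200); do s=$(./hashcat --quiet tmp.hash.8900 --keep-guessing --self-test-disable --markov-disable --restore-disable --outfile-autohex-disable --wordlist-autohex-disable --potfile-disable --logfile-disable --hwmon-disable --status --status-timer 1 --runtime 28 --machine-readable --optimized-kernel-enable --workload-profile 3 --hash-type 8900 --attack-mode 3 ?b?b?b?b?b?b?b --backend-devices 1 --force -n $n | grep "$(printf 'STATUS\t3')" | cut -f4 -d$'\t' | sort -n | tail -1); echo "$n $s"; done | sort -k2,2n | tail
##
## As in step 4, prefer the lowest -n among the stable top results rather than a single outlier.
##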
## 4GB
GeForce_GTX_980 * 8900 1 28 A
GeForce_GTX_980 * 8900 1 29 A
GeForce_GTX_980 * 9300 1 128 A
GeForce_GTX_980 * 15700 1 28 A
GeForce_GTX_980 * 22700 1 28 A
GeForce_GTX_980 * 15700 1 24 A
GeForce_GTX_980 * 22700 1 29 A

## 8GB
GeForce_GTX_1080 * 8900 1 14 A
GeForce_GTX_1080 * 8900 1 15 A
GeForce_GTX_1080 * 9300 1 256 A
GeForce_GTX_1080 * 15700 1 14 A
GeForce_GTX_1080 * 22700 1 14 A
GeForce_GTX_1080 * 15700 1 28 A
GeForce_GTX_1080 * 22700 1 15 A

## 11GB
GeForce_RTX_2080_Ti * 8900 1 68 A
GeForce_RTX_2080_Ti * 9300 1 532 A
GeForce_RTX_2080_Ti * 9300 1 528 A
GeForce_RTX_2080_Ti * 15700 1 68 A
GeForce_RTX_2080_Ti * 22700 1 68 A

@ -509,7 +535,7 @@ GeForce_RTX_2080_Ti * 22700 1 68
GeForce_RTX_3060_Ti * 8900 1 51 A
GeForce_RTX_3060_Ti * 9300 1 256 A
GeForce_RTX_3060_Ti * 15700 1 11 A
GeForce_RTX_3060_Ti * 22700 1 43 A
GeForce_RTX_3060_Ti * 22700 1 51 A

## 8GB
GeForce_RTX_3070 * 8900 1 46 A
@ -517,26 +543,32 @@ GeForce_RTX_3070 * 9300 1 368
GeForce_RTX_3070 * 15700 1 22 A
GeForce_RTX_3070 * 22700 1 46 A

## 24GB
GeForce_RTX_3090 * 8900 1 82 A
GeForce_RTX_3090 * 9300 1 984 A
GeForce_RTX_3090 * 15700 1 82 A
GeForce_RTX_3090 * 22700 1 82 A

## 4GB
AMD_Radeon_(TM)_RX_480_Graphics * 8900 1 14 A
AMD_Radeon_(TM)_RX_480_Graphics * 9300 1 126 A
AMD_Radeon_(TM)_RX_480_Graphics * 15700 1 14 A
AMD_Radeon_(TM)_RX_480_Graphics * 22700 1 14 A
ALIAS_AMD_RX480 * 8900 1 15 A
ALIAS_AMD_RX480 * 9300 1 232 A
ALIAS_AMD_RX480 * 15700 1 58 A
ALIAS_AMD_RX480 * 22700 1 15 A

## 8GB
Vega_10_XL/XT_[Radeon_RX_Vega_56/64] * 8900 1 28 A
Vega_10_XL/XT_[Radeon_RX_Vega_56/64] * 9300 1 442 A
Vega_10_XL/XT_[Radeon_RX_Vega_56/64] * 15700 1 28 A
Vega_10_XL/XT_[Radeon_RX_Vega_56/64] * 22700 1 28 A
ALIAS_AMD_Vega64 * 8900 1 31 A
ALIAS_AMD_Vega64 * 9300 1 440 A
ALIAS_AMD_Vega64 * 15700 1 53 A
ALIAS_AMD_Vega64 * 22700 1 31 A

## 32GB, WF64
ALIAS_AMD_MI100 * 8900 1 76 A
ALIAS_AMD_MI100 * 9300 1 288 A
ALIAS_AMD_MI100 * 15700 1 76 A
ALIAS_AMD_MI100 * 22700 1 76 A
## 32GB
ALIAS_AMD_MI100 * 8900 1 79 A
ALIAS_AMD_MI100 * 9300 1 1000 A
ALIAS_AMD_MI100 * 15700 1 120 A
ALIAS_AMD_MI100 * 22700 1 79 A

## 16GB, WF32
ALIAS_AMD_RX6900XT * 8900 1 62 A
ALIAS_AMD_RX6900XT * 9300 1 704 A
ALIAS_AMD_RX6900XT * 15700 1 62 A
ALIAS_AMD_RX6900XT * 22700 1 62 A
## 16GB
ALIAS_AMD_RX6900XT * 8900 1 59 A
ALIAS_AMD_RX6900XT * 9300 1 720 A
ALIAS_AMD_RX6900XT * 15700 1 56 A
ALIAS_AMD_RX6900XT * 22700 1 59 A

@ -22,7 +22,8 @@ static const u64 KERN_TYPE = 9000;
static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE;
static const u64 OPTS_TYPE = OPTS_TYPE_PT_GENERATE_LE
                           | OPTS_TYPE_BINARY_HASHFILE
                           | OPTS_TYPE_AUTODETECT_DISABLE;
                           | OPTS_TYPE_AUTODETECT_DISABLE
                           | OPTS_TYPE_DYNAMIC_SHARED;
static const u32 SALT_TYPE = SALT_TYPE_EMBEDDED;
static const char *ST_PASS = "hashcat";
static const char *ST_HASH = "0a3f352686e5eb5be173e668a4fff5cd5df420927e1da2d5d4052340160637e3e6a5a92841a188ed240e13b919f3d91694bd4c0acba79271e9c08a83ea5ad387cbb74d5884066a1cb5a8caa80d847079168f84823847c631dbe3a834f1bc496acfebac3bff1608bf1c857717f8f428e07b5e2cb12aaeddfa83d7dcb6d840234d08b84f8ca6c6e562af73eea13148f7902bcaf0220d3e36eeeff1d37283dc421483a2791182614ebb";
@ -75,16 +76,25 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY
{
  char *jit_build_options = NULL;

  // this mode heavily depends on the available shared memory size
  // note the kernel needs some special code changes in order to make use of the post-48k shared memory region
  // we need to set some macros

  bool use_dynamic = false;

  if (device_param->is_cuda == true)
  {
    use_dynamic = true;
  }

  // this uses a nice feedback effect:
  // based on device_local_mem_size, the reqd_work_group_size in the kernel is set to some value,
  // which is then read back on the OpenCL host side through the kernel_preferred_wgs_multiple1/2/3 result.
  // therefore we do not need to set module_kernel_threads_min/max, except for CPU, where the thread count is fixed to 1.

  u32 fixed_local_size = 0;

  if (device_param->opencl_device_type & CL_DEVICE_TYPE_CPU)
  {
    fixed_local_size = 1;
    hc_asprintf (&jit_build_options, "-D FIXED_LOCAL_SIZE=%u", 1);
  }
  else
  {
@ -100,29 +110,58 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY

      if (device_param->is_opencl == true)
      {
        overhead = 4;
        overhead = 1;
      }
    }

    if (user_options->kernel_threads_chgd == true)
    {
      fixed_local_size = user_options->kernel_threads;
      u32 fixed_local_size = user_options->kernel_threads;

      // otherwise out-of-bound reads

      if ((fixed_local_size * 4096) > (device_param->device_local_mem_size - overhead))
      if (use_dynamic == true)
      {
        fixed_local_size = (device_param->device_local_mem_size - overhead) / 4096;
        if ((fixed_local_size * 4096) > device_param->kernel_dynamic_local_mem_size_memset)
        {
          // otherwise out-of-bound reads

          fixed_local_size = device_param->kernel_dynamic_local_mem_size_memset / 4096;
        }

        hc_asprintf (&jit_build_options, "-D FIXED_LOCAL_SIZE=%u -D DYNAMIC_LOCAL", fixed_local_size);
      }
      else
      {
        if ((fixed_local_size * 4096) > (device_param->device_local_mem_size - overhead))
        {
          // otherwise out-of-bound reads

          fixed_local_size = (device_param->device_local_mem_size - overhead) / 4096;
        }

        hc_asprintf (&jit_build_options, "-D FIXED_LOCAL_SIZE=%u", fixed_local_size);
      }
    }
    else
    {
      fixed_local_size = (device_param->device_local_mem_size - overhead) / 4096;
      if (use_dynamic == true)
      {
        // using kernel_dynamic_local_mem_size_memset is a bit hackish.
        // we had to brute-force this value out of an already loaded CUDA function.
        // there's no official way to query for this value.

        const u32 fixed_local_size = device_param->kernel_dynamic_local_mem_size_memset / 4096;

        hc_asprintf (&jit_build_options, "-D FIXED_LOCAL_SIZE=%u -D DYNAMIC_LOCAL", fixed_local_size);
      }
      else
      {
        const u32 fixed_local_size = (device_param->device_local_mem_size - overhead) / 4096;

        hc_asprintf (&jit_build_options, "-D FIXED_LOCAL_SIZE=%u", fixed_local_size);
      }
    }
  }

  hc_asprintf (&jit_build_options, "-D FIXED_LOCAL_SIZE=%u", fixed_local_size);

  return jit_build_options;
}
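For readers piecing together the interleaved old/new lines above: the change clamps the thread count against kernel_dynamic_local_mem_size_memset (and emits -D DYNAMIC_LOCAL) when the CUDA dynamic shared memory path is taken, and against device_local_mem_size - overhead otherwise, always at 4096 bytes of shared memory per thread (presumably the per-thread Blowfish S-box block). The standalone program below only models that clamping arithmetic; the helper name clamp_fixed_local_size and the 48 KiB / 96 KiB figures are illustrative assumptions, not hashcat source.

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

/* Standalone model (not hashcat code) of the FIXED_LOCAL_SIZE clamping in
   module_jit_build_options: each kernel thread needs 4096 bytes of shared
   memory, so the thread count is bounded by the shared memory actually usable. */

static uint32_t clamp_fixed_local_size (uint32_t requested_threads, /* user-requested kernel_threads, or 0 to derive from memory */
                                        uint64_t local_mem_size,    /* device_local_mem_size or kernel_dynamic_local_mem_size_memset */
                                        uint64_t overhead)          /* bytes reserved by the runtime; 0 on the DYNAMIC_LOCAL path */
{
  const uint64_t usable = local_mem_size - overhead;
  const uint64_t limit  = usable / 4096;               /* otherwise out-of-bound reads */

  if (requested_threads == 0 || requested_threads > limit)
  {
    return (uint32_t) limit;
  }

  return requested_threads;
}

int main (void)
{
  /* OpenCL path after this commit: 48 KiB static shared memory, 1 byte overhead -> 11 threads */
  printf ("%" PRIu32 "\n", clamp_fixed_local_size (32, 48 * 1024, 1));

  /* hypothetical CUDA DYNAMIC_LOCAL path: 96 KiB dynamic shared memory -> 24 threads */
  printf ("%" PRIu32 "\n", clamp_fixed_local_size (32, 96 * 1024, 0));

  /* user did not request a thread count: use everything that fits -> 12 threads at 48 KiB */
  printf ("%" PRIu32 "\n", clamp_fixed_local_size (0, 48 * 1024, 0));

  return 0;
}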