From a983957b217aa217edf821785b7727096e04048d Mon Sep 17 00:00:00 2001 From: Gabriele Gristina Date: Tue, 9 May 2023 18:52:20 +0200 Subject: [PATCH] Dependencies: Updated LZMA SDK to 22.01 --- deps/LZMA-SDK/C/7zTypes.h | 20 +- deps/LZMA-SDK/C/7zVersion.h | 10 +- deps/LZMA-SDK/C/7zip_gcc_c.mak | 62 +- deps/LZMA-SDK/C/Aes.c | 10 +- deps/LZMA-SDK/C/Alloc.c | 10 +- deps/LZMA-SDK/C/Alloc.h | 9 +- deps/LZMA-SDK/C/CpuArch.c | 68 +- deps/LZMA-SDK/C/CpuArch.h | 44 +- deps/LZMA-SDK/C/DllSecur.c | 16 +- deps/LZMA-SDK/C/LzFind.c | 1001 ++++++++++++++++------ deps/LZMA-SDK/C/LzFind.h | 41 +- deps/LZMA-SDK/C/LzFindMt.c | 864 ++++++++++++------- deps/LZMA-SDK/C/LzFindMt.h | 36 +- deps/LZMA-SDK/C/LzFindOpt.c | 578 +++++++++++++ deps/LZMA-SDK/C/LzmaEnc.c | 351 +++++--- deps/LZMA-SDK/C/MtCoder.c | 13 +- deps/LZMA-SDK/C/MtDec.c | 8 +- deps/LZMA-SDK/C/Threads.c | 48 +- deps/LZMA-SDK/C/Threads.h | 45 +- deps/LZMA-SDK/C/Util/Lzma/LzmaUtil.c | 7 +- deps/LZMA-SDK/C/Util/Lzma/LzmaUtil.dsp | 4 + deps/LZMA-SDK/C/Util/Lzma/makefile | 2 + deps/LZMA-SDK/C/Util/Lzma/makefile.gcc | 2 + deps/LZMA-SDK/C/Util/LzmaLib/LzmaLib.dsp | 4 + deps/LZMA-SDK/C/Util/LzmaLib/makefile | 2 + deps/LZMA-SDK/C/XzDec.c | 5 +- deps/LZMA-SDK/C/XzIn.c | 5 +- deps/LZMA-SDK/C/var_clang_x64.mak | 1 - deps/LZMA-SDK/C/var_clang_x86.mak | 1 - deps/LZMA-SDK/C/var_gcc_x86.mak | 1 - deps/LZMA-SDK/C/warn_gcc.mak | 2 - deps/LZMA-SDK/DOC/lzma-history.txt | 36 + deps/LZMA-SDK/DOC/lzma-sdk.txt | 65 +- docs/changes.txt | 1 + 34 files changed, 2532 insertions(+), 840 deletions(-) create mode 100644 deps/LZMA-SDK/C/LzFindOpt.c diff --git a/deps/LZMA-SDK/C/7zTypes.h b/deps/LZMA-SDK/C/7zTypes.h index 497b14506..763d3a9b6 100644 --- a/deps/LZMA-SDK/C/7zTypes.h +++ b/deps/LZMA-SDK/C/7zTypes.h @@ -1,5 +1,5 @@ /* 7zTypes.h -- Basic types -2021-04-25 : Igor Pavlov : Public domain */ +2022-04-01 : Igor Pavlov : Public domain */ #ifndef __7Z_TYPES_H #define __7Z_TYPES_H @@ -62,6 +62,8 @@ typedef int SRes; typedef unsigned WRes; #define MY_SRes_HRESULT_FROM_WRes(x) HRESULT_FROM_WIN32(x) +// #define MY_HRES_ERROR__INTERNAL_ERROR MY_SRes_HRESULT_FROM_WRes(ERROR_INTERNAL_ERROR) + #else // _WIN32 // #define ENV_HAVE_LSTAT @@ -95,6 +97,7 @@ typedef int WRes; #define ERROR_DIRECTORY 267L #define ERROR_TOO_MANY_POSTS 298L +#define ERROR_INTERNAL_ERROR 1359L #define ERROR_INVALID_REPARSE_DATA 4392L #define ERROR_REPARSE_TAG_INVALID 4393L #define ERROR_REPARSE_TAG_MISMATCH 4394L @@ -102,6 +105,7 @@ typedef int WRes; // we use errno equivalents for some WIN32 errors: +#define ERROR_INVALID_PARAMETER EINVAL #define ERROR_INVALID_FUNCTION EINVAL #define ERROR_ALREADY_EXISTS EEXIST #define ERROR_FILE_EXISTS EEXIST @@ -129,10 +133,6 @@ typedef int WRes; #define MY__E_ERROR_NEGATIVE_SEEK MY_HRESULT_FROM_errno_CONST_ERROR(EINVAL) */ -// gcc / clang : (sizeof(long) == sizeof(void*)) in 32/64 bits -typedef long INT_PTR; -typedef unsigned long UINT_PTR; - #define TEXT(quote) quote #define FILE_ATTRIBUTE_READONLY 0x0001 @@ -206,6 +206,8 @@ typedef size_t SIZE_T; #endif // _WIN32 +#define MY_HRES_ERROR__INTERNAL_ERROR ((HRESULT)0x8007054FL) + #ifdef _SZ_NO_INT_64 @@ -514,6 +516,14 @@ struct ISzAlloc #endif +#define k_PropVar_TimePrec_0 0 +#define k_PropVar_TimePrec_Unix 1 +#define k_PropVar_TimePrec_DOS 2 +#define k_PropVar_TimePrec_HighPrec 3 +#define k_PropVar_TimePrec_Base 16 +#define k_PropVar_TimePrec_100ns (k_PropVar_TimePrec_Base + 7) +#define k_PropVar_TimePrec_1ns (k_PropVar_TimePrec_Base + 9) + EXTERN_C_END #endif diff --git a/deps/LZMA-SDK/C/7zVersion.h b/deps/LZMA-SDK/C/7zVersion.h index 0fe636abc..fa9e6fc53 100644 --- a/deps/LZMA-SDK/C/7zVersion.h +++ b/deps/LZMA-SDK/C/7zVersion.h @@ -1,7 +1,7 @@ -#define MY_VER_MAJOR 21 -#define MY_VER_MINOR 02 +#define MY_VER_MAJOR 22 +#define MY_VER_MINOR 01 #define MY_VER_BUILD 0 -#define MY_VERSION_NUMBERS "21.02 alpha" +#define MY_VERSION_NUMBERS "22.01" #define MY_VERSION MY_VERSION_NUMBERS #ifdef MY_CPU_NAME @@ -10,12 +10,12 @@ #define MY_VERSION_CPU MY_VERSION #endif -#define MY_DATE "2021-05-06" +#define MY_DATE "2022-07-15" #undef MY_COPYRIGHT #undef MY_VERSION_COPYRIGHT_DATE #define MY_AUTHOR_NAME "Igor Pavlov" #define MY_COPYRIGHT_PD "Igor Pavlov : Public domain" -#define MY_COPYRIGHT_CR "Copyright (c) 1999-2021 Igor Pavlov" +#define MY_COPYRIGHT_CR "Copyright (c) 1999-2022 Igor Pavlov" #ifdef USE_COPYRIGHT_CR #define MY_COPYRIGHT MY_COPYRIGHT_CR diff --git a/deps/LZMA-SDK/C/7zip_gcc_c.mak b/deps/LZMA-SDK/C/7zip_gcc_c.mak index 00ecfb043..d41810478 100644 --- a/deps/LZMA-SDK/C/7zip_gcc_c.mak +++ b/deps/LZMA-SDK/C/7zip_gcc_c.mak @@ -5,6 +5,7 @@ MY_ASM = jwasm MY_ASM = asmc PROGPATH = $(O)/$(PROG) +PROGPATH_STATIC = $(O)/$(PROG)s # for object file @@ -15,12 +16,32 @@ CFLAGS_BASE = $(MY_ARCH_2) -O2 $(CFLAGS_BASE_LIST) -Wall -Werror -Wextra $(CFLAG -DNDEBUG -D_REENTRANT -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE -LDFLAGS_STATIC = -DNDEBUG -# -static - ifdef SystemDrive IS_MINGW = 1 +else +ifdef SYSTEMDRIVE +# ifdef OS +IS_MINGW = 1 endif +endif + +ifdef IS_MINGW +LDFLAGS_STATIC_2 = -static +else +ifndef DEF_FILE +ifndef IS_NOT_STANDALONE +ifndef MY_DYNAMIC_LINK +ifneq ($(CC), clang) +LDFLAGS_STATIC_2 = +# -static +# -static-libstdc++ -static-libgcc +endif +endif +endif +endif +endif + +LDFLAGS_STATIC = -DNDEBUG $(LDFLAGS_STATIC_2) ifdef DEF_FILE @@ -53,7 +74,7 @@ endif PROGPATH = $(O)/$(PROG)$(SHARED_EXT) - +PROGPATH_STATIC = $(O)/$(PROG)s$(SHARED_EXT) ifndef O O=_o @@ -61,15 +82,22 @@ endif ifdef IS_MINGW +ifdef MSYSTEM +RM = rm -f +MY_MKDIR=mkdir -p +DEL_OBJ_EXE = -$(RM) $(PROGPATH) $(PROGPATH_STATIC) $(OBJS) +else RM = del MY_MKDIR=mkdir -LIB2 = -loleaut32 -luuid -ladvapi32 -lUser32 +DEL_OBJ_EXE = -$(RM) $(O)\*.o $(O)\$(PROG).exe $(O)\$(PROG).dll +endif +LIB2 = -lOle32 -loleaut32 -luuid -ladvapi32 -lUser32 + CXXFLAGS_EXTRA = -DUNICODE -D_UNICODE # -Wno-delete-non-virtual-dtor -DEL_OBJ_EXE = -$(RM) $(O)\*.o $(O)\$(PROG).exe $(O)\$(PROG).dll else @@ -82,7 +110,7 @@ MY_MKDIR=mkdir -p # LOCAL_LIBS_DLL=$(LOCAL_LIBS) -ldl LIB2 = -lpthread -ldl -DEL_OBJ_EXE = -$(RM) $(PROGPATH) $(OBJS) +DEL_OBJ_EXE = -$(RM) $(PROGPATH) $(PROGPATH_STATIC) $(OBJS) endif @@ -108,14 +136,23 @@ CXX_WARN_FLAGS = CXXFLAGS = $(LOCAL_FLAGS) $(CXXFLAGS_BASE2) $(CFLAGS_BASE) $(CXXFLAGS_EXTRA) $(CC_SHARED) -o $@ $(CXX_WARN_FLAGS) -all: $(O) $(PROGPATH) +STATIC_TARGET= +ifdef COMPL_STATIC +STATIC_TARGET=$(PROGPATH_STATIC) +endif + + +all: $(O) $(PROGPATH) $(STATIC_TARGET) $(O): $(MY_MKDIR) $(O) +LFLAGS_ALL = -s $(MY_ARCH_2) $(LDFLAGS) $(LD_arch) $(OBJS) $(MY_LIBS) $(LIB2) $(PROGPATH): $(OBJS) - $(CXX) -s -o $(PROGPATH) $(MY_ARCH_2) $(LDFLAGS) $(OBJS) $(MY_LIBS) $(LIB2) + $(CXX) -o $(PROGPATH) $(LFLAGS_ALL) +$(PROGPATH_STATIC): $(OBJS) + $(CXX) -static -o $(PROGPATH_STATIC) $(LFLAGS_ALL) ifndef NO_DEFAULT_RES @@ -174,6 +211,8 @@ $O/LzFind.o: ../../../C/LzFind.c # ifdef MT_FILES $O/LzFindMt.o: ../../../C/LzFindMt.c $(CC) $(CFLAGS) $< +$O/LzFindOpt.o: ../../../C/LzFindOpt.c + $(CC) $(CFLAGS) $< $O/Threads.o: ../../../C/Threads.c $(CC) $(CFLAGS) $< @@ -294,7 +333,10 @@ $O/7zMain.o: ../../../C/Util/7z/7zMain.c $(CC) $(CFLAGS) $< $O/LzmaUtil.o: ../../../C/Util/Lzma/LzmaUtil.c $(CC) $(CFLAGS) $< - +$O/7zipInstall.o: ../../../C/Util/7zipInstall/7zipInstall.c + $(CC) $(CFLAGS) $< +$O/7zipUninstall.o: ../../../C/Util/7zipUninstall/7zipUninstall.c + $(CC) $(CFLAGS) $< clean: diff --git a/deps/LZMA-SDK/C/Aes.c b/deps/LZMA-SDK/C/Aes.c index 0f0ddc87a..9ad66c5c8 100644 --- a/deps/LZMA-SDK/C/Aes.c +++ b/deps/LZMA-SDK/C/Aes.c @@ -1,5 +1,5 @@ /* Aes.c -- AES encryption / decryption -2021-04-01 : Igor Pavlov : Public domain */ +2021-05-13 : Igor Pavlov : Public domain */ #include "Precomp.h" @@ -365,10 +365,10 @@ void MY_FAST_CALL AesCtr_Code(UInt32 *p, Byte *data, size_t numBlocks) #ifdef MY_CPU_LE_UNALIGN *((UInt32 *)(void *)data) ^= t; #else - data[0] ^= (t & 0xFF); - data[1] ^= ((t >> 8) & 0xFF); - data[2] ^= ((t >> 16) & 0xFF); - data[3] ^= ((t >> 24)); + data[0] = (Byte)(data[0] ^ (t & 0xFF)); + data[1] = (Byte)(data[1] ^ ((t >> 8) & 0xFF)); + data[2] = (Byte)(data[2] ^ ((t >> 16) & 0xFF)); + data[3] = (Byte)(data[3] ^ ((t >> 24))); #endif } } diff --git a/deps/LZMA-SDK/C/Alloc.c b/deps/LZMA-SDK/C/Alloc.c index 064701a8c..142a1ea22 100644 --- a/deps/LZMA-SDK/C/Alloc.c +++ b/deps/LZMA-SDK/C/Alloc.c @@ -1,12 +1,12 @@ /* Alloc.c -- Memory allocation functions -2020-10-29 : Igor Pavlov : Public domain */ +2021-07-13 : Igor Pavlov : Public domain */ #include "Precomp.h" #include #ifdef _WIN32 -#include +#include #endif #include @@ -247,14 +247,14 @@ static void *SzAlloc(ISzAllocPtr p, size_t size) { UNUSED_VAR(p); return MyAlloc static void SzFree(ISzAllocPtr p, void *address) { UNUSED_VAR(p); MyFree(address); } const ISzAlloc g_Alloc = { SzAlloc, SzFree }; +#ifdef _WIN32 static void *SzMidAlloc(ISzAllocPtr p, size_t size) { UNUSED_VAR(p); return MidAlloc(size); } static void SzMidFree(ISzAllocPtr p, void *address) { UNUSED_VAR(p); MidFree(address); } -const ISzAlloc g_MidAlloc = { SzMidAlloc, SzMidFree }; - static void *SzBigAlloc(ISzAllocPtr p, size_t size) { UNUSED_VAR(p); return BigAlloc(size); } static void SzBigFree(ISzAllocPtr p, void *address) { UNUSED_VAR(p); BigFree(address); } +const ISzAlloc g_MidAlloc = { SzMidAlloc, SzMidFree }; const ISzAlloc g_BigAlloc = { SzBigAlloc, SzBigFree }; - +#endif /* uintptr_t : C99 (optional) diff --git a/deps/LZMA-SDK/C/Alloc.h b/deps/LZMA-SDK/C/Alloc.h index a1bbe942c..59de10760 100644 --- a/deps/LZMA-SDK/C/Alloc.h +++ b/deps/LZMA-SDK/C/Alloc.h @@ -1,5 +1,5 @@ /* Alloc.h -- Memory allocation functions -2021-02-08 : Igor Pavlov : Public domain */ +2021-07-13 : Igor Pavlov : Public domain */ #ifndef __COMMON_ALLOC_H #define __COMMON_ALLOC_H @@ -30,8 +30,15 @@ void BigFree(void *address); #endif extern const ISzAlloc g_Alloc; + +#ifdef _WIN32 extern const ISzAlloc g_BigAlloc; extern const ISzAlloc g_MidAlloc; +#else +#define g_BigAlloc g_AlignedAlloc +#define g_MidAlloc g_AlignedAlloc +#endif + extern const ISzAlloc g_AlignedAlloc; diff --git a/deps/LZMA-SDK/C/CpuArch.c b/deps/LZMA-SDK/C/CpuArch.c index e1443f51b..a0e93e8b0 100644 --- a/deps/LZMA-SDK/C/CpuArch.c +++ b/deps/LZMA-SDK/C/CpuArch.c @@ -1,5 +1,5 @@ /* CpuArch.c -- CPU specific code -2021-04-28 : Igor Pavlov : Public domain */ +2021-07-13 : Igor Pavlov : Public domain */ #include "Precomp.h" @@ -217,7 +217,7 @@ BoolInt CPU_Is_InOrder() } #if !defined(MY_CPU_AMD64) && defined(_WIN32) -#include +#include static BoolInt CPU_Sys_Is_SSE_Supported() { OSVERSIONINFO vi; @@ -275,9 +275,33 @@ BoolInt CPU_IsSupported_SHA() // #include #ifdef _WIN32 -#include +#include #endif +BoolInt CPU_IsSupported_AVX2() +{ + Cx86cpuid p; + CHECK_SYS_SSE_SUPPORT + + #ifdef _WIN32 + #define MY__PF_XSAVE_ENABLED 17 + if (!IsProcessorFeaturePresent(MY__PF_XSAVE_ENABLED)) + return False; + #endif + + if (!x86cpuid_CheckAndRead(&p)) + return False; + if (p.maxFunc < 7) + return False; + { + UInt32 d[4] = { 0 }; + MyCPUID(7, &d[0], &d[1], &d[2], &d[3]); + // printf("\ncpuid(7): ebx=%8x ecx=%8x\n", d[1], d[2]); + return 1 + & (d[1] >> 5); // avx2 + } +} + BoolInt CPU_IsSupported_VAES_AVX2() { Cx86cpuid p; @@ -327,12 +351,11 @@ BoolInt CPU_IsSupported_PageGB() #ifdef _WIN32 -#include +#include -BoolInt CPU_IsSupported_CRC32() - { return IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE) ? 1 : 0; } -BoolInt CPU_IsSupported_CRYPTO() - { return IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE) ? 1 : 0; } +BoolInt CPU_IsSupported_CRC32() { return IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE) ? 1 : 0; } +BoolInt CPU_IsSupported_CRYPTO() { return IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE) ? 1 : 0; } +BoolInt CPU_IsSupported_NEON() { return IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE) ? 1 : 0; } #else @@ -356,17 +379,27 @@ static void Print_sysctlbyname(const char *name) } */ -BoolInt CPU_IsSupported_CRC32(void) +static BoolInt My_sysctlbyname_Get_BoolInt(const char *name) { + UInt32 val = 0; + if (My_sysctlbyname_Get_UInt32(name, &val) == 0 && val == 1) + return 1; + return 0; +} + /* Print_sysctlbyname("hw.pagesize"); Print_sysctlbyname("machdep.cpu.brand_string"); */ - UInt32 val = 0; - if (My_sysctlbyname_Get_UInt32("hw.optional.armv8_crc32", &val) == 0 && val == 1) - return 1; - return 0; +BoolInt CPU_IsSupported_CRC32(void) +{ + return My_sysctlbyname_Get_BoolInt("hw.optional.armv8_crc32"); +} + +BoolInt CPU_IsSupported_NEON(void) +{ + return My_sysctlbyname_Get_BoolInt("hw.optional.neon"); } #ifdef MY_CPU_ARM64 @@ -390,18 +423,25 @@ BoolInt CPU_IsSupported_AES (void) { return APPLE_CRYPTO_SUPPORT_VAL; } #include + #define MY_HWCAP_CHECK_FUNC_2(name1, name2) \ + BoolInt CPU_IsSupported_ ## name1() { return (getauxval(AT_HWCAP) & (HWCAP_ ## name2)) ? 1 : 0; } + #ifdef MY_CPU_ARM64 #define MY_HWCAP_CHECK_FUNC(name) \ - BoolInt CPU_IsSupported_ ## name() { return (getauxval(AT_HWCAP) & (HWCAP_ ## name)) ? 1 : 0; } + MY_HWCAP_CHECK_FUNC_2(name, name) + MY_HWCAP_CHECK_FUNC_2(NEON, ASIMD) +// MY_HWCAP_CHECK_FUNC (ASIMD) #elif defined(MY_CPU_ARM) #define MY_HWCAP_CHECK_FUNC(name) \ BoolInt CPU_IsSupported_ ## name() { return (getauxval(AT_HWCAP2) & (HWCAP2_ ## name)) ? 1 : 0; } + MY_HWCAP_CHECK_FUNC_2(NEON, NEON) #endif #else // USE_HWCAP #define MY_HWCAP_CHECK_FUNC(name) \ BoolInt CPU_IsSupported_ ## name() { return 0; } + MY_HWCAP_CHECK_FUNC(NEON) #endif // USE_HWCAP diff --git a/deps/LZMA-SDK/C/CpuArch.h b/deps/LZMA-SDK/C/CpuArch.h index e1cde536d..ba8782714 100644 --- a/deps/LZMA-SDK/C/CpuArch.h +++ b/deps/LZMA-SDK/C/CpuArch.h @@ -1,5 +1,5 @@ /* CpuArch.h -- CPU specific code -2021-04-25 : Igor Pavlov : Public domain */ +2022-07-15 : Igor Pavlov : Public domain */ #ifndef __CPU_ARCH_H #define __CPU_ARCH_H @@ -123,12 +123,15 @@ MY_CPU_64BIT means that processor can work with 64-bit registers. #endif -#if defined(__sparc64__) - #define MY_CPU_NAME "sparc64" - #define MY_CPU_64BIT -#elif defined(__sparc__) - #define MY_CPU_NAME "sparc" - /* #define MY_CPU_32BIT */ +#if defined(__riscv) \ + || defined(__riscv__) + #if __riscv_xlen == 32 + #define MY_CPU_NAME "riscv32" + #elif __riscv_xlen == 64 + #define MY_CPU_NAME "riscv64" + #else + #define MY_CPU_NAME "riscv" + #endif #endif @@ -225,7 +228,6 @@ MY_CPU_64BIT means that processor can work with 64-bit registers. #endif #else #ifdef __xlC__ - // for XLC compiler: #define MY_CPU_pragma_pack_push_1 _Pragma("pack(1)") #define MY_CPU_pragma_pop _Pragma("pack()") #else @@ -253,8 +255,12 @@ MY_CPU_64BIT means that processor can work with 64-bit registers. #ifdef MY_CPU_LE #if defined(MY_CPU_X86_OR_AMD64) \ - || defined(MY_CPU_ARM64) \ - || defined(__ARM_FEATURE_UNALIGNED) + || defined(MY_CPU_ARM64) + #define MY_CPU_LE_UNALIGN + #define MY_CPU_LE_UNALIGN_64 + #elif defined(__ARM_FEATURE_UNALIGNED) + /* gcc9 for 32-bit arm can use LDRD instruction that requires 32-bit alignment. + So we can't use unaligned 64-bit operations. */ #define MY_CPU_LE_UNALIGN #endif #endif @@ -264,11 +270,15 @@ MY_CPU_64BIT means that processor can work with 64-bit registers. #define GetUi16(p) (*(const UInt16 *)(const void *)(p)) #define GetUi32(p) (*(const UInt32 *)(const void *)(p)) +#ifdef MY_CPU_LE_UNALIGN_64 #define GetUi64(p) (*(const UInt64 *)(const void *)(p)) +#endif #define SetUi16(p, v) { *(UInt16 *)(void *)(p) = (v); } #define SetUi32(p, v) { *(UInt32 *)(void *)(p) = (v); } +#ifdef MY_CPU_LE_UNALIGN_64 #define SetUi64(p, v) { *(UInt64 *)(void *)(p) = (v); } +#endif #else @@ -282,8 +292,6 @@ MY_CPU_64BIT means that processor can work with 64-bit registers. ((UInt32)((const Byte *)(p))[2] << 16) | \ ((UInt32)((const Byte *)(p))[3] << 24)) -#define GetUi64(p) (GetUi32(p) | ((UInt64)GetUi32(((const Byte *)(p)) + 4) << 32)) - #define SetUi16(p, v) { Byte *_ppp_ = (Byte *)(p); UInt32 _vvv_ = (v); \ _ppp_[0] = (Byte)_vvv_; \ _ppp_[1] = (Byte)(_vvv_ >> 8); } @@ -294,12 +302,22 @@ MY_CPU_64BIT means that processor can work with 64-bit registers. _ppp_[2] = (Byte)(_vvv_ >> 16); \ _ppp_[3] = (Byte)(_vvv_ >> 24); } +#endif + + +#ifndef MY_CPU_LE_UNALIGN_64 + +#define GetUi64(p) (GetUi32(p) | ((UInt64)GetUi32(((const Byte *)(p)) + 4) << 32)) + #define SetUi64(p, v) { Byte *_ppp2_ = (Byte *)(p); UInt64 _vvv2_ = (v); \ SetUi32(_ppp2_ , (UInt32)_vvv2_); \ SetUi32(_ppp2_ + 4, (UInt32)(_vvv2_ >> 32)); } #endif + + + #ifdef __has_builtin #define MY__has_builtin(x) __has_builtin(x) #else @@ -392,6 +410,7 @@ int x86cpuid_GetFirm(const Cx86cpuid *p); BoolInt CPU_Is_InOrder(void); BoolInt CPU_IsSupported_AES(void); +BoolInt CPU_IsSupported_AVX2(void); BoolInt CPU_IsSupported_VAES_AVX2(void); BoolInt CPU_IsSupported_SSSE3(void); BoolInt CPU_IsSupported_SSE41(void); @@ -401,6 +420,7 @@ BoolInt CPU_IsSupported_PageGB(void); #elif defined(MY_CPU_ARM_OR_ARM64) BoolInt CPU_IsSupported_CRC32(void); +BoolInt CPU_IsSupported_NEON(void); #if defined(_WIN32) BoolInt CPU_IsSupported_CRYPTO(void); diff --git a/deps/LZMA-SDK/C/DllSecur.c b/deps/LZMA-SDK/C/DllSecur.c index 19a22a9f0..a37c1b3e2 100644 --- a/deps/LZMA-SDK/C/DllSecur.c +++ b/deps/LZMA-SDK/C/DllSecur.c @@ -1,16 +1,20 @@ /* DllSecur.c -- DLL loading security -2018-02-21 : Igor Pavlov : Public domain */ +2022-07-15 : Igor Pavlov : Public domain */ #include "Precomp.h" #ifdef _WIN32 -#include +#include #include "DllSecur.h" #ifndef UNDER_CE +#if defined(__GNUC__) && (__GNUC__ >= 8) + #pragma GCC diagnostic ignored "-Wcast-function-type" +#endif + typedef BOOL (WINAPI *Func_SetDefaultDllDirectories)(DWORD DirectoryFlags); #define MY_LOAD_LIBRARY_SEARCH_USER_DIRS 0x400 @@ -33,17 +37,19 @@ static const char * const g_Dlls = #endif +// #define MY_CAST_FUNC (void(*)()) +#define MY_CAST_FUNC + void My_SetDefaultDllDirectories() { #ifndef UNDER_CE OSVERSIONINFO vi; vi.dwOSVersionInfoSize = sizeof(vi); - GetVersionEx(&vi); if (!GetVersionEx(&vi) || vi.dwMajorVersion != 6 || vi.dwMinorVersion != 0) { Func_SetDefaultDllDirectories setDllDirs = (Func_SetDefaultDllDirectories) - GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")), "SetDefaultDllDirectories"); + MY_CAST_FUNC GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")), "SetDefaultDllDirectories"); if (setDllDirs) if (setDllDirs(MY_LOAD_LIBRARY_SEARCH_SYSTEM32 | MY_LOAD_LIBRARY_SEARCH_USER_DIRS)) return; @@ -66,7 +72,7 @@ void LoadSecurityDlls() if (!GetVersionEx(&vi) || vi.dwMajorVersion != 6 || vi.dwMinorVersion != 0) { Func_SetDefaultDllDirectories setDllDirs = (Func_SetDefaultDllDirectories) - GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")), "SetDefaultDllDirectories"); + MY_CAST_FUNC GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")), "SetDefaultDllDirectories"); if (setDllDirs) if (setDllDirs(MY_LOAD_LIBRARY_SEARCH_SYSTEM32 | MY_LOAD_LIBRARY_SEARCH_USER_DIRS)) return; diff --git a/deps/LZMA-SDK/C/LzFind.c b/deps/LZMA-SDK/C/LzFind.c index 18ec00ef5..a17c06bc3 100644 --- a/deps/LZMA-SDK/C/LzFind.c +++ b/deps/LZMA-SDK/C/LzFind.c @@ -1,19 +1,29 @@ /* LzFind.c -- Match finder for LZ algorithms -2021-04-01 : Igor Pavlov : Public domain */ +2021-11-29 : Igor Pavlov : Public domain */ #include "Precomp.h" #include +// #include #include "CpuArch.h" #include "LzFind.h" #include "LzHash.h" +#define kBlockMoveAlign (1 << 7) // alignment for memmove() +#define kBlockSizeAlign (1 << 16) // alignment for block allocation +#define kBlockSizeReserveMin (1 << 24) // it's 1/256 from 4 GB dictinary + #define kEmptyHashValue 0 -#define kMaxValForNormalize ((UInt32)0xFFFFFFFF) -#define kNormalizeStepMin (1 << 10) /* it must be power of 2 */ -#define kNormalizeMask (~(UInt32)(kNormalizeStepMin - 1)) -#define kMaxHistorySize ((UInt32)7 << 29) + +#define kMaxValForNormalize ((UInt32)0) +// #define kMaxValForNormalize ((UInt32)(1 << 20) + 0xFFF) // for debug + +// #define kNormalizeAlign (1 << 7) // alignment for speculated accesses + +#define GET_AVAIL_BYTES(p) \ + Inline_MatchFinder_GetNumAvailableBytes(p) + // #define kFix5HashSize (kHash2Size + kHash3Size + kHash4Size) #define kFix5HashSize kFix4HashSize @@ -64,46 +74,57 @@ static void LzInWindow_Free(CMatchFinder *p, ISzAllocPtr alloc) } } -/* keepSizeBefore + keepSizeAfter + keepSizeReserv must be < 4G) */ -static int LzInWindow_Create(CMatchFinder *p, UInt32 keepSizeReserv, ISzAllocPtr alloc) +static int LzInWindow_Create2(CMatchFinder *p, UInt32 blockSize, ISzAllocPtr alloc) { - UInt32 blockSize = p->keepSizeBefore + p->keepSizeAfter + keepSizeReserv; - if (p->directInput) - { - p->blockSize = blockSize; - return 1; - } + if (blockSize == 0) + return 0; if (!p->bufferBase || p->blockSize != blockSize) { + // size_t blockSizeT; LzInWindow_Free(p, alloc); p->blockSize = blockSize; - p->bufferBase = (Byte *)ISzAlloc_Alloc(alloc, (size_t)blockSize); + // blockSizeT = blockSize; + + // printf("\nblockSize = 0x%x\n", blockSize); + /* + #if defined _WIN64 + // we can allocate 4GiB, but still use UInt32 for (p->blockSize) + // we use UInt32 type for (p->blockSize), because + // we don't want to wrap over 4 GiB, + // when we use (p->streamPos - p->pos) that is UInt32. + if (blockSize >= (UInt32)0 - (UInt32)kBlockSizeAlign) + { + blockSizeT = ((size_t)1 << 32); + printf("\nchanged to blockSizeT = 4GiB\n"); + } + #endif + */ + + p->bufferBase = (Byte *)ISzAlloc_Alloc(alloc, blockSize); + // printf("\nbufferBase = %p\n", p->bufferBase); + // return 0; // for debug } return (p->bufferBase != NULL); } -static Byte *MatchFinder_GetPointerToCurrentPos(CMatchFinder *p) { return p->buffer; } +static const Byte *MatchFinder_GetPointerToCurrentPos(CMatchFinder *p) { return p->buffer; } -static UInt32 MatchFinder_GetNumAvailableBytes(CMatchFinder *p) { return p->streamPos - p->pos; } +static UInt32 MatchFinder_GetNumAvailableBytes(CMatchFinder *p) { return GET_AVAIL_BYTES(p); } -void MatchFinder_ReduceOffsets(CMatchFinder *p, UInt32 subValue) -{ - p->posLimit -= subValue; - p->pos -= subValue; - p->streamPos -= subValue; -} +MY_NO_INLINE static void MatchFinder_ReadBlock(CMatchFinder *p) { if (p->streamEndWasReached || p->result != SZ_OK) return; - /* We use (p->streamPos - p->pos) value. (p->streamPos < p->pos) is allowed. */ + /* We use (p->streamPos - p->pos) value. + (p->streamPos < p->pos) is allowed. */ if (p->directInput) { - UInt32 curSize = 0xFFFFFFFF - (p->streamPos - p->pos); + UInt32 curSize = 0xFFFFFFFF - GET_AVAIL_BYTES(p); if (curSize > p->directInputRem) curSize = (UInt32)p->directInputRem; p->directInputRem -= curSize; @@ -115,10 +136,22 @@ static void MatchFinder_ReadBlock(CMatchFinder *p) for (;;) { - Byte *dest = p->buffer + (p->streamPos - p->pos); + Byte *dest = p->buffer + GET_AVAIL_BYTES(p); size_t size = (size_t)(p->bufferBase + p->blockSize - dest); if (size == 0) + { + /* we call ReadBlock() after NeedMove() and MoveBlock(). + NeedMove() and MoveBlock() povide more than (keepSizeAfter) + to the end of (blockSize). + So we don't execute this branch in normal code flow. + We can go here, if we will call ReadBlock() before NeedMove(), MoveBlock(). + */ + // p->result = SZ_ERROR_FAIL; // we can show error here return; + } + + // #define kRead 3 + // if (size > kRead) size = kRead; // for debug p->result = ISeqInStream_Read(p->stream, dest, &size); if (p->result != SZ_OK) @@ -129,41 +162,52 @@ static void MatchFinder_ReadBlock(CMatchFinder *p) return; } p->streamPos += (UInt32)size; - if (p->streamPos - p->pos > p->keepSizeAfter) + if (GET_AVAIL_BYTES(p) > p->keepSizeAfter) return; + /* here and in another (p->keepSizeAfter) checks we keep on 1 byte more than was requested by Create() function + (GET_AVAIL_BYTES(p) >= p->keepSizeAfter) - minimal required size */ } + + // on exit: (p->result != SZ_OK || p->streamEndWasReached || GET_AVAIL_BYTES(p) > p->keepSizeAfter) } + + +MY_NO_INLINE void MatchFinder_MoveBlock(CMatchFinder *p) { + const size_t offset = (size_t)(p->buffer - p->bufferBase) - p->keepSizeBefore; + const size_t keepBefore = (offset & (kBlockMoveAlign - 1)) + p->keepSizeBefore; + p->buffer = p->bufferBase + keepBefore; memmove(p->bufferBase, - p->buffer - p->keepSizeBefore, - (size_t)(p->streamPos - p->pos) + p->keepSizeBefore); - p->buffer = p->bufferBase + p->keepSizeBefore; + p->bufferBase + (offset & ~((size_t)kBlockMoveAlign - 1)), + keepBefore + (size_t)GET_AVAIL_BYTES(p)); } +/* We call MoveBlock() before ReadBlock(). + So MoveBlock() can be wasteful operation, if the whole input data + can fit in current block even without calling MoveBlock(). + in important case where (dataSize <= historySize) + condition (p->blockSize > dataSize + p->keepSizeAfter) is met + So there is no MoveBlock() in that case case. +*/ + int MatchFinder_NeedMove(CMatchFinder *p) { if (p->directInput) return 0; - /* if (p->streamEndWasReached) return 0; */ + if (p->streamEndWasReached || p->result != SZ_OK) + return 0; return ((size_t)(p->bufferBase + p->blockSize - p->buffer) <= p->keepSizeAfter); } void MatchFinder_ReadIfRequired(CMatchFinder *p) { - if (p->streamEndWasReached) - return; - if (p->keepSizeAfter >= p->streamPos - p->pos) + if (p->keepSizeAfter >= GET_AVAIL_BYTES(p)) MatchFinder_ReadBlock(p); } -static void MatchFinder_CheckAndMoveAndRead(CMatchFinder *p) -{ - if (MatchFinder_NeedMove(p)) - MatchFinder_MoveBlock(p); - MatchFinder_ReadBlock(p); -} + static void MatchFinder_SetDefaultSettings(CMatchFinder *p) { @@ -214,32 +258,67 @@ static CLzRef* AllocRefs(size_t num, ISzAllocPtr alloc) return (CLzRef *)ISzAlloc_Alloc(alloc, sizeInBytes); } -int MatchFinder_Create(CMatchFinder *p, UInt32 historySize, - UInt32 keepAddBufferBefore, UInt32 matchMaxLen, UInt32 keepAddBufferAfter, - ISzAllocPtr alloc) +#if (kBlockSizeReserveMin < kBlockSizeAlign * 2) + #error Stop_Compiling_Bad_Reserve +#endif + + + +static UInt32 GetBlockSize(CMatchFinder *p, UInt32 historySize) { - UInt32 sizeReserv; - + UInt32 blockSize = (p->keepSizeBefore + p->keepSizeAfter); + /* if (historySize > kMaxHistorySize) - { - MatchFinder_Free(p, alloc); return 0; - } + */ + // printf("\nhistorySize == 0x%x\n", historySize); - sizeReserv = historySize >> 1; - if (historySize >= ((UInt32)3 << 30)) sizeReserv = historySize >> 3; - else if (historySize >= ((UInt32)2 << 30)) sizeReserv = historySize >> 2; + if (p->keepSizeBefore < historySize || blockSize < p->keepSizeBefore) // if 32-bit overflow + return 0; - sizeReserv += (keepAddBufferBefore + matchMaxLen + keepAddBufferAfter) / 2 + (1 << 19); + { + const UInt32 kBlockSizeMax = (UInt32)0 - (UInt32)kBlockSizeAlign; + const UInt32 rem = kBlockSizeMax - blockSize; + const UInt32 reserve = (blockSize >> (blockSize < ((UInt32)1 << 30) ? 1 : 2)) + + (1 << 12) + kBlockMoveAlign + kBlockSizeAlign; // do not overflow 32-bit here + if (blockSize >= kBlockSizeMax + || rem < kBlockSizeReserveMin) // we reject settings that will be slow + return 0; + if (reserve >= rem) + blockSize = kBlockSizeMax; + else + { + blockSize += reserve; + blockSize &= ~(UInt32)(kBlockSizeAlign - 1); + } + } + // printf("\n LzFind_blockSize = %x\n", blockSize); + // printf("\n LzFind_blockSize = %d\n", blockSize >> 20); + return blockSize; +} + +int MatchFinder_Create(CMatchFinder *p, UInt32 historySize, + UInt32 keepAddBufferBefore, UInt32 matchMaxLen, UInt32 keepAddBufferAfter, + ISzAllocPtr alloc) +{ + /* we need one additional byte in (p->keepSizeBefore), + since we use MoveBlock() after (p->pos++) and before dictionary using */ + // keepAddBufferBefore = (UInt32)0xFFFFFFFF - (1 << 22); // for debug p->keepSizeBefore = historySize + keepAddBufferBefore + 1; - p->keepSizeAfter = matchMaxLen + keepAddBufferAfter; - - /* we need one additional byte, since we use MoveBlock after pos++ and before dictionary using */ - - if (LzInWindow_Create(p, sizeReserv, alloc)) + + keepAddBufferAfter += matchMaxLen; + /* we need (p->keepSizeAfter >= p->numHashBytes) */ + if (keepAddBufferAfter < p->numHashBytes) + keepAddBufferAfter = p->numHashBytes; + // keepAddBufferAfter -= 2; // for debug + p->keepSizeAfter = keepAddBufferAfter; + + if (p->directInput) + p->blockSize = 0; + if (p->directInput || LzInWindow_Create2(p, GetBlockSize(p, historySize), alloc)) { - UInt32 newCyclicBufferSize = historySize + 1; + const UInt32 newCyclicBufferSize = historySize + 1; // do not change it UInt32 hs; p->matchMaxLen = matchMaxLen; { @@ -299,7 +378,7 @@ int MatchFinder_Create(CMatchFinder *p, UInt32 historySize, size_t numSons; p->historySize = historySize; p->hashSizeSum = hs; - p->cyclicBufferSize = newCyclicBufferSize; + p->cyclicBufferSize = newCyclicBufferSize; // it must be = (historySize + 1) numSons = newCyclicBufferSize; if (p->btMode) @@ -329,33 +408,43 @@ int MatchFinder_Create(CMatchFinder *p, UInt32 historySize, return 0; } + static void MatchFinder_SetLimits(CMatchFinder *p) { - UInt32 limit = kMaxValForNormalize - p->pos; - UInt32 limit2 = p->cyclicBufferSize - p->cyclicBufferPos; - - if (limit2 < limit) - limit = limit2; - limit2 = p->streamPos - p->pos; + UInt32 k; + UInt32 n = kMaxValForNormalize - p->pos; + if (n == 0) + n = (UInt32)(Int32)-1; // we allow (pos == 0) at start even with (kMaxValForNormalize == 0) - if (limit2 <= p->keepSizeAfter) + k = p->cyclicBufferSize - p->cyclicBufferPos; + if (k < n) + n = k; + + k = GET_AVAIL_BYTES(p); { - if (limit2 > 0) - limit2 = 1; + const UInt32 ksa = p->keepSizeAfter; + UInt32 mm = p->matchMaxLen; + if (k > ksa) + k -= ksa; // we must limit exactly to keepSizeAfter for ReadBlock + else if (k >= mm) + { + // the limitation for (p->lenLimit) update + k -= mm; // optimization : to reduce the number of checks + k++; + // k = 1; // non-optimized version : for debug + } + else + { + mm = k; + if (k != 0) + k = 1; + } + p->lenLimit = mm; } - else - limit2 -= p->keepSizeAfter; - - if (limit2 < limit) - limit = limit2; + if (k < n) + n = k; - { - UInt32 lenLimit = p->streamPos - p->pos; - if (lenLimit > p->matchMaxLen) - lenLimit = p->matchMaxLen; - p->lenLimit = lenLimit; - } - p->posLimit = p->pos + limit; + p->posLimit = p->pos + n; } @@ -363,7 +452,7 @@ void MatchFinder_Init_LowHash(CMatchFinder *p) { size_t i; CLzRef *items = p->hash; - size_t numItems = p->fixedHashSize; + const size_t numItems = p->fixedHashSize; for (i = 0; i < numItems; i++) items[i] = kEmptyHashValue; } @@ -373,80 +462,322 @@ void MatchFinder_Init_HighHash(CMatchFinder *p) { size_t i; CLzRef *items = p->hash + p->fixedHashSize; - size_t numItems = (size_t)p->hashMask + 1; + const size_t numItems = (size_t)p->hashMask + 1; for (i = 0; i < numItems; i++) items[i] = kEmptyHashValue; } -void MatchFinder_Init_3(CMatchFinder *p, int readData) +void MatchFinder_Init_4(CMatchFinder *p) { - p->cyclicBufferPos = 0; p->buffer = p->bufferBase; - p->pos = - p->streamPos = p->cyclicBufferSize; + { + /* kEmptyHashValue = 0 (Zero) is used in hash tables as NO-VALUE marker. + the code in CMatchFinderMt expects (pos = 1) */ + p->pos = + p->streamPos = + 1; // it's smallest optimal value. do not change it + // 0; // for debug + } p->result = SZ_OK; p->streamEndWasReached = 0; - - if (readData) - MatchFinder_ReadBlock(p); - - MatchFinder_SetLimits(p); } +// (CYC_TO_POS_OFFSET == 0) is expected by some optimized code +#define CYC_TO_POS_OFFSET 0 +// #define CYC_TO_POS_OFFSET 1 // for debug + void MatchFinder_Init(CMatchFinder *p) { MatchFinder_Init_HighHash(p); MatchFinder_Init_LowHash(p); - MatchFinder_Init_3(p, True); + MatchFinder_Init_4(p); + // if (readData) + MatchFinder_ReadBlock(p); + + /* if we init (cyclicBufferPos = pos), then we can use one variable + instead of both (cyclicBufferPos) and (pos) : only before (cyclicBufferPos) wrapping */ + p->cyclicBufferPos = (p->pos - CYC_TO_POS_OFFSET); // init with relation to (pos) + // p->cyclicBufferPos = 0; // smallest value + // p->son[0] = p->son[1] = 0; // unused: we can init skipped record for speculated accesses. + MatchFinder_SetLimits(p); } - -static UInt32 MatchFinder_GetSubValue(CMatchFinder *p) + + +#ifdef MY_CPU_X86_OR_AMD64 + #if defined(__clang__) && (__clang_major__ >= 8) \ + || defined(__GNUC__) && (__GNUC__ >= 8) \ + || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1900) + #define USE_SATUR_SUB_128 + #define USE_AVX2 + #define ATTRIB_SSE41 __attribute__((__target__("sse4.1"))) + #define ATTRIB_AVX2 __attribute__((__target__("avx2"))) + #elif defined(_MSC_VER) + #if (_MSC_VER >= 1600) + #define USE_SATUR_SUB_128 + #if (_MSC_VER >= 1900) + #define USE_AVX2 + #include // avx + #endif + #endif + #endif + +// #elif defined(MY_CPU_ARM_OR_ARM64) +#elif defined(MY_CPU_ARM64) + + #if defined(__clang__) && (__clang_major__ >= 8) \ + || defined(__GNUC__) && (__GNUC__ >= 8) + #define USE_SATUR_SUB_128 + #ifdef MY_CPU_ARM64 + // #define ATTRIB_SSE41 __attribute__((__target__(""))) + #else + // #define ATTRIB_SSE41 __attribute__((__target__("fpu=crypto-neon-fp-armv8"))) + #endif + + #elif defined(_MSC_VER) + #if (_MSC_VER >= 1910) + #define USE_SATUR_SUB_128 + #endif + #endif + + #if defined(_MSC_VER) && defined(MY_CPU_ARM64) + #include + #else + #include + #endif + +#endif + +/* +#ifndef ATTRIB_SSE41 + #define ATTRIB_SSE41 +#endif +#ifndef ATTRIB_AVX2 + #define ATTRIB_AVX2 +#endif +*/ + +#ifdef USE_SATUR_SUB_128 + +// #define _SHOW_HW_STATUS + +#ifdef _SHOW_HW_STATUS +#include +#define _PRF(x) x +_PRF(;) +#else +#define _PRF(x) +#endif + +#ifdef MY_CPU_ARM_OR_ARM64 + +#ifdef MY_CPU_ARM64 +// #define FORCE_SATUR_SUB_128 +#endif + +typedef uint32x4_t v128; +#define SASUB_128(i) \ + *(v128 *)(void *)(items + (i) * 4) = \ + vsubq_u32(vmaxq_u32(*(const v128 *)(const void *)(items + (i) * 4), sub2), sub2); + +#else + +#include // sse4.1 + +typedef __m128i v128; +#define SASUB_128(i) \ + *(v128 *)(void *)(items + (i) * 4) = \ + _mm_sub_epi32(_mm_max_epu32(*(const v128 *)(const void *)(items + (i) * 4), sub2), sub2); // SSE 4.1 + +#endif + + + +MY_NO_INLINE +static +#ifdef ATTRIB_SSE41 +ATTRIB_SSE41 +#endif +void +MY_FAST_CALL +LzFind_SaturSub_128(UInt32 subValue, CLzRef *items, const CLzRef *lim) { - return (p->pos - p->historySize - 1) & kNormalizeMask; + v128 sub2 = + #ifdef MY_CPU_ARM_OR_ARM64 + vdupq_n_u32(subValue); + #else + _mm_set_epi32((Int32)subValue, (Int32)subValue, (Int32)subValue, (Int32)subValue); + #endif + do + { + SASUB_128(0) + SASUB_128(1) + SASUB_128(2) + SASUB_128(3) + items += 4 * 4; + } + while (items != lim); } -void MatchFinder_Normalize3(UInt32 subValue, CLzRef *items, size_t numItems) + + +#ifdef USE_AVX2 + +#include // avx + +#define SASUB_256(i) *(__m256i *)(void *)(items + (i) * 8) = _mm256_sub_epi32(_mm256_max_epu32(*(const __m256i *)(const void *)(items + (i) * 8), sub2), sub2); // AVX2 + +MY_NO_INLINE +static +#ifdef ATTRIB_AVX2 +ATTRIB_AVX2 +#endif +void +MY_FAST_CALL +LzFind_SaturSub_256(UInt32 subValue, CLzRef *items, const CLzRef *lim) { - if (numItems == 0) - return; - { - const CLzRef *lim = items + numItems - 1; - for (; items < lim; items += 2) + __m256i sub2 = _mm256_set_epi32( + (Int32)subValue, (Int32)subValue, (Int32)subValue, (Int32)subValue, + (Int32)subValue, (Int32)subValue, (Int32)subValue, (Int32)subValue); + do { - UInt32 v, m; - v = items[0]; m = v - subValue; if (v < subValue) m = kEmptyHashValue; - v = items[1]; items[0] = m; m = v - subValue; if (v < subValue) m = kEmptyHashValue; - items[1] = m; + SASUB_256(0) + SASUB_256(1) + items += 2 * 8; } - if (items == lim) + while (items != lim); +} +#endif // USE_AVX2 + +#ifndef FORCE_SATUR_SUB_128 +typedef void (MY_FAST_CALL *LZFIND_SATUR_SUB_CODE_FUNC)( + UInt32 subValue, CLzRef *items, const CLzRef *lim); +static LZFIND_SATUR_SUB_CODE_FUNC g_LzFind_SaturSub; +#endif // FORCE_SATUR_SUB_128 + +#endif // USE_SATUR_SUB_128 + + +// kEmptyHashValue must be zero +// #define SASUB_32(i) v = items[i]; m = v - subValue; if (v < subValue) m = kEmptyHashValue; items[i] = m; +#define SASUB_32(i) v = items[i]; if (v < subValue) v = subValue; items[i] = v - subValue; + +#ifdef FORCE_SATUR_SUB_128 + +#define DEFAULT_SaturSub LzFind_SaturSub_128 + +#else + +#define DEFAULT_SaturSub LzFind_SaturSub_32 + +MY_NO_INLINE +static +void +MY_FAST_CALL +LzFind_SaturSub_32(UInt32 subValue, CLzRef *items, const CLzRef *lim) +{ + do { - UInt32 v, m; - v = items[0]; m = v - subValue; if (v < subValue) m = kEmptyHashValue; - items[0] = m; - } + UInt32 v; + SASUB_32(0) + SASUB_32(1) + SASUB_32(2) + SASUB_32(3) + SASUB_32(4) + SASUB_32(5) + SASUB_32(6) + SASUB_32(7) + items += 8; } + while (items != lim); } -static void MatchFinder_Normalize(CMatchFinder *p) +#endif + + +MY_NO_INLINE +void MatchFinder_Normalize3(UInt32 subValue, CLzRef *items, size_t numItems) { - UInt32 subValue = MatchFinder_GetSubValue(p); - MatchFinder_Normalize3(subValue, p->hash, p->numRefs); - MatchFinder_ReduceOffsets(p, subValue); + #define K_NORM_ALIGN_BLOCK_SIZE (1 << 6) + + CLzRef *lim; + + for (; numItems != 0 && ((unsigned)(ptrdiff_t)items & (K_NORM_ALIGN_BLOCK_SIZE - 1)) != 0; numItems--) + { + UInt32 v; + SASUB_32(0); + items++; + } + + { + #define K_NORM_ALIGN_MASK (K_NORM_ALIGN_BLOCK_SIZE / 4 - 1) + lim = items + (numItems & ~(size_t)K_NORM_ALIGN_MASK); + numItems &= K_NORM_ALIGN_MASK; + if (items != lim) + { + #if defined(USE_SATUR_SUB_128) && !defined(FORCE_SATUR_SUB_128) + if (g_LzFind_SaturSub) + g_LzFind_SaturSub(subValue, items, lim); + else + #endif + DEFAULT_SaturSub(subValue, items, lim); + } + items = lim; + } + + + for (; numItems != 0; numItems--) + { + UInt32 v; + SASUB_32(0); + items++; + } } + +// call MatchFinder_CheckLimits() only after (p->pos++) update + MY_NO_INLINE static void MatchFinder_CheckLimits(CMatchFinder *p) { + if (// !p->streamEndWasReached && p->result == SZ_OK && + p->keepSizeAfter == GET_AVAIL_BYTES(p)) + { + // we try to read only in exact state (p->keepSizeAfter == GET_AVAIL_BYTES(p)) + if (MatchFinder_NeedMove(p)) + MatchFinder_MoveBlock(p); + MatchFinder_ReadBlock(p); + } + if (p->pos == kMaxValForNormalize) - MatchFinder_Normalize(p); - if (!p->streamEndWasReached && p->keepSizeAfter == p->streamPos - p->pos) - MatchFinder_CheckAndMoveAndRead(p); + if (GET_AVAIL_BYTES(p) >= p->numHashBytes) // optional optimization for last bytes of data. + /* + if we disable normalization for last bytes of data, and + if (data_size == 4 GiB), we don't call wastfull normalization, + but (pos) will be wrapped over Zero (0) in that case. + And we cannot resume later to normal operation + */ + { + // MatchFinder_Normalize(p); + /* after normalization we need (p->pos >= p->historySize + 1); */ + /* we can reduce subValue to aligned value, if want to keep alignment + of (p->pos) and (p->buffer) for speculated accesses. */ + const UInt32 subValue = (p->pos - p->historySize - 1) /* & ~(UInt32)(kNormalizeAlign - 1) */; + // const UInt32 subValue = (1 << 15); // for debug + // printf("\nMatchFinder_Normalize() subValue == 0x%x\n", subValue); + size_t numSonRefs = p->cyclicBufferSize; + if (p->btMode) + numSonRefs <<= 1; + Inline_MatchFinder_ReduceOffsets(p, subValue); + MatchFinder_Normalize3(subValue, p->hash, (size_t)p->hashSizeSum + numSonRefs); + } + if (p->cyclicBufferPos == p->cyclicBufferSize) p->cyclicBufferPos = 0; + MatchFinder_SetLimits(p); } @@ -455,9 +786,9 @@ static void MatchFinder_CheckLimits(CMatchFinder *p) (lenLimit > maxLen) */ MY_FORCE_INLINE -static UInt32 * Hc_GetMatchesSpec(unsigned lenLimit, UInt32 curMatch, UInt32 pos, const Byte *cur, CLzRef *son, - UInt32 _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 cutValue, - UInt32 *distances, unsigned maxLen) +static UInt32 * Hc_GetMatchesSpec(size_t lenLimit, UInt32 curMatch, UInt32 pos, const Byte *cur, CLzRef *son, + size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 cutValue, + UInt32 *d, unsigned maxLen) { /* son[_cyclicBufferPos] = curMatch; @@ -465,7 +796,7 @@ static UInt32 * Hc_GetMatchesSpec(unsigned lenLimit, UInt32 curMatch, UInt32 pos { UInt32 delta = pos - curMatch; if (cutValue-- == 0 || delta >= _cyclicBufferSize) - return distances; + return d; { const Byte *pb = cur - delta; curMatch = son[_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)]; @@ -478,10 +809,10 @@ static UInt32 * Hc_GetMatchesSpec(unsigned lenLimit, UInt32 curMatch, UInt32 pos if (maxLen < len) { maxLen = len; - *distances++ = len; - *distances++ = delta - 1; + *d++ = len; + *d++ = delta - 1; if (len == lenLimit) - return distances; + return d; } } } @@ -490,9 +821,15 @@ static UInt32 * Hc_GetMatchesSpec(unsigned lenLimit, UInt32 curMatch, UInt32 pos const Byte *lim = cur + lenLimit; son[_cyclicBufferPos] = curMatch; + do { - UInt32 delta = pos - curMatch; + UInt32 delta; + + if (curMatch == 0) + break; + // if (curMatch2 >= curMatch) return NULL; + delta = pos - curMatch; if (delta >= _cyclicBufferSize) break; { @@ -506,19 +843,19 @@ static UInt32 * Hc_GetMatchesSpec(unsigned lenLimit, UInt32 curMatch, UInt32 pos { if (++c == lim) { - distances[0] = (UInt32)(lim - cur); - distances[1] = delta - 1; - return distances + 2; + d[0] = (UInt32)(lim - cur); + d[1] = delta - 1; + return d + 2; } } { - unsigned len = (unsigned)(c - cur); + const unsigned len = (unsigned)(c - cur); if (maxLen < len) { maxLen = len; - distances[0] = (UInt32)len; - distances[1] = delta - 1; - distances += 2; + d[0] = (UInt32)len; + d[1] = delta - 1; + d += 2; } } } @@ -526,31 +863,36 @@ static UInt32 * Hc_GetMatchesSpec(unsigned lenLimit, UInt32 curMatch, UInt32 pos } while (--cutValue); - return distances; + return d; } MY_FORCE_INLINE UInt32 * GetMatchesSpec1(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const Byte *cur, CLzRef *son, - UInt32 _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 cutValue, - UInt32 *distances, UInt32 maxLen) + size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 cutValue, + UInt32 *d, UInt32 maxLen) { CLzRef *ptr0 = son + ((size_t)_cyclicBufferPos << 1) + 1; CLzRef *ptr1 = son + ((size_t)_cyclicBufferPos << 1); unsigned len0 = 0, len1 = 0; - for (;;) + + UInt32 cmCheck; + + // if (curMatch >= pos) { *ptr0 = *ptr1 = kEmptyHashValue; return NULL; } + + cmCheck = (UInt32)(pos - _cyclicBufferSize); + if ((UInt32)pos <= _cyclicBufferSize) + cmCheck = 0; + + if (cmCheck < curMatch) + do { - UInt32 delta = pos - curMatch; - if (cutValue-- == 0 || delta >= _cyclicBufferSize) - { - *ptr0 = *ptr1 = kEmptyHashValue; - return distances; - } + const UInt32 delta = pos - curMatch; { CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)) << 1); const Byte *pb = cur - delta; unsigned len = (len0 < len1 ? len0 : len1); - UInt32 pair0 = pair[0]; + const UInt32 pair0 = pair[0]; if (pb[len] == cur[len]) { if (++len != lenLimit && pb[len] == cur[len]) @@ -560,48 +902,60 @@ UInt32 * GetMatchesSpec1(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const Byt if (maxLen < len) { maxLen = (UInt32)len; - *distances++ = (UInt32)len; - *distances++ = delta - 1; + *d++ = (UInt32)len; + *d++ = delta - 1; if (len == lenLimit) { *ptr1 = pair0; *ptr0 = pair[1]; - return distances; + return d; } } } if (pb[len] < cur[len]) { *ptr1 = curMatch; + // const UInt32 curMatch2 = pair[1]; + // if (curMatch2 >= curMatch) { *ptr0 = *ptr1 = kEmptyHashValue; return NULL; } + // curMatch = curMatch2; + curMatch = pair[1]; ptr1 = pair + 1; - curMatch = *ptr1; len1 = len; } else { *ptr0 = curMatch; + curMatch = pair[0]; ptr0 = pair; - curMatch = *ptr0; len0 = len; } } } + while(--cutValue && cmCheck < curMatch); + + *ptr0 = *ptr1 = kEmptyHashValue; + return d; } + static void SkipMatchesSpec(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const Byte *cur, CLzRef *son, - UInt32 _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 cutValue) + size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 cutValue) { CLzRef *ptr0 = son + ((size_t)_cyclicBufferPos << 1) + 1; CLzRef *ptr1 = son + ((size_t)_cyclicBufferPos << 1); unsigned len0 = 0, len1 = 0; - for (;;) + + UInt32 cmCheck; + + cmCheck = (UInt32)(pos - _cyclicBufferSize); + if ((UInt32)pos <= _cyclicBufferSize) + cmCheck = 0; + + if (// curMatch >= pos || // failure + cmCheck < curMatch) + do { - UInt32 delta = pos - curMatch; - if (cutValue-- == 0 || delta >= _cyclicBufferSize) - { - *ptr0 = *ptr1 = kEmptyHashValue; - return; - } + const UInt32 delta = pos - curMatch; { CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)) << 1); const Byte *pb = cur - delta; @@ -623,43 +977,62 @@ static void SkipMatchesSpec(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const if (pb[len] < cur[len]) { *ptr1 = curMatch; + curMatch = pair[1]; ptr1 = pair + 1; - curMatch = *ptr1; len1 = len; } else { *ptr0 = curMatch; + curMatch = pair[0]; ptr0 = pair; - curMatch = *ptr0; len0 = len; } } } + while(--cutValue && cmCheck < curMatch); + + *ptr0 = *ptr1 = kEmptyHashValue; + return; } + #define MOVE_POS \ ++p->cyclicBufferPos; \ p->buffer++; \ - if (++p->pos == p->posLimit) MatchFinder_CheckLimits(p); + { const UInt32 pos1 = p->pos + 1; p->pos = pos1; if (pos1 == p->posLimit) MatchFinder_CheckLimits(p); } -#define MOVE_POS_RET MOVE_POS return (UInt32)offset; +#define MOVE_POS_RET MOVE_POS return distances; -static void MatchFinder_MovePos(CMatchFinder *p) { MOVE_POS; } +MY_NO_INLINE +static void MatchFinder_MovePos(CMatchFinder *p) +{ + /* we go here at the end of stream data, when (avail < num_hash_bytes) + We don't update sons[cyclicBufferPos << btMode]. + So (sons) record will contain junk. And we cannot resume match searching + to normal operation, even if we will provide more input data in buffer. + p->sons[p->cyclicBufferPos << p->btMode] = 0; // kEmptyHashValue + if (p->btMode) + p->sons[(p->cyclicBufferPos << p->btMode) + 1] = 0; // kEmptyHashValue + */ + MOVE_POS; +} #define GET_MATCHES_HEADER2(minLen, ret_op) \ - unsigned lenLimit; UInt32 hv; const Byte *cur; UInt32 curMatch; \ + unsigned lenLimit; UInt32 hv; Byte *cur; UInt32 curMatch; \ lenLimit = (unsigned)p->lenLimit; { if (lenLimit < minLen) { MatchFinder_MovePos(p); ret_op; }} \ cur = p->buffer; -#define GET_MATCHES_HEADER(minLen) GET_MATCHES_HEADER2(minLen, return 0) -#define SKIP_HEADER(minLen) GET_MATCHES_HEADER2(minLen, continue) +#define GET_MATCHES_HEADER(minLen) GET_MATCHES_HEADER2(minLen, return distances) +#define SKIP_HEADER(minLen) do { GET_MATCHES_HEADER2(minLen, continue) + +#define MF_PARAMS(p) lenLimit, curMatch, p->pos, p->buffer, p->son, p->cyclicBufferPos, p->cyclicBufferSize, p->cutValue -#define MF_PARAMS(p) p->pos, p->buffer, p->son, p->cyclicBufferPos, p->cyclicBufferSize, p->cutValue +#define SKIP_FOOTER SkipMatchesSpec(MF_PARAMS(p)); MOVE_POS; } while (--num); #define GET_MATCHES_FOOTER_BASE(_maxLen_, func) \ - offset = (unsigned)(func((UInt32)lenLimit, curMatch, MF_PARAMS(p), \ - distances + offset, (UInt32)(_maxLen_)) - distances); MOVE_POS_RET; + distances = func(MF_PARAMS(p), \ + distances, (UInt32)_maxLen_); MOVE_POS_RET; #define GET_MATCHES_FOOTER_BT(_maxLen_) \ GET_MATCHES_FOOTER_BASE(_maxLen_, GetMatchesSpec1) @@ -667,42 +1040,45 @@ static void MatchFinder_MovePos(CMatchFinder *p) { MOVE_POS; } #define GET_MATCHES_FOOTER_HC(_maxLen_) \ GET_MATCHES_FOOTER_BASE(_maxLen_, Hc_GetMatchesSpec) -#define SKIP_FOOTER \ - SkipMatchesSpec((UInt32)lenLimit, curMatch, MF_PARAMS(p)); MOVE_POS; + #define UPDATE_maxLen { \ - ptrdiff_t diff = (ptrdiff_t)0 - (ptrdiff_t)d2; \ + const ptrdiff_t diff = (ptrdiff_t)0 - (ptrdiff_t)d2; \ const Byte *c = cur + maxLen; \ const Byte *lim = cur + lenLimit; \ for (; c != lim; c++) if (*(c + diff) != *c) break; \ maxLen = (unsigned)(c - cur); } -static UInt32 Bt2_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) +static UInt32* Bt2_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) { - unsigned offset; GET_MATCHES_HEADER(2) HASH2_CALC; curMatch = p->hash[hv]; p->hash[hv] = p->pos; - offset = 0; GET_MATCHES_FOOTER_BT(1) } -UInt32 Bt3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) +UInt32* Bt3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) { - unsigned offset; GET_MATCHES_HEADER(3) HASH_ZIP_CALC; curMatch = p->hash[hv]; p->hash[hv] = p->pos; - offset = 0; GET_MATCHES_FOOTER_BT(2) } -static UInt32 Bt3_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) + +#define SET_mmm \ + mmm = p->cyclicBufferSize; \ + if (pos < mmm) \ + mmm = pos; + + +static UInt32* Bt3_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) { + UInt32 mmm; UInt32 h2, d2, pos; - unsigned maxLen, offset; + unsigned maxLen; UInt32 *hash; GET_MATCHES_HEADER(3) @@ -718,18 +1094,19 @@ static UInt32 Bt3_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) hash[h2] = pos; (hash + kFix3HashSize)[hv] = pos; + SET_mmm + maxLen = 2; - offset = 0; - if (d2 < p->cyclicBufferSize && *(cur - d2) == *cur) + if (d2 < mmm && *(cur - d2) == *cur) { UPDATE_maxLen distances[0] = (UInt32)maxLen; distances[1] = d2 - 1; - offset = 2; + distances += 2; if (maxLen == lenLimit) { - SkipMatchesSpec((UInt32)lenLimit, curMatch, MF_PARAMS(p)); + SkipMatchesSpec(MF_PARAMS(p)); MOVE_POS_RET; } } @@ -738,10 +1115,11 @@ static UInt32 Bt3_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) } -static UInt32 Bt4_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) +static UInt32* Bt4_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) { + UInt32 mmm; UInt32 h2, h3, d2, d3, pos; - unsigned maxLen, offset; + unsigned maxLen; UInt32 *hash; GET_MATCHES_HEADER(4) @@ -758,43 +1136,44 @@ static UInt32 Bt4_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) (hash + kFix3HashSize)[h3] = pos; (hash + kFix4HashSize)[hv] = pos; + SET_mmm + maxLen = 3; - offset = 0; for (;;) { - if (d2 < p->cyclicBufferSize && *(cur - d2) == *cur) + if (d2 < mmm && *(cur - d2) == *cur) { distances[0] = 2; distances[1] = d2 - 1; - offset = 2; + distances += 2; if (*(cur - d2 + 2) == cur[2]) { - // distances[0] = 3; + // distances[-2] = 3; } - else if (d3 < p->cyclicBufferSize && *(cur - d3) == *cur) + else if (d3 < mmm && *(cur - d3) == *cur) { d2 = d3; - distances[2 + 1] = d3 - 1; - offset = 4; + distances[1] = d3 - 1; + distances += 2; } else break; } - else if (d3 < p->cyclicBufferSize && *(cur - d3) == *cur) + else if (d3 < mmm && *(cur - d3) == *cur) { d2 = d3; distances[1] = d3 - 1; - offset = 2; + distances += 2; } else break; UPDATE_maxLen - distances[(size_t)offset - 2] = (UInt32)maxLen; + distances[-2] = (UInt32)maxLen; if (maxLen == lenLimit) { - SkipMatchesSpec((UInt32)lenLimit, curMatch, MF_PARAMS(p)); + SkipMatchesSpec(MF_PARAMS(p)); MOVE_POS_RET } break; @@ -804,9 +1183,10 @@ static UInt32 Bt4_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) } -static UInt32 Bt5_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) +static UInt32* Bt5_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) { - UInt32 h2, h3, d2, d3, maxLen, offset, pos; + UInt32 mmm; + UInt32 h2, h3, d2, d3, maxLen, pos; UInt32 *hash; GET_MATCHES_HEADER(5) @@ -826,45 +1206,46 @@ static UInt32 Bt5_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) // (hash + kFix4HashSize)[h4] = pos; (hash + kFix5HashSize)[hv] = pos; + SET_mmm + maxLen = 4; - offset = 0; for (;;) { - if (d2 < p->cyclicBufferSize && *(cur - d2) == *cur) + if (d2 < mmm && *(cur - d2) == *cur) { distances[0] = 2; distances[1] = d2 - 1; - offset = 2; + distances += 2; if (*(cur - d2 + 2) == cur[2]) { } - else if (d3 < p->cyclicBufferSize && *(cur - d3) == *cur) + else if (d3 < mmm && *(cur - d3) == *cur) { - distances[3] = d3 - 1; - offset = 4; + distances[1] = d3 - 1; + distances += 2; d2 = d3; } else break; } - else if (d3 < p->cyclicBufferSize && *(cur - d3) == *cur) + else if (d3 < mmm && *(cur - d3) == *cur) { distances[1] = d3 - 1; - offset = 2; + distances += 2; d2 = d3; } else break; - distances[(size_t)offset - 2] = 3; + distances[-2] = 3; if (*(cur - d2 + 3) != cur[3]) break; UPDATE_maxLen - distances[(size_t)offset - 2] = maxLen; + distances[-2] = (UInt32)maxLen; if (maxLen == lenLimit) { - SkipMatchesSpec(lenLimit, curMatch, MF_PARAMS(p)); + SkipMatchesSpec(MF_PARAMS(p)); MOVE_POS_RET; } break; @@ -874,10 +1255,11 @@ static UInt32 Bt5_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) } -static UInt32 Hc4_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) +static UInt32* Hc4_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) { + UInt32 mmm; UInt32 h2, h3, d2, d3, pos; - unsigned maxLen, offset; + unsigned maxLen; UInt32 *hash; GET_MATCHES_HEADER(4) @@ -894,40 +1276,41 @@ static UInt32 Hc4_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) (hash + kFix3HashSize)[h3] = pos; (hash + kFix4HashSize)[hv] = pos; + SET_mmm + maxLen = 3; - offset = 0; for (;;) { - if (d2 < p->cyclicBufferSize && *(cur - d2) == *cur) + if (d2 < mmm && *(cur - d2) == *cur) { distances[0] = 2; distances[1] = d2 - 1; - offset = 2; + distances += 2; if (*(cur - d2 + 2) == cur[2]) { - // distances[0] = 3; + // distances[-2] = 3; } - else if (d3 < p->cyclicBufferSize && *(cur - d3) == *cur) + else if (d3 < mmm && *(cur - d3) == *cur) { d2 = d3; - distances[2 + 1] = d3 - 1; - offset = 4; + distances[1] = d3 - 1; + distances += 2; } else break; } - else if (d3 < p->cyclicBufferSize && *(cur - d3) == *cur) + else if (d3 < mmm && *(cur - d3) == *cur) { d2 = d3; distances[1] = d3 - 1; - offset = 2; + distances += 2; } else break; UPDATE_maxLen - distances[(size_t)offset - 2] = (UInt32)maxLen; + distances[-2] = (UInt32)maxLen; if (maxLen == lenLimit) { p->son[p->cyclicBufferPos] = curMatch; @@ -940,9 +1323,10 @@ static UInt32 Hc4_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) } -static UInt32 Hc5_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) +static UInt32 * Hc5_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) { - UInt32 h2, h3, d2, d3, maxLen, offset, pos; + UInt32 mmm; + UInt32 h2, h3, d2, d3, maxLen, pos; UInt32 *hash; GET_MATCHES_HEADER(5) @@ -962,42 +1346,43 @@ static UInt32 Hc5_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) // (hash + kFix4HashSize)[h4] = pos; (hash + kFix5HashSize)[hv] = pos; + SET_mmm + maxLen = 4; - offset = 0; for (;;) { - if (d2 < p->cyclicBufferSize && *(cur - d2) == *cur) + if (d2 < mmm && *(cur - d2) == *cur) { distances[0] = 2; distances[1] = d2 - 1; - offset = 2; + distances += 2; if (*(cur - d2 + 2) == cur[2]) { } - else if (d3 < p->cyclicBufferSize && *(cur - d3) == *cur) + else if (d3 < mmm && *(cur - d3) == *cur) { - distances[3] = d3 - 1; - offset = 4; + distances[1] = d3 - 1; + distances += 2; d2 = d3; } else break; } - else if (d3 < p->cyclicBufferSize && *(cur - d3) == *cur) + else if (d3 < mmm && *(cur - d3) == *cur) { distances[1] = d3 - 1; - offset = 2; + distances += 2; d2 = d3; } else break; - distances[(size_t)offset - 2] = 3; + distances[-2] = 3; if (*(cur - d2 + 3) != cur[3]) break; UPDATE_maxLen - distances[(size_t)offset - 2] = maxLen; + distances[-2] = maxLen; if (maxLen == lenLimit) { p->son[p->cyclicBufferPos] = curMatch; @@ -1010,86 +1395,75 @@ static UInt32 Hc5_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) } -UInt32 Hc3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) +UInt32* Hc3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) { - unsigned offset; GET_MATCHES_HEADER(3) HASH_ZIP_CALC; curMatch = p->hash[hv]; p->hash[hv] = p->pos; - offset = 0; GET_MATCHES_FOOTER_HC(2) } static void Bt2_MatchFinder_Skip(CMatchFinder *p, UInt32 num) { - do + SKIP_HEADER(2) { - SKIP_HEADER(2) HASH2_CALC; curMatch = p->hash[hv]; p->hash[hv] = p->pos; - SKIP_FOOTER } - while (--num != 0); + SKIP_FOOTER } void Bt3Zip_MatchFinder_Skip(CMatchFinder *p, UInt32 num) { - do + SKIP_HEADER(3) { - SKIP_HEADER(3) HASH_ZIP_CALC; curMatch = p->hash[hv]; p->hash[hv] = p->pos; - SKIP_FOOTER } - while (--num != 0); + SKIP_FOOTER } static void Bt3_MatchFinder_Skip(CMatchFinder *p, UInt32 num) { - do + SKIP_HEADER(3) { UInt32 h2; UInt32 *hash; - SKIP_HEADER(3) HASH3_CALC; hash = p->hash; curMatch = (hash + kFix3HashSize)[hv]; hash[h2] = (hash + kFix3HashSize)[hv] = p->pos; - SKIP_FOOTER } - while (--num != 0); + SKIP_FOOTER } static void Bt4_MatchFinder_Skip(CMatchFinder *p, UInt32 num) { - do + SKIP_HEADER(4) { UInt32 h2, h3; UInt32 *hash; - SKIP_HEADER(4) HASH4_CALC; hash = p->hash; curMatch = (hash + kFix4HashSize)[hv]; hash [h2] = (hash + kFix3HashSize)[h3] = (hash + kFix4HashSize)[hv] = p->pos; - SKIP_FOOTER } - while (--num != 0); + SKIP_FOOTER } static void Bt5_MatchFinder_Skip(CMatchFinder *p, UInt32 num) { - do + SKIP_HEADER(5) { UInt32 h2, h3; UInt32 *hash; - SKIP_HEADER(5) HASH5_CALC; hash = p->hash; curMatch = (hash + kFix5HashSize)[hv]; @@ -1097,66 +1471,84 @@ static void Bt5_MatchFinder_Skip(CMatchFinder *p, UInt32 num) (hash + kFix3HashSize)[h3] = // (hash + kFix4HashSize)[h4] = (hash + kFix5HashSize)[hv] = p->pos; - SKIP_FOOTER } - while (--num != 0); + SKIP_FOOTER } + +#define HC_SKIP_HEADER(minLen) \ + do { if (p->lenLimit < minLen) { MatchFinder_MovePos(p); num--; continue; } { \ + Byte *cur; \ + UInt32 *hash; \ + UInt32 *son; \ + UInt32 pos = p->pos; \ + UInt32 num2 = num; \ + /* (p->pos == p->posLimit) is not allowed here !!! */ \ + { const UInt32 rem = p->posLimit - pos; if (num2 > rem) num2 = rem; } \ + num -= num2; \ + { const UInt32 cycPos = p->cyclicBufferPos; \ + son = p->son + cycPos; \ + p->cyclicBufferPos = cycPos + num2; } \ + cur = p->buffer; \ + hash = p->hash; \ + do { \ + UInt32 curMatch; \ + UInt32 hv; + + +#define HC_SKIP_FOOTER \ + cur++; pos++; *son++ = curMatch; \ + } while (--num2); \ + p->buffer = cur; \ + p->pos = pos; \ + if (pos == p->posLimit) MatchFinder_CheckLimits(p); \ + }} while(num); \ + + static void Hc4_MatchFinder_Skip(CMatchFinder *p, UInt32 num) { - do - { + HC_SKIP_HEADER(4) + UInt32 h2, h3; - UInt32 *hash; - SKIP_HEADER(4) HASH4_CALC; - hash = p->hash; curMatch = (hash + kFix4HashSize)[hv]; hash [h2] = (hash + kFix3HashSize)[h3] = - (hash + kFix4HashSize)[hv] = p->pos; - p->son[p->cyclicBufferPos] = curMatch; - MOVE_POS - } - while (--num != 0); + (hash + kFix4HashSize)[hv] = pos; + + HC_SKIP_FOOTER } + static void Hc5_MatchFinder_Skip(CMatchFinder *p, UInt32 num) { - do - { + HC_SKIP_HEADER(5) + UInt32 h2, h3; - UInt32 *hash; - SKIP_HEADER(5) - HASH5_CALC; - hash = p->hash; + HASH5_CALC curMatch = (hash + kFix5HashSize)[hv]; hash [h2] = (hash + kFix3HashSize)[h3] = // (hash + kFix4HashSize)[h4] = - (hash + kFix5HashSize)[hv] = p->pos; - p->son[p->cyclicBufferPos] = curMatch; - MOVE_POS - } - while (--num != 0); + (hash + kFix5HashSize)[hv] = pos; + + HC_SKIP_FOOTER } void Hc3Zip_MatchFinder_Skip(CMatchFinder *p, UInt32 num) { - do - { - SKIP_HEADER(3) + HC_SKIP_HEADER(3) + HASH_ZIP_CALC; - curMatch = p->hash[hv]; - p->hash[hv] = p->pos; - p->son[p->cyclicBufferPos] = curMatch; - MOVE_POS - } - while (--num != 0); + curMatch = hash[hv]; + hash[hv] = pos; + + HC_SKIP_FOOTER } -void MatchFinder_CreateVTable(CMatchFinder *p, IMatchFinder *vTable) + +void MatchFinder_CreateVTable(CMatchFinder *p, IMatchFinder2 *vTable) { vTable->Init = (Mf_Init_Func)MatchFinder_Init; vTable->GetNumAvailableBytes = (Mf_GetNumAvailableBytes_Func)MatchFinder_GetNumAvailableBytes; @@ -1195,3 +1587,42 @@ void MatchFinder_CreateVTable(CMatchFinder *p, IMatchFinder *vTable) vTable->Skip = (Mf_Skip_Func)Bt5_MatchFinder_Skip; } } + + + +void LzFindPrepare() +{ + #ifndef FORCE_SATUR_SUB_128 + #ifdef USE_SATUR_SUB_128 + LZFIND_SATUR_SUB_CODE_FUNC f = NULL; + #ifdef MY_CPU_ARM_OR_ARM64 + { + if (CPU_IsSupported_NEON()) + { + // #pragma message ("=== LzFind NEON") + _PRF(printf("\n=== LzFind NEON\n")); + f = LzFind_SaturSub_128; + } + // f = 0; // for debug + } + #else // MY_CPU_ARM_OR_ARM64 + if (CPU_IsSupported_SSE41()) + { + // #pragma message ("=== LzFind SSE41") + _PRF(printf("\n=== LzFind SSE41\n")); + f = LzFind_SaturSub_128; + + #ifdef USE_AVX2 + if (CPU_IsSupported_AVX2()) + { + // #pragma message ("=== LzFind AVX2") + _PRF(printf("\n=== LzFind AVX2\n")); + f = LzFind_SaturSub_256; + } + #endif + } + #endif // MY_CPU_ARM_OR_ARM64 + g_LzFind_SaturSub = f; + #endif // USE_SATUR_SUB_128 + #endif // FORCE_SATUR_SUB_128 +} diff --git a/deps/LZMA-SDK/C/LzFind.h b/deps/LZMA-SDK/C/LzFind.h index 3e2248e7d..8f9fade23 100644 --- a/deps/LZMA-SDK/C/LzFind.h +++ b/deps/LZMA-SDK/C/LzFind.h @@ -1,5 +1,5 @@ /* LzFind.h -- Match finder for LZ algorithms -2021-02-09 : Igor Pavlov : Public domain */ +2021-07-13 : Igor Pavlov : Public domain */ #ifndef __LZ_FIND_H #define __LZ_FIND_H @@ -15,7 +15,7 @@ typedef struct _CMatchFinder Byte *buffer; UInt32 pos; UInt32 posLimit; - UInt32 streamPos; + UInt32 streamPos; /* wrap over Zero is allowed (streamPos < pos). Use (UInt32)(streamPos - pos) */ UInt32 lenLimit; UInt32 cyclicBufferPos; @@ -51,17 +51,19 @@ typedef struct _CMatchFinder UInt64 expectedDataSize; } CMatchFinder; -#define Inline_MatchFinder_GetPointerToCurrentPos(p) ((p)->buffer) +#define Inline_MatchFinder_GetPointerToCurrentPos(p) ((const Byte *)(p)->buffer) -#define Inline_MatchFinder_GetNumAvailableBytes(p) ((p)->streamPos - (p)->pos) +#define Inline_MatchFinder_GetNumAvailableBytes(p) ((UInt32)((p)->streamPos - (p)->pos)) +/* #define Inline_MatchFinder_IsFinishedOK(p) \ ((p)->streamEndWasReached \ && (p)->streamPos == (p)->pos \ && (!(p)->directInput || (p)->directInputRem == 0)) +*/ int MatchFinder_NeedMove(CMatchFinder *p); -// Byte *MatchFinder_GetPointerToCurrentPos(CMatchFinder *p); +/* Byte *MatchFinder_GetPointerToCurrentPos(CMatchFinder *p); */ void MatchFinder_MoveBlock(CMatchFinder *p); void MatchFinder_ReadIfRequired(CMatchFinder *p); @@ -76,10 +78,21 @@ int MatchFinder_Create(CMatchFinder *p, UInt32 historySize, ISzAllocPtr alloc); void MatchFinder_Free(CMatchFinder *p, ISzAllocPtr alloc); void MatchFinder_Normalize3(UInt32 subValue, CLzRef *items, size_t numItems); -void MatchFinder_ReduceOffsets(CMatchFinder *p, UInt32 subValue); +// void MatchFinder_ReduceOffsets(CMatchFinder *p, UInt32 subValue); + +/* +#define Inline_MatchFinder_InitPos(p, val) \ + (p)->pos = (val); \ + (p)->streamPos = (val); +*/ + +#define Inline_MatchFinder_ReduceOffsets(p, subValue) \ + (p)->pos -= (subValue); \ + (p)->streamPos -= (subValue); + UInt32 * GetMatchesSpec1(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const Byte *buffer, CLzRef *son, - UInt32 _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 _cutValue, + size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 _cutValue, UInt32 *distances, UInt32 maxLen); /* @@ -91,7 +104,7 @@ Conditions: typedef void (*Mf_Init_Func)(void *object); typedef UInt32 (*Mf_GetNumAvailableBytes_Func)(void *object); typedef const Byte * (*Mf_GetPointerToCurrentPos_Func)(void *object); -typedef UInt32 (*Mf_GetMatches_Func)(void *object, UInt32 *distances); +typedef UInt32 * (*Mf_GetMatches_Func)(void *object, UInt32 *distances); typedef void (*Mf_Skip_Func)(void *object, UInt32); typedef struct _IMatchFinder @@ -101,21 +114,23 @@ typedef struct _IMatchFinder Mf_GetPointerToCurrentPos_Func GetPointerToCurrentPos; Mf_GetMatches_Func GetMatches; Mf_Skip_Func Skip; -} IMatchFinder; +} IMatchFinder2; -void MatchFinder_CreateVTable(CMatchFinder *p, IMatchFinder *vTable); +void MatchFinder_CreateVTable(CMatchFinder *p, IMatchFinder2 *vTable); void MatchFinder_Init_LowHash(CMatchFinder *p); void MatchFinder_Init_HighHash(CMatchFinder *p); -void MatchFinder_Init_3(CMatchFinder *p, int readData); +void MatchFinder_Init_4(CMatchFinder *p); void MatchFinder_Init(CMatchFinder *p); -UInt32 Bt3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances); -UInt32 Hc3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances); +UInt32* Bt3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances); +UInt32* Hc3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances); void Bt3Zip_MatchFinder_Skip(CMatchFinder *p, UInt32 num); void Hc3Zip_MatchFinder_Skip(CMatchFinder *p, UInt32 num); +void LzFindPrepare(void); + EXTERN_C_END #endif diff --git a/deps/LZMA-SDK/C/LzFindMt.c b/deps/LZMA-SDK/C/LzFindMt.c index cb29a1eac..3865263b7 100644 --- a/deps/LZMA-SDK/C/LzFindMt.c +++ b/deps/LZMA-SDK/C/LzFindMt.c @@ -1,8 +1,10 @@ /* LzFindMt.c -- multithreaded Match finder for LZ algorithms -2021-04-01 : Igor Pavlov : Public domain */ +2021-12-21 : Igor Pavlov : Public domain */ #include "Precomp.h" +// #include + #include "CpuArch.h" #include "LzHash.h" @@ -10,22 +12,34 @@ // #define LOG_ITERS +// #define LOG_THREAD + +#ifdef LOG_THREAD +#include +#define PRF(x) x +#else +#define PRF(x) +#endif + #ifdef LOG_ITERS #include -static UInt64 g_NumIters_Tree; -static UInt64 g_NumIters_Loop; +extern UInt64 g_NumIters_Tree; +extern UInt64 g_NumIters_Loop; +extern UInt64 g_NumIters_Bytes; #define LOG_ITER(x) x #else #define LOG_ITER(x) #endif -#define kMtHashBlockSize (1 << 17) +#define kMtHashBlockSize ((UInt32)1 << 17) #define kMtHashNumBlocks (1 << 1) -#define kMtHashNumBlocksMask (kMtHashNumBlocks - 1) -#define kMtBtBlockSize (1 << 16) +#define GET_HASH_BLOCK_OFFSET(i) (((i) & (kMtHashNumBlocks - 1)) * kMtHashBlockSize) + +#define kMtBtBlockSize ((UInt32)1 << 16) #define kMtBtNumBlocks (1 << 4) -#define kMtBtNumBlocksMask (kMtBtNumBlocks - 1) + +#define GET_BT_BLOCK_OFFSET(i) (((i) & (kMtBtNumBlocks - 1)) * (size_t)kMtBtBlockSize) /* HASH functions: @@ -36,11 +50,17 @@ static UInt64 g_NumIters_Loop; (crc[0...0xFF] & 0xFF) <-> [0...0xFF] */ +#define MF(mt) ((mt)->MatchFinder) +#define MF_CRC (p->crc) + +// #define MF(mt) (&(mt)->MatchFinder) +// #define MF_CRC (p->MatchFinder.crc) + #define MT_HASH2_CALC \ - h2 = (p->crc[cur[0]] ^ cur[1]) & (kHash2Size - 1); + h2 = (MF_CRC[cur[0]] ^ cur[1]) & (kHash2Size - 1); #define MT_HASH3_CALC { \ - UInt32 temp = p->crc[cur[0]] ^ cur[1]; \ + UInt32 temp = MF_CRC[cur[0]] ^ cur[1]; \ h2 = temp & (kHash2Size - 1); \ h3 = (temp ^ ((UInt32)cur[2] << 8)) & (kHash3Size - 1); } @@ -59,93 +79,137 @@ static UInt64 g_NumIters_Loop; */ +MY_NO_INLINE static void MtSync_Construct(CMtSync *p) { + p->affinity = 0; p->wasCreated = False; p->csWasInitialized = False; p->csWasEntered = False; Thread_Construct(&p->thread); Event_Construct(&p->canStart); - Event_Construct(&p->wasStarted); Event_Construct(&p->wasStopped); Semaphore_Construct(&p->freeSemaphore); Semaphore_Construct(&p->filledSemaphore); - p->affinity = 0; } +#define DEBUG_BUFFER_LOCK // define it to debug lock state + +#ifdef DEBUG_BUFFER_LOCK +#include +#define BUFFER_MUST_BE_LOCKED(p) if (!(p)->csWasEntered) exit(1); +#define BUFFER_MUST_BE_UNLOCKED(p) if ( (p)->csWasEntered) exit(1); +#else +#define BUFFER_MUST_BE_LOCKED(p) +#define BUFFER_MUST_BE_UNLOCKED(p) +#endif + +#define LOCK_BUFFER(p) { \ + BUFFER_MUST_BE_UNLOCKED(p); \ + CriticalSection_Enter(&(p)->cs); \ + (p)->csWasEntered = True; } + +#define UNLOCK_BUFFER(p) { \ + BUFFER_MUST_BE_LOCKED(p); \ + CriticalSection_Leave(&(p)->cs); \ + (p)->csWasEntered = False; } + + MY_NO_INLINE -static void MtSync_GetNextBlock(CMtSync *p) +static UInt32 MtSync_GetNextBlock(CMtSync *p) { + UInt32 numBlocks = 0; if (p->needStart) { + BUFFER_MUST_BE_UNLOCKED(p) p->numProcessedBlocks = 1; p->needStart = False; p->stopWriting = False; p->exit = False; - Event_Reset(&p->wasStarted); Event_Reset(&p->wasStopped); - Event_Set(&p->canStart); - Event_Wait(&p->wasStarted); - - // if (mt) MatchFinder_Init_LowHash(mt->MatchFinder); } else { - CriticalSection_Leave(&p->cs); - p->csWasEntered = False; - p->numProcessedBlocks++; + UNLOCK_BUFFER(p) + // we free current block + numBlocks = p->numProcessedBlocks++; Semaphore_Release1(&p->freeSemaphore); } + + // buffer is UNLOCKED here Semaphore_Wait(&p->filledSemaphore); - CriticalSection_Enter(&p->cs); - p->csWasEntered = True; + LOCK_BUFFER(p); + return numBlocks; } -/* MtSync_StopWriting must be called if Writing was started */ +/* if Writing (Processing) thread was started, we must call MtSync_StopWriting() */ + +MY_NO_INLINE static void MtSync_StopWriting(CMtSync *p) { - UInt32 myNumBlocks = p->numProcessedBlocks; if (!Thread_WasCreated(&p->thread) || p->needStart) return; - p->stopWriting = True; + + PRF(printf("\nMtSync_StopWriting %p\n", p)); + if (p->csWasEntered) { - CriticalSection_Leave(&p->cs); - p->csWasEntered = False; + /* we don't use buffer in this thread after StopWriting(). + So we UNLOCK buffer. + And we restore default UNLOCKED state for stopped thread */ + UNLOCK_BUFFER(p) } - Semaphore_Release1(&p->freeSemaphore); - + + /* We send (p->stopWriting) message and release freeSemaphore + to free current block. + So the thread will see (p->stopWriting) at some + iteration after Wait(freeSemaphore). + The thread doesn't need to fill all avail free blocks, + so we can get fast thread stop. + */ + + p->stopWriting = True; + Semaphore_Release1(&p->freeSemaphore); // check semaphore count !!! + + PRF(printf("\nMtSync_StopWriting %p : Event_Wait(&p->wasStopped)\n", p)); Event_Wait(&p->wasStopped); + PRF(printf("\nMtSync_StopWriting %p : Event_Wait() finsihed\n", p)); + + /* 21.03 : we don't restore samaphore counters here. + We will recreate and reinit samaphores in next start */ - while (myNumBlocks++ != p->numProcessedBlocks) - { - Semaphore_Wait(&p->filledSemaphore); - Semaphore_Release1(&p->freeSemaphore); - } p->needStart = True; } + +MY_NO_INLINE static void MtSync_Destruct(CMtSync *p) { + PRF(printf("\nMtSync_Destruct %p\n", p)); + if (Thread_WasCreated(&p->thread)) { + /* we want thread to be in Stopped state before sending EXIT command. + note: stop(btSync) will stop (htSync) also */ MtSync_StopWriting(p); + /* thread in Stopped state here : (p->needStart == true) */ p->exit = True; - if (p->needStart) - Event_Set(&p->canStart); - Thread_Wait_Close(&p->thread); + // if (p->needStart) // it's (true) + Event_Set(&p->canStart); // we send EXIT command to thread + Thread_Wait_Close(&p->thread); // we wait thread finishing } + if (p->csWasInitialized) { CriticalSection_Delete(&p->cs); p->csWasInitialized = False; } + p->csWasEntered = False; Event_Close(&p->canStart); - Event_Close(&p->wasStarted); Event_Close(&p->wasStopped); Semaphore_Close(&p->freeSemaphore); Semaphore_Close(&p->filledSemaphore); @@ -153,48 +217,75 @@ static void MtSync_Destruct(CMtSync *p) p->wasCreated = False; } -#define RINOK_THREAD(x) { if ((x) != 0) return SZ_ERROR_THREAD; } -static SRes MtSync_Create2(CMtSync *p, THREAD_FUNC_TYPE startAddress, void *obj, UInt32 numBlocks) +// #define RINOK_THREAD(x) { if ((x) != 0) return SZ_ERROR_THREAD; } +// we want to get real system error codes here instead of SZ_ERROR_THREAD +#define RINOK_THREAD(x) RINOK(x) + + +// call it before each new file (when new starting is required): +MY_NO_INLINE +static SRes MtSync_Init(CMtSync *p, UInt32 numBlocks) +{ + WRes wres; + // BUFFER_MUST_BE_UNLOCKED(p) + if (!p->needStart || p->csWasEntered) + return SZ_ERROR_FAIL; + wres = Semaphore_OptCreateInit(&p->freeSemaphore, numBlocks, numBlocks); + if (wres == 0) + wres = Semaphore_OptCreateInit(&p->filledSemaphore, 0, numBlocks); + return MY_SRes_HRESULT_FROM_WRes(wres); +} + + +static WRes MtSync_Create_WRes(CMtSync *p, THREAD_FUNC_TYPE startAddress, void *obj) { WRes wres; + if (p->wasCreated) return SZ_OK; RINOK_THREAD(CriticalSection_Init(&p->cs)); p->csWasInitialized = True; + p->csWasEntered = False; RINOK_THREAD(AutoResetEvent_CreateNotSignaled(&p->canStart)); - RINOK_THREAD(AutoResetEvent_CreateNotSignaled(&p->wasStarted)); RINOK_THREAD(AutoResetEvent_CreateNotSignaled(&p->wasStopped)); - - RINOK_THREAD(Semaphore_Create(&p->freeSemaphore, numBlocks, numBlocks)); - RINOK_THREAD(Semaphore_Create(&p->filledSemaphore, 0, numBlocks)); p->needStart = True; + p->exit = True; /* p->exit is unused before (canStart) Event. + But in case of some unexpected code failure we will get fast exit from thread */ + + // return ERROR_TOO_MANY_POSTS; // for debug + // return EINVAL; // for debug if (p->affinity != 0) wres = Thread_Create_With_Affinity(&p->thread, startAddress, obj, (CAffinityMask)p->affinity); else wres = Thread_Create(&p->thread, startAddress, obj); + RINOK_THREAD(wres); p->wasCreated = True; return SZ_OK; } -static SRes MtSync_Create(CMtSync *p, THREAD_FUNC_TYPE startAddress, void *obj, UInt32 numBlocks) + +MY_NO_INLINE +static SRes MtSync_Create(CMtSync *p, THREAD_FUNC_TYPE startAddress, void *obj) { - SRes res = MtSync_Create2(p, startAddress, obj, numBlocks); - if (res != SZ_OK) - MtSync_Destruct(p); - return res; + const WRes wres = MtSync_Create_WRes(p, startAddress, obj); + if (wres == 0) + return 0; + MtSync_Destruct(p); + return MY_SRes_HRESULT_FROM_WRes(wres); } -// static void MtSync_Init(CMtSync *p) { p->needStart = True; } -#define kMtMaxValForNormalize 0xFFFFFFFF -// #define kMtMaxValForNormalize ((1 << 25) + (1 << 20)) +// ---------- HASH THREAD ---------- +#define kMtMaxValForNormalize 0xFFFFFFFF +// #define kMtMaxValForNormalize ((1 << 21)) // for debug +// #define kNormalizeAlign (1 << 7) // alignment for speculated accesses #ifdef MY_CPU_LE_UNALIGN #define GetUi24hi_from32(p) ((UInt32)GetUi32(p) >> 8) @@ -349,27 +440,28 @@ DEF_GetHeads(5b, (crc[p[0]] ^ (crc[p[4]] << kLzHash_CrcShift_1) ^ GetUi24hi_from static void HashThreadFunc(CMatchFinderMt *mt) { CMtSync *p = &mt->hashSync; + PRF(printf("\nHashThreadFunc\n")); + for (;;) { - UInt32 numProcessedBlocks = 0; + UInt32 blockIndex = 0; + PRF(printf("\nHashThreadFunc : Event_Wait(&p->canStart)\n")); Event_Wait(&p->canStart); - Event_Set(&p->wasStarted); + PRF(printf("\nHashThreadFunc : Event_Wait(&p->canStart) : after \n")); + if (p->exit) + { + PRF(printf("\nHashThreadFunc : exit \n")); + return; + } - MatchFinder_Init_HighHash(mt->MatchFinder); + MatchFinder_Init_HighHash(MF(mt)); for (;;) { - if (p->exit) - return; - if (p->stopWriting) - { - p->numProcessedBlocks = numProcessedBlocks; - Event_Set(&p->wasStopped); - break; - } + PRF(printf("Hash thread block = %d pos = %d\n", (unsigned)blockIndex, mt->MatchFinder->pos)); { - CMatchFinder *mf = mt->MatchFinder; + CMatchFinder *mf = MF(mt); if (MatchFinder_NeedMove(mf)) { CriticalSection_Enter(&mt->btSync.cs); @@ -382,196 +474,178 @@ static void HashThreadFunc(CMatchFinderMt *mt) mt->pointerToCurPos -= offset; mt->buffer -= offset; } - CriticalSection_Leave(&mt->btSync.cs); CriticalSection_Leave(&mt->hashSync.cs); + CriticalSection_Leave(&mt->btSync.cs); continue; } Semaphore_Wait(&p->freeSemaphore); + if (p->exit) // exit is unexpected here. But we check it here for some failure case + return; + + // for faster stop : we check (p->stopWriting) after Wait(freeSemaphore) + if (p->stopWriting) + break; + MatchFinder_ReadIfRequired(mf); - if (mf->pos > (kMtMaxValForNormalize - kMtHashBlockSize)) { - UInt32 subValue = (mf->pos - mf->historySize - 1); - MatchFinder_ReduceOffsets(mf, subValue); - MatchFinder_Normalize3(subValue, mf->hash + mf->fixedHashSize, (size_t)mf->hashMask + 1); - } - { - UInt32 *heads = mt->hashBuf + ((numProcessedBlocks++) & kMtHashNumBlocksMask) * kMtHashBlockSize; - UInt32 num = mf->streamPos - mf->pos; + UInt32 *heads = mt->hashBuf + GET_HASH_BLOCK_OFFSET(blockIndex++); + UInt32 num = Inline_MatchFinder_GetNumAvailableBytes(mf); heads[0] = 2; heads[1] = num; + + /* heads[1] contains the number of avail bytes: + if (avail < mf->numHashBytes) : + { + it means that stream was finished + HASH_THREAD and BT_TREAD must move position for heads[1] (avail) bytes. + HASH_THREAD doesn't stop, + HASH_THREAD fills only the header (2 numbers) for all next blocks: + {2, NumHashBytes - 1}, {2,0}, {2,0}, ... , {2,0} + } + else + { + HASH_THREAD and BT_TREAD must move position for (heads[0] - 2) bytes; + } + */ + if (num >= mf->numHashBytes) { num = num - mf->numHashBytes + 1; if (num > kMtHashBlockSize - 2) num = kMtHashBlockSize - 2; - mt->GetHeadsFunc(mf->buffer, mf->pos, mf->hash + mf->fixedHashSize, mf->hashMask, heads + 2, num, mf->crc); + + if (mf->pos > (UInt32)kMtMaxValForNormalize - num) + { + const UInt32 subValue = (mf->pos - mf->historySize - 1); // & ~(UInt32)(kNormalizeAlign - 1); + Inline_MatchFinder_ReduceOffsets(mf, subValue); + MatchFinder_Normalize3(subValue, mf->hash + mf->fixedHashSize, (size_t)mf->hashMask + 1); + } + heads[0] = 2 + num; + mt->GetHeadsFunc(mf->buffer, mf->pos, mf->hash + mf->fixedHashSize, mf->hashMask, heads + 2, num, mf->crc); } - mf->pos += num; + + mf->pos += num; // wrap over zero is allowed at the end of stream mf->buffer += num; } } Semaphore_Release1(&p->filledSemaphore); - } - } -} + } // for() processing end -static void MatchFinderMt_GetNextBlock_Hash(CMatchFinderMt *p) -{ - MtSync_GetNextBlock(&p->hashSync); - p->hashBufPosLimit = p->hashBufPos = ((p->hashSync.numProcessedBlocks - 1) & kMtHashNumBlocksMask) * kMtHashBlockSize; - p->hashBufPosLimit += p->hashBuf[p->hashBufPos++]; - p->hashNumAvail = p->hashBuf[p->hashBufPos++]; + // p->numBlocks_Sent = blockIndex; + Event_Set(&p->wasStopped); + } // for() thread end } -#define kEmptyHashValue 0 + + + +// ---------- BT THREAD ---------- + +/* we use one variable instead of two (cyclicBufferPos == pos) before CyclicBuf wrap. + here we define fixed offset of (p->pos) from (p->cyclicBufferPos) */ +#define CYC_TO_POS_OFFSET 0 +// #define CYC_TO_POS_OFFSET 1 // for debug #define MFMT_GM_INLINE #ifdef MFMT_GM_INLINE /* - we use size_t for _cyclicBufferPos instead of UInt32 + we use size_t for (pos) instead of UInt32 to eliminate "movsx" BUG in old MSVC x64 compiler. */ -MY_NO_INLINE -static UInt32 *GetMatchesSpecN(UInt32 lenLimit, UInt32 pos, const Byte *cur, CLzRef *son, - size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 _cutValue, - UInt32 *d, UInt32 _maxLen, const UInt32 *hash, const UInt32 *limit, UInt32 size, UInt32 *posRes) -{ - do - { - UInt32 *_distances = ++d; - UInt32 delta = *hash++; - CLzRef *ptr0 = son + ((size_t)_cyclicBufferPos << 1) + 1; - CLzRef *ptr1 = son + ((size_t)_cyclicBufferPos << 1); - unsigned len0 = 0, len1 = 0; - UInt32 cutValue = _cutValue; - unsigned maxLen = (unsigned)_maxLen; - - /* - #define PREF_STEP 1 - if (size > PREF_STEP) - { - UInt32 delta = hash[PREF_STEP - 1]; - if (delta < _cyclicBufferSize) - { - size_t cyc1 = _cyclicBufferPos + PREF_STEP; - CLzRef *pair = son + ((size_t)(cyc1 - delta + ((delta > cyc1) ? _cyclicBufferSize : 0)) << 1); - Byte b = *(cur + PREF_STEP - delta); - _distances[0] = pair[0]; - _distances[1] = b; - } - } - */ - if (cutValue == 0 || delta >= _cyclicBufferSize) - { - *ptr0 = *ptr1 = kEmptyHashValue; - } - else - for (LOG_ITER(g_NumIters_Tree++);;) - { - LOG_ITER(g_NumIters_Loop++); - { - CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + ((_cyclicBufferPos < delta) ? _cyclicBufferSize : 0)) << 1); - const Byte *pb = cur - delta; - unsigned len = (len0 < len1 ? len0 : len1); - UInt32 pair0 = *pair; - if (pb[len] == cur[len]) - { - if (++len != lenLimit && pb[len] == cur[len]) - while (++len != lenLimit) - if (pb[len] != cur[len]) - break; - if (maxLen < len) - { - maxLen = len; - *d++ = (UInt32)len; - *d++ = delta - 1; - if (len == lenLimit) - { - UInt32 pair1 = pair[1]; - *ptr1 = pair0; - *ptr0 = pair1; - break; - } - } - } - { - UInt32 curMatch = pos - delta; - // delta = pos - *pair; - // delta = pos - pair[((UInt32)pb[len] - (UInt32)cur[len]) >> 31]; - if (pb[len] < cur[len]) - { - delta = pos - pair[1]; - *ptr1 = curMatch; - ptr1 = pair + 1; - len1 = len; - } - else - { - delta = pos - *pair; - *ptr0 = curMatch; - ptr0 = pair; - len0 = len; - } - } - } - if (--cutValue == 0 || delta >= _cyclicBufferSize) - { - *ptr0 = *ptr1 = kEmptyHashValue; - break; - } - } - pos++; - _cyclicBufferPos++; - cur++; - { - UInt32 num = (UInt32)(d - _distances); - _distances[-1] = num; - } - } - while (d < limit && --size != 0); - *posRes = pos; - return d; -} +UInt32 * MY_FAST_CALL GetMatchesSpecN_2(const Byte *lenLimit, size_t pos, const Byte *cur, CLzRef *son, + UInt32 _cutValue, UInt32 *d, size_t _maxLen, const UInt32 *hash, const UInt32 *limit, const UInt32 *size, + size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, + UInt32 *posRes); #endif - static void BtGetMatches(CMatchFinderMt *p, UInt32 *d) { UInt32 numProcessed = 0; UInt32 curPos = 2; - UInt32 limit = kMtBtBlockSize - (p->matchMaxLen * 2); // * 2 + /* GetMatchesSpec() functions don't create (len = 1) + in [len, dist] match pairs, if (p->numHashBytes >= 2) + Also we suppose here that (matchMaxLen >= 2). + So the following code for (reserve) is not required + UInt32 reserve = (p->matchMaxLen * 2); + const UInt32 kNumHashBytes_Max = 5; // BT_HASH_BYTES_MAX + if (reserve < kNumHashBytes_Max - 1) + reserve = kNumHashBytes_Max - 1; + const UInt32 limit = kMtBtBlockSize - (reserve); + */ + + const UInt32 limit = kMtBtBlockSize - (p->matchMaxLen * 2); + d[1] = p->hashNumAvail; + + if (p->failure_BT) + { + // printf("\n == 1 BtGetMatches() p->failure_BT\n"); + d[0] = 0; + // d[1] = 0; + return; + } while (curPos < limit) { if (p->hashBufPos == p->hashBufPosLimit) { - MatchFinderMt_GetNextBlock_Hash(p); - d[1] = numProcessed + p->hashNumAvail; - if (p->hashNumAvail >= p->numHashBytes) + // MatchFinderMt_GetNextBlock_Hash(p); + UInt32 avail; + { + const UInt32 bi = MtSync_GetNextBlock(&p->hashSync); + const UInt32 k = GET_HASH_BLOCK_OFFSET(bi); + const UInt32 *h = p->hashBuf + k; + avail = h[1]; + p->hashBufPosLimit = k + h[0]; + p->hashNumAvail = avail; + p->hashBufPos = k + 2; + } + + { + /* we must prevent UInt32 overflow for avail total value, + if avail was increased with new hash block */ + UInt32 availSum = numProcessed + avail; + if (availSum < numProcessed) + availSum = (UInt32)(Int32)-1; + d[1] = availSum; + } + + if (avail >= p->numHashBytes) continue; - d[0] = curPos + p->hashNumAvail; + + // if (p->hashBufPos != p->hashBufPosLimit) exit(1); + + /* (avail < p->numHashBytes) + It means that stream was finished. + And (avail) - is a number of remaining bytes, + we fill (d) for (avail) bytes for LZ_THREAD (receiver). + but we don't update (p->pos) and (p->cyclicBufferPos) here in BT_THREAD */ + + /* here we suppose that we have space enough: + (kMtBtBlockSize - curPos >= p->hashNumAvail) */ + p->hashNumAvail = 0; + d[0] = curPos + avail; d += curPos; - for (; p->hashNumAvail != 0; p->hashNumAvail--) + for (; avail != 0; avail--) *d++ = 0; return; } { UInt32 size = p->hashBufPosLimit - p->hashBufPos; - UInt32 lenLimit = p->matchMaxLen; UInt32 pos = p->pos; UInt32 cyclicBufferPos = p->cyclicBufferPos; + UInt32 lenLimit = p->matchMaxLen; if (lenLimit >= p->hashNumAvail) lenLimit = p->hashNumAvail; { @@ -583,6 +657,14 @@ static void BtGetMatches(CMatchFinderMt *p, UInt32 *d) size = size2; } + if (pos > (UInt32)kMtMaxValForNormalize - size) + { + const UInt32 subValue = (pos - p->cyclicBufferSize); // & ~(UInt32)(kNormalizeAlign - 1); + pos -= subValue; + p->pos = pos; + MatchFinder_Normalize3(subValue, p->son, (size_t)p->cyclicBufferSize * 2); + } + #ifndef MFMT_GM_INLINE while (curPos < limit && size-- != 0) { @@ -598,21 +680,45 @@ static void BtGetMatches(CMatchFinderMt *p, UInt32 *d) } #else { - UInt32 posRes; - curPos = (UInt32)(GetMatchesSpecN(lenLimit, pos, p->buffer, p->son, cyclicBufferPos, p->cyclicBufferSize, p->cutValue, - d + curPos, p->numHashBytes - 1, p->hashBuf + p->hashBufPos, - d + limit, - size, &posRes) - d); - p->hashBufPos += posRes - pos; - cyclicBufferPos += posRes - pos; - p->buffer += posRes - pos; - pos = posRes; + UInt32 posRes = pos; + const UInt32 *d_end; + { + d_end = GetMatchesSpecN_2( + p->buffer + lenLimit - 1, + pos, p->buffer, p->son, p->cutValue, d + curPos, + p->numHashBytes - 1, p->hashBuf + p->hashBufPos, + d + limit, p->hashBuf + p->hashBufPos + size, + cyclicBufferPos, p->cyclicBufferSize, + &posRes); + } + { + if (!d_end) + { + // printf("\n == 2 BtGetMatches() p->failure_BT\n"); + // internal data failure + p->failure_BT = True; + d[0] = 0; + // d[1] = 0; + return; + } + } + curPos = (UInt32)(d_end - d); + { + const UInt32 processed = posRes - pos; + pos = posRes; + p->hashBufPos += processed; + cyclicBufferPos += processed; + p->buffer += processed; + } } #endif - numProcessed += pos - p->pos; - p->hashNumAvail -= pos - p->pos; - p->pos = pos; + { + const UInt32 processed = pos - p->pos; + numProcessed += processed; + p->hashNumAvail -= processed; + p->pos = pos; + } if (cyclicBufferPos == p->cyclicBufferSize) cyclicBufferPos = 0; p->cyclicBufferPos = cyclicBufferPos; @@ -622,31 +728,28 @@ static void BtGetMatches(CMatchFinderMt *p, UInt32 *d) d[0] = curPos; } + static void BtFillBlock(CMatchFinderMt *p, UInt32 globalBlockIndex) { CMtSync *sync = &p->hashSync; + + BUFFER_MUST_BE_UNLOCKED(sync) + if (!sync->needStart) { - CriticalSection_Enter(&sync->cs); - sync->csWasEntered = True; + LOCK_BUFFER(sync) } - BtGetMatches(p, p->btBuf + (globalBlockIndex & kMtBtNumBlocksMask) * kMtBtBlockSize); - - if (p->pos > kMtMaxValForNormalize - kMtBtBlockSize) - { - UInt32 subValue = p->pos - p->cyclicBufferSize; - MatchFinder_Normalize3(subValue, p->son, (size_t)p->cyclicBufferSize * 2); - p->pos -= subValue; - } + BtGetMatches(p, p->btBuf + GET_BT_BLOCK_OFFSET(globalBlockIndex)); + + /* We suppose that we have called GetNextBlock() from start. + So buffer is LOCKED */ - if (!sync->needStart) - { - CriticalSection_Leave(&sync->cs); - sync->csWasEntered = False; - } + UNLOCK_BUFFER(sync) } + +MY_NO_INLINE static void BtThreadFunc(CMatchFinderMt *mt) { CMtSync *p = &mt->btSync; @@ -654,25 +757,35 @@ static void BtThreadFunc(CMatchFinderMt *mt) { UInt32 blockIndex = 0; Event_Wait(&p->canStart); - Event_Set(&p->wasStarted); + for (;;) { + PRF(printf(" BT thread block = %d pos = %d\n", (unsigned)blockIndex, mt->pos)); + /* (p->exit == true) is possible after (p->canStart) at first loop iteration + and is unexpected after more Wait(freeSemaphore) iterations */ if (p->exit) return; + + Semaphore_Wait(&p->freeSemaphore); + + // for faster stop : we check (p->stopWriting) after Wait(freeSemaphore) if (p->stopWriting) - { - p->numProcessedBlocks = blockIndex; - MtSync_StopWriting(&mt->hashSync); - Event_Set(&p->wasStopped); break; - } - Semaphore_Wait(&p->freeSemaphore); + BtFillBlock(mt, blockIndex++); + Semaphore_Release1(&p->filledSemaphore); } + + // we stop HASH_THREAD here + MtSync_StopWriting(&mt->hashSync); + + // p->numBlocks_Sent = blockIndex; + Event_Set(&p->wasStopped); } } + void MatchFinderMt_Construct(CMatchFinderMt *p) { p->hashBuf = NULL; @@ -688,24 +801,39 @@ static void MatchFinderMt_FreeMem(CMatchFinderMt *p, ISzAllocPtr alloc) void MatchFinderMt_Destruct(CMatchFinderMt *p, ISzAllocPtr alloc) { - MtSync_Destruct(&p->hashSync); + /* + HASH_THREAD can use CriticalSection(s) btSync.cs and hashSync.cs. + So we must be sure that HASH_THREAD will not use CriticalSection(s) + after deleting CriticalSection here. + + we call ReleaseStream(p) + that calls StopWriting(btSync) + that calls StopWriting(hashSync), if it's required to stop HASH_THREAD. + after StopWriting() it's safe to destruct MtSync(s) in any order */ + + MatchFinderMt_ReleaseStream(p); + MtSync_Destruct(&p->btSync); + MtSync_Destruct(&p->hashSync); LOG_ITER( - printf("\nTree %9d * %7d iter = %9d sum \n", + printf("\nTree %9d * %7d iter = %9d = sum : bytes = %9d\n", (UInt32)(g_NumIters_Tree / 1000), (UInt32)(((UInt64)g_NumIters_Loop * 1000) / (g_NumIters_Tree + 1)), - (UInt32)(g_NumIters_Loop / 1000) + (UInt32)(g_NumIters_Loop / 1000), + (UInt32)(g_NumIters_Bytes / 1000) )); MatchFinderMt_FreeMem(p, alloc); } + #define kHashBufferSize (kMtHashBlockSize * kMtHashNumBlocks) #define kBtBufferSize (kMtBtBlockSize * kMtBtNumBlocks) -static THREAD_FUNC_RET_TYPE THREAD_FUNC_CALL_TYPE HashThreadFunc2(void *p) { HashThreadFunc((CMatchFinderMt *)p); return 0; } -static THREAD_FUNC_RET_TYPE THREAD_FUNC_CALL_TYPE BtThreadFunc2(void *p) + +static THREAD_FUNC_DECL HashThreadFunc2(void *p) { HashThreadFunc((CMatchFinderMt *)p); return 0; } +static THREAD_FUNC_DECL BtThreadFunc2(void *p) { Byte allocaDummy[0x180]; unsigned i = 0; @@ -716,16 +844,17 @@ static THREAD_FUNC_RET_TYPE THREAD_FUNC_CALL_TYPE BtThreadFunc2(void *p) return 0; } + SRes MatchFinderMt_Create(CMatchFinderMt *p, UInt32 historySize, UInt32 keepAddBufferBefore, UInt32 matchMaxLen, UInt32 keepAddBufferAfter, ISzAllocPtr alloc) { - CMatchFinder *mf = p->MatchFinder; + CMatchFinder *mf = MF(p); p->historySize = historySize; if (kMtBtBlockSize <= matchMaxLen * 4) return SZ_ERROR_PARAM; if (!p->hashBuf) { - p->hashBuf = (UInt32 *)ISzAlloc_Alloc(alloc, (kHashBufferSize + kBtBufferSize) * sizeof(UInt32)); + p->hashBuf = (UInt32 *)ISzAlloc_Alloc(alloc, ((size_t)kHashBufferSize + (size_t)kBtBufferSize) * sizeof(UInt32)); if (!p->hashBuf) return SZ_ERROR_MEM; p->btBuf = p->hashBuf + kHashBufferSize; @@ -735,101 +864,163 @@ SRes MatchFinderMt_Create(CMatchFinderMt *p, UInt32 historySize, UInt32 keepAddB if (!MatchFinder_Create(mf, historySize, keepAddBufferBefore, matchMaxLen, keepAddBufferAfter, alloc)) return SZ_ERROR_MEM; - RINOK(MtSync_Create(&p->hashSync, HashThreadFunc2, p, kMtHashNumBlocks)); - RINOK(MtSync_Create(&p->btSync, BtThreadFunc2, p, kMtBtNumBlocks)); + RINOK(MtSync_Create(&p->hashSync, HashThreadFunc2, p)); + RINOK(MtSync_Create(&p->btSync, BtThreadFunc2, p)); return SZ_OK; } -/* Call it after ReleaseStream / SetStream */ + +SRes MatchFinderMt_InitMt(CMatchFinderMt *p) +{ + RINOK(MtSync_Init(&p->hashSync, kMtHashNumBlocks)); + return MtSync_Init(&p->btSync, kMtBtNumBlocks); +} + + static void MatchFinderMt_Init(CMatchFinderMt *p) { - CMatchFinder *mf = p->MatchFinder; + CMatchFinder *mf = MF(p); p->btBufPos = - p->btBufPosLimit = 0; + p->btBufPosLimit = NULL; p->hashBufPos = p->hashBufPosLimit = 0; + p->hashNumAvail = 0; // 21.03 + + p->failure_BT = False; /* Init without data reading. We don't want to read data in this thread */ - MatchFinder_Init_3(mf, False); + MatchFinder_Init_4(mf); + MatchFinder_Init_LowHash(mf); p->pointerToCurPos = Inline_MatchFinder_GetPointerToCurrentPos(mf); p->btNumAvailBytes = 0; - p->lzPos = p->historySize + 1; + p->failure_LZ_BT = False; + // p->failure_LZ_LZ = False; + + p->lzPos = + 1; // optimal smallest value + // 0; // for debug: ignores match to start + // kNormalizeAlign; // for debug p->hash = mf->hash; p->fixedHashSize = mf->fixedHashSize; // p->hash4Mask = mf->hash4Mask; p->crc = mf->crc; + // memcpy(p->crc, mf->crc, sizeof(mf->crc)); p->son = mf->son; p->matchMaxLen = mf->matchMaxLen; p->numHashBytes = mf->numHashBytes; - p->pos = mf->pos; - p->buffer = mf->buffer; - p->cyclicBufferPos = mf->cyclicBufferPos; + + /* (mf->pos) and (mf->streamPos) were already initialized to 1 in MatchFinder_Init_4() */ + // mf->streamPos = mf->pos = 1; // optimal smallest value + // 0; // for debug: ignores match to start + // kNormalizeAlign; // for debug + + /* we must init (p->pos = mf->pos) for BT, because + BT code needs (p->pos == delta_value_for_empty_hash_record == mf->pos) */ + p->pos = mf->pos; // do not change it + + p->cyclicBufferPos = (p->pos - CYC_TO_POS_OFFSET); p->cyclicBufferSize = mf->cyclicBufferSize; + p->buffer = mf->buffer; p->cutValue = mf->cutValue; + // p->son[0] = p->son[1] = 0; // unused: to init skipped record for speculated accesses. } + /* ReleaseStream is required to finish multithreading */ void MatchFinderMt_ReleaseStream(CMatchFinderMt *p) { + // Sleep(1); // for debug MtSync_StopWriting(&p->btSync); + // Sleep(200); // for debug /* p->MatchFinder->ReleaseStream(); */ } MY_NO_INLINE -static void MatchFinderMt_GetNextBlock_Bt(CMatchFinderMt *p) +static UInt32 MatchFinderMt_GetNextBlock_Bt(CMatchFinderMt *p) { - UInt32 blockIndex, k; - - MtSync_GetNextBlock(&p->btSync); - - blockIndex = ((p->btSync.numProcessedBlocks - 1) & kMtBtNumBlocksMask); - k = blockIndex * kMtBtBlockSize; - p->btBufPosLimit = k + p->btBuf[k]; - p->btNumAvailBytes = p->btBuf[k + 1]; - p->btBufPos = k + 2; - if (p->lzPos >= kMtMaxValForNormalize - kMtBtBlockSize) + if (p->failure_LZ_BT) + p->btBufPos = p->failureBuf; + else { - MatchFinder_Normalize3(p->lzPos - p->historySize - 1, p->hash, p->fixedHashSize); - p->lzPos = p->historySize + 1; + const UInt32 bi = MtSync_GetNextBlock(&p->btSync); + const UInt32 *bt = p->btBuf + GET_BT_BLOCK_OFFSET(bi); + { + const UInt32 numItems = bt[0]; + p->btBufPosLimit = bt + numItems; + p->btNumAvailBytes = bt[1]; + p->btBufPos = bt + 2; + if (numItems < 2 || numItems > kMtBtBlockSize) + { + p->failureBuf[0] = 0; + p->btBufPos = p->failureBuf; + p->btBufPosLimit = p->failureBuf + 1; + p->failure_LZ_BT = True; + // p->btNumAvailBytes = 0; + /* we don't want to decrease AvailBytes, that was load before. + that can be unxepected for the code that have loaded anopther value before */ + } + } + + if (p->lzPos >= (UInt32)kMtMaxValForNormalize - (UInt32)kMtBtBlockSize) + { + /* we don't check (lzPos) over exact avail bytes in (btBuf). + (fixedHashSize) is small, so normalization is fast */ + const UInt32 subValue = (p->lzPos - p->historySize - 1); // & ~(UInt32)(kNormalizeAlign - 1); + p->lzPos -= subValue; + MatchFinder_Normalize3(subValue, p->hash, p->fixedHashSize); + } } + return p->btNumAvailBytes; } + + static const Byte * MatchFinderMt_GetPointerToCurrentPos(CMatchFinderMt *p) { return p->pointerToCurPos; } + #define GET_NEXT_BLOCK_IF_REQUIRED if (p->btBufPos == p->btBufPosLimit) MatchFinderMt_GetNextBlock_Bt(p); + static UInt32 MatchFinderMt_GetNumAvailableBytes(CMatchFinderMt *p) { - GET_NEXT_BLOCK_IF_REQUIRED; - return p->btNumAvailBytes; + if (p->btBufPos != p->btBufPosLimit) + return p->btNumAvailBytes; + return MatchFinderMt_GetNextBlock_Bt(p); } + +// #define CHECK_FAILURE_LZ(_match_, _pos_) if (_match_ >= _pos_) { p->failure_LZ_LZ = True; return d; } +#define CHECK_FAILURE_LZ(_match_, _pos_) + static UInt32 * MixMatches2(CMatchFinderMt *p, UInt32 matchMinPos, UInt32 *d) { UInt32 h2, c2; UInt32 *hash = p->hash; const Byte *cur = p->pointerToCurPos; - UInt32 m = p->lzPos; + const UInt32 m = p->lzPos; MT_HASH2_CALC c2 = hash[h2]; hash[h2] = m; if (c2 >= matchMinPos) + { + CHECK_FAILURE_LZ(c2, m) if (cur[(ptrdiff_t)c2 - (ptrdiff_t)m] == cur[0]) { *d++ = 2; *d++ = m - c2 - 1; } + } return d; } @@ -839,7 +1030,7 @@ static UInt32 * MixMatches3(CMatchFinderMt *p, UInt32 matchMinPos, UInt32 *d) UInt32 h2, h3, c2, c3; UInt32 *hash = p->hash; const Byte *cur = p->pointerToCurPos; - UInt32 m = p->lzPos; + const UInt32 m = p->lzPos; MT_HASH3_CALC c2 = hash[h2]; @@ -848,22 +1039,30 @@ static UInt32 * MixMatches3(CMatchFinderMt *p, UInt32 matchMinPos, UInt32 *d) hash[h2] = m; (hash + kFix3HashSize)[h3] = m; - if (c2 >= matchMinPos && cur[(ptrdiff_t)c2 - (ptrdiff_t)m] == cur[0]) + if (c2 >= matchMinPos) { - d[1] = m - c2 - 1; - if (cur[(ptrdiff_t)c2 - (ptrdiff_t)m + 2] == cur[2]) + CHECK_FAILURE_LZ(c2, m) + if (cur[(ptrdiff_t)c2 - (ptrdiff_t)m] == cur[0]) { - d[0] = 3; - return d + 2; + d[1] = m - c2 - 1; + if (cur[(ptrdiff_t)c2 - (ptrdiff_t)m + 2] == cur[2]) + { + d[0] = 3; + return d + 2; + } + d[0] = 2; + d += 2; } - d[0] = 2; - d += 2; } - if (c3 >= matchMinPos && cur[(ptrdiff_t)c3 - (ptrdiff_t)m] == cur[0]) + if (c3 >= matchMinPos) { - *d++ = 3; - *d++ = m - c3 - 1; + CHECK_FAILURE_LZ(c3, m) + if (cur[(ptrdiff_t)c3 - (ptrdiff_t)m] == cur[0]) + { + *d++ = 3; + *d++ = m - c3 - 1; + } } return d; @@ -874,30 +1073,37 @@ static UInt32 * MixMatches3(CMatchFinderMt *p, UInt32 matchMinPos, UInt32 *d) /* static -UInt32 MatchFinderMt_GetMatches_Bt4(CMatchFinderMt *p, UInt32 *d) +UInt32* MatchFinderMt_GetMatches_Bt4(CMatchFinderMt *p, UInt32 *d) { - UInt32 pos = p->btBufPos; - const UInt32 *bt = p->btBuf + pos; - UInt32 len = *bt++; + const UInt32 *bt = p->btBufPos; + const UInt32 len = *bt++; + const UInt32 *btLim = bt + len; UInt32 matchMinPos; - const UInt32 *d_base = d; UInt32 avail = p->btNumAvailBytes - 1; - p->btBufPos = pos + 1 + len; + p->btBufPos = btLim; { - UInt32 temp1 = p->historySize; p->btNumAvailBytes = avail; #define BT_HASH_BYTES_MAX 5 + + matchMinPos = p->lzPos; if (len != 0) - temp1 = bt[1]; - else if (avail < (BT_HASH_BYTES_MAX - 2)) + matchMinPos -= bt[1]; + else if (avail < (BT_HASH_BYTES_MAX - 1) - 1) { INCREASE_LZ_POS - return 0; + return d; + } + else + { + const UInt32 hs = p->historySize; + if (matchMinPos > hs) + matchMinPos -= hs; + else + matchMinPos = 1; } - matchMinPos = p->lzPos - temp1; } for (;;) @@ -942,17 +1148,17 @@ UInt32 MatchFinderMt_GetMatches_Bt4(CMatchFinderMt *p, UInt32 *d) { do { - UInt32 v0 = bt[0]; - UInt32 v1 = bt[1]; + const UInt32 v0 = bt[0]; + const UInt32 v1 = bt[1]; bt += 2; d[0] = v0; d[1] = v1; d += 2; } - while ((len -= 2) != 0); + while (bt != btLim); } INCREASE_LZ_POS - return (UInt32)(d - d_base); + return d; } */ @@ -962,7 +1168,7 @@ static UInt32 *MixMatches4(CMatchFinderMt *p, UInt32 matchMinPos, UInt32 *d) UInt32 h2, h3, /* h4, */ c2, c3 /* , c4 */; UInt32 *hash = p->hash; const Byte *cur = p->pointerToCurPos; - UInt32 m = p->lzPos; + const UInt32 m = p->lzPos; MT_HASH3_CALC // MT_HASH4_CALC c2 = hash[h2]; @@ -1038,43 +1244,49 @@ static UInt32 *MixMatches4(CMatchFinderMt *p, UInt32 matchMinPos, UInt32 *d) } -static UInt32 MatchFinderMt2_GetMatches(CMatchFinderMt *p, UInt32 *d) +static UInt32* MatchFinderMt2_GetMatches(CMatchFinderMt *p, UInt32 *d) { - const UInt32 *bt = p->btBuf + p->btBufPos; - UInt32 len = *bt++; - p->btBufPos += 1 + len; + const UInt32 *bt = p->btBufPos; + const UInt32 len = *bt++; + const UInt32 *btLim = bt + len; + p->btBufPos = btLim; p->btNumAvailBytes--; + INCREASE_LZ_POS { - UInt32 i; - for (i = 0; i < len; i += 2) + while (bt != btLim) { - UInt32 v0 = bt[0]; - UInt32 v1 = bt[1]; + const UInt32 v0 = bt[0]; + const UInt32 v1 = bt[1]; bt += 2; d[0] = v0; d[1] = v1; d += 2; } } - INCREASE_LZ_POS - return len; + return d; } -static UInt32 MatchFinderMt_GetMatches(CMatchFinderMt *p, UInt32 *d) +static UInt32* MatchFinderMt_GetMatches(CMatchFinderMt *p, UInt32 *d) { - UInt32 pos = p->btBufPos; - const UInt32 *bt = p->btBuf + pos; + const UInt32 *bt = p->btBufPos; UInt32 len = *bt++; - UInt32 avail = p->btNumAvailBytes - 1; + const UInt32 avail = p->btNumAvailBytes - 1; p->btNumAvailBytes = avail; - p->btBufPos = pos + 1 + len; + p->btBufPos = bt + len; if (len == 0) { #define BT_HASH_BYTES_MAX 5 if (avail >= (BT_HASH_BYTES_MAX - 1) - 1) - len = (UInt32)(p->MixMatchesFunc(p, p->lzPos - p->historySize, d) - d); + { + UInt32 m = p->lzPos; + if (m > p->historySize) + m -= p->historySize; + else + m = 1; + d = p->MixMatchesFunc(p, m, d); + } } else { @@ -1083,27 +1295,26 @@ static UInt32 MatchFinderMt_GetMatches(CMatchFinderMt *p, UInt32 *d) (match_len >= numHashBytes). MixMatchesFunc() inserts only hash matches that are nearer than (match_dist) */ - UInt32 *d2; - d2 = p->MixMatchesFunc(p, p->lzPos - bt[1], d); + d = p->MixMatchesFunc(p, p->lzPos - bt[1], d); + // if (d) // check for failure do { - UInt32 v0 = bt[0]; - UInt32 v1 = bt[1]; + const UInt32 v0 = bt[0]; + const UInt32 v1 = bt[1]; bt += 2; - d2[0] = v0; - d2[1] = v1; - d2 += 2; + d[0] = v0; + d[1] = v1; + d += 2; } - while ((len -= 2) != 0); - len = (UInt32)(d2 - d); + while (len -= 2); } INCREASE_LZ_POS - return len; + return d; } #define SKIP_HEADER2_MT do { GET_NEXT_BLOCK_IF_REQUIRED #define SKIP_HEADER_MT(n) SKIP_HEADER2_MT if (p->btNumAvailBytes-- >= (n)) { const Byte *cur = p->pointerToCurPos; UInt32 *hash = p->hash; -#define SKIP_FOOTER_MT } INCREASE_LZ_POS p->btBufPos += p->btBuf[p->btBufPos] + 1; } while (--num != 0); +#define SKIP_FOOTER_MT } INCREASE_LZ_POS p->btBufPos += (size_t)*p->btBufPos + 1; } while (--num != 0); static void MatchFinderMt0_Skip(CMatchFinderMt *p, UInt32 num) { @@ -1131,10 +1342,14 @@ static void MatchFinderMt3_Skip(CMatchFinderMt *p, UInt32 num) SKIP_FOOTER_MT } +/* +// MatchFinderMt4_Skip() is similar to MatchFinderMt3_Skip(). +// The difference is that MatchFinderMt3_Skip() updates hash for last 3 bytes of stream. + static void MatchFinderMt4_Skip(CMatchFinderMt *p, UInt32 num) { SKIP_HEADER_MT(4) - UInt32 h2, h3 /*, h4 */; + UInt32 h2, h3; // h4 MT_HASH3_CALC // MT_HASH4_CALC // (hash + kFix4HashSize)[h4] = @@ -1143,15 +1358,16 @@ static void MatchFinderMt4_Skip(CMatchFinderMt *p, UInt32 num) p->lzPos; SKIP_FOOTER_MT } +*/ -void MatchFinderMt_CreateVTable(CMatchFinderMt *p, IMatchFinder *vTable) +void MatchFinderMt_CreateVTable(CMatchFinderMt *p, IMatchFinder2 *vTable) { vTable->Init = (Mf_Init_Func)MatchFinderMt_Init; vTable->GetNumAvailableBytes = (Mf_GetNumAvailableBytes_Func)MatchFinderMt_GetNumAvailableBytes; vTable->GetPointerToCurrentPos = (Mf_GetPointerToCurrentPos_Func)MatchFinderMt_GetPointerToCurrentPos; vTable->GetMatches = (Mf_GetMatches_Func)MatchFinderMt_GetMatches; - switch (p->MatchFinder->numHashBytes) + switch (MF(p)->numHashBytes) { case 2: p->GetHeadsFunc = GetHeads2; @@ -1160,12 +1376,12 @@ void MatchFinderMt_CreateVTable(CMatchFinderMt *p, IMatchFinder *vTable) vTable->GetMatches = (Mf_GetMatches_Func)MatchFinderMt2_GetMatches; break; case 3: - p->GetHeadsFunc = p->MatchFinder->bigHash ? GetHeads3b : GetHeads3; + p->GetHeadsFunc = MF(p)->bigHash ? GetHeads3b : GetHeads3; p->MixMatchesFunc = (Mf_Mix_Matches)MixMatches2; vTable->Skip = (Mf_Skip_Func)MatchFinderMt2_Skip; break; case 4: - p->GetHeadsFunc = p->MatchFinder->bigHash ? GetHeads4b : GetHeads4; + p->GetHeadsFunc = MF(p)->bigHash ? GetHeads4b : GetHeads4; // it's fast inline version of GetMatches() // vTable->GetMatches = (Mf_GetMatches_Func)MatchFinderMt_GetMatches_Bt4; @@ -1174,9 +1390,11 @@ void MatchFinderMt_CreateVTable(CMatchFinderMt *p, IMatchFinder *vTable) vTable->Skip = (Mf_Skip_Func)MatchFinderMt3_Skip; break; default: - p->GetHeadsFunc = p->MatchFinder->bigHash ? GetHeads5b : GetHeads5; + p->GetHeadsFunc = MF(p)->bigHash ? GetHeads5b : GetHeads5; p->MixMatchesFunc = (Mf_Mix_Matches)MixMatches4; - vTable->Skip = (Mf_Skip_Func)MatchFinderMt4_Skip; + vTable->Skip = + (Mf_Skip_Func)MatchFinderMt3_Skip; + // (Mf_Skip_Func)MatchFinderMt4_Skip; break; } } diff --git a/deps/LZMA-SDK/C/LzFindMt.h b/deps/LZMA-SDK/C/LzFindMt.h index 888c787cb..ee9a1b6fe 100644 --- a/deps/LZMA-SDK/C/LzFindMt.h +++ b/deps/LZMA-SDK/C/LzFindMt.h @@ -1,5 +1,5 @@ /* LzFindMt.h -- multithreaded Match finder for LZ algorithms -2019-11-05 : Igor Pavlov : Public domain */ +2021-07-12 : Igor Pavlov : Public domain */ #ifndef __LZ_FIND_MT_H #define __LZ_FIND_MT_H @@ -11,22 +11,24 @@ EXTERN_C_BEGIN typedef struct _CMtSync { + UInt32 numProcessedBlocks; + CThread thread; + UInt64 affinity; + BoolInt wasCreated; BoolInt needStart; + BoolInt csWasInitialized; + BoolInt csWasEntered; + BoolInt exit; BoolInt stopWriting; - CThread thread; CAutoResetEvent canStart; - CAutoResetEvent wasStarted; CAutoResetEvent wasStopped; CSemaphore freeSemaphore; CSemaphore filledSemaphore; - BoolInt csWasInitialized; - BoolInt csWasEntered; CCriticalSection cs; - UInt32 numProcessedBlocks; - UInt64 affinity; + // UInt32 numBlocks_Sent; } CMtSync; typedef UInt32 * (*Mf_Mix_Matches)(void *p, UInt32 matchMinPos, UInt32 *distances); @@ -42,8 +44,8 @@ typedef struct _CMatchFinderMt /* LZ */ const Byte *pointerToCurPos; UInt32 *btBuf; - UInt32 btBufPos; - UInt32 btBufPosLimit; + const UInt32 *btBufPos; + const UInt32 *btBufPosLimit; UInt32 lzPos; UInt32 btNumAvailBytes; @@ -54,6 +56,10 @@ typedef struct _CMatchFinderMt const UInt32 *crc; Mf_Mix_Matches MixMatchesFunc; + UInt32 failure_LZ_BT; // failure in BT transfered to LZ + // UInt32 failure_LZ_LZ; // failure in LZ tables + UInt32 failureBuf[1]; + // UInt32 crc[256]; /* LZ + BT */ CMtSync btSync; @@ -64,6 +70,8 @@ typedef struct _CMatchFinderMt UInt32 hashBufPos; UInt32 hashBufPosLimit; UInt32 hashNumAvail; + UInt32 failure_BT; + CLzRef *son; UInt32 matchMaxLen; @@ -71,7 +79,7 @@ typedef struct _CMatchFinderMt UInt32 pos; const Byte *buffer; UInt32 cyclicBufferPos; - UInt32 cyclicBufferSize; /* it must be historySize + 1 */ + UInt32 cyclicBufferSize; /* it must be = (historySize + 1) */ UInt32 cutValue; /* BT + Hash */ @@ -81,13 +89,19 @@ typedef struct _CMatchFinderMt /* Hash */ Mf_GetHeads GetHeadsFunc; CMatchFinder *MatchFinder; + // CMatchFinder MatchFinder; } CMatchFinderMt; +// only for Mt part void MatchFinderMt_Construct(CMatchFinderMt *p); void MatchFinderMt_Destruct(CMatchFinderMt *p, ISzAllocPtr alloc); + SRes MatchFinderMt_Create(CMatchFinderMt *p, UInt32 historySize, UInt32 keepAddBufferBefore, UInt32 matchMaxLen, UInt32 keepAddBufferAfter, ISzAllocPtr alloc); -void MatchFinderMt_CreateVTable(CMatchFinderMt *p, IMatchFinder *vTable); +void MatchFinderMt_CreateVTable(CMatchFinderMt *p, IMatchFinder2 *vTable); + +/* call MatchFinderMt_InitMt() before IMatchFinder::Init() */ +SRes MatchFinderMt_InitMt(CMatchFinderMt *p); void MatchFinderMt_ReleaseStream(CMatchFinderMt *p); EXTERN_C_END diff --git a/deps/LZMA-SDK/C/LzFindOpt.c b/deps/LZMA-SDK/C/LzFindOpt.c new file mode 100644 index 000000000..dbb6ad9fc --- /dev/null +++ b/deps/LZMA-SDK/C/LzFindOpt.c @@ -0,0 +1,578 @@ +/* LzFindOpt.c -- multithreaded Match finder for LZ algorithms +2021-07-13 : Igor Pavlov : Public domain */ + +#include "Precomp.h" + +#include "CpuArch.h" +#include "LzFind.h" + +// #include "LzFindMt.h" + +// #define LOG_ITERS + +// #define LOG_THREAD + +#ifdef LOG_THREAD +#include +#define PRF(x) x +#else +// #define PRF(x) +#endif + +#ifdef LOG_ITERS +#include +UInt64 g_NumIters_Tree; +UInt64 g_NumIters_Loop; +UInt64 g_NumIters_Bytes; +#define LOG_ITER(x) x +#else +#define LOG_ITER(x) +#endif + +// ---------- BT THREAD ---------- + +#define USE_SON_PREFETCH +#define USE_LONG_MATCH_OPT + +#define kEmptyHashValue 0 + +// #define CYC_TO_POS_OFFSET 0 + +// #define CYC_TO_POS_OFFSET 1 // for debug + +/* +MY_NO_INLINE +UInt32 * MY_FAST_CALL GetMatchesSpecN_1(const Byte *lenLimit, size_t pos, const Byte *cur, CLzRef *son, + UInt32 _cutValue, UInt32 *d, size_t _maxLen, const UInt32 *hash, const UInt32 *limit, const UInt32 *size, UInt32 *posRes) +{ + do + { + UInt32 delta; + if (hash == size) + break; + delta = *hash++; + + if (delta == 0 || delta > (UInt32)pos) + return NULL; + + lenLimit++; + + if (delta == (UInt32)pos) + { + CLzRef *ptr1 = son + ((size_t)pos << 1) - CYC_TO_POS_OFFSET * 2; + *d++ = 0; + ptr1[0] = kEmptyHashValue; + ptr1[1] = kEmptyHashValue; + } +else +{ + UInt32 *_distances = ++d; + + CLzRef *ptr0 = son + ((size_t)(pos) << 1) - CYC_TO_POS_OFFSET * 2 + 1; + CLzRef *ptr1 = son + ((size_t)(pos) << 1) - CYC_TO_POS_OFFSET * 2; + + const Byte *len0 = cur, *len1 = cur; + UInt32 cutValue = _cutValue; + const Byte *maxLen = cur + _maxLen; + + for (LOG_ITER(g_NumIters_Tree++);;) + { + LOG_ITER(g_NumIters_Loop++); + { + const ptrdiff_t diff = (ptrdiff_t)0 - (ptrdiff_t)delta; + CLzRef *pair = son + ((size_t)(((ptrdiff_t)pos - CYC_TO_POS_OFFSET) + diff) << 1); + const Byte *len = (len0 < len1 ? len0 : len1); + + #ifdef USE_SON_PREFETCH + const UInt32 pair0 = *pair; + #endif + + if (len[diff] == len[0]) + { + if (++len != lenLimit && len[diff] == len[0]) + while (++len != lenLimit) + { + LOG_ITER(g_NumIters_Bytes++); + if (len[diff] != len[0]) + break; + } + if (maxLen < len) + { + maxLen = len; + *d++ = (UInt32)(len - cur); + *d++ = delta - 1; + + if (len == lenLimit) + { + const UInt32 pair1 = pair[1]; + *ptr1 = + #ifdef USE_SON_PREFETCH + pair0; + #else + pair[0]; + #endif + *ptr0 = pair1; + + _distances[-1] = (UInt32)(d - _distances); + + #ifdef USE_LONG_MATCH_OPT + + if (hash == size || *hash != delta || lenLimit[diff] != lenLimit[0] || d >= limit) + break; + + { + for (;;) + { + hash++; + pos++; + cur++; + lenLimit++; + { + CLzRef *ptr = son + ((size_t)(pos) << 1) - CYC_TO_POS_OFFSET * 2; + #if 0 + *(UInt64 *)(void *)ptr = ((const UInt64 *)(const void *)ptr)[diff]; + #else + const UInt32 p0 = ptr[0 + (diff * 2)]; + const UInt32 p1 = ptr[1 + (diff * 2)]; + ptr[0] = p0; + ptr[1] = p1; + // ptr[0] = ptr[0 + (diff * 2)]; + // ptr[1] = ptr[1 + (diff * 2)]; + #endif + } + // PrintSon(son + 2, pos - 1); + // printf("\npos = %x delta = %x\n", pos, delta); + len++; + *d++ = 2; + *d++ = (UInt32)(len - cur); + *d++ = delta - 1; + if (hash == size || *hash != delta || lenLimit[diff] != lenLimit[0] || d >= limit) + break; + } + } + #endif + + break; + } + } + } + + { + const UInt32 curMatch = (UInt32)pos - delta; // (UInt32)(pos + diff); + if (len[diff] < len[0]) + { + delta = pair[1]; + if (delta >= curMatch) + return NULL; + *ptr1 = curMatch; + ptr1 = pair + 1; + len1 = len; + } + else + { + delta = *pair; + if (delta >= curMatch) + return NULL; + *ptr0 = curMatch; + ptr0 = pair; + len0 = len; + } + + delta = (UInt32)pos - delta; + + if (--cutValue == 0 || delta >= pos) + { + *ptr0 = *ptr1 = kEmptyHashValue; + _distances[-1] = (UInt32)(d - _distances); + break; + } + } + } + } // for (tree iterations) +} + pos++; + cur++; + } + while (d < limit); + *posRes = (UInt32)pos; + return d; +} +*/ + +/* define cbs if you use 2 functions. + GetMatchesSpecN_1() : (pos < _cyclicBufferSize) + GetMatchesSpecN_2() : (pos >= _cyclicBufferSize) + + do not define cbs if you use 1 function: + GetMatchesSpecN_2() +*/ + +// #define cbs _cyclicBufferSize + +/* + we use size_t for (pos) and (_cyclicBufferPos_ instead of UInt32 + to eliminate "movsx" BUG in old MSVC x64 compiler. +*/ + +UInt32 * MY_FAST_CALL GetMatchesSpecN_2(const Byte *lenLimit, size_t pos, const Byte *cur, CLzRef *son, + UInt32 _cutValue, UInt32 *d, size_t _maxLen, const UInt32 *hash, const UInt32 *limit, const UInt32 *size, + size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, + UInt32 *posRes); + +MY_NO_INLINE +UInt32 * MY_FAST_CALL GetMatchesSpecN_2(const Byte *lenLimit, size_t pos, const Byte *cur, CLzRef *son, + UInt32 _cutValue, UInt32 *d, size_t _maxLen, const UInt32 *hash, const UInt32 *limit, const UInt32 *size, + size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, + UInt32 *posRes) +{ + do // while (hash != size) + { + UInt32 delta; + + #ifndef cbs + UInt32 cbs; + #endif + + if (hash == size) + break; + + delta = *hash++; + + if (delta == 0) + return NULL; + + lenLimit++; + + #ifndef cbs + cbs = _cyclicBufferSize; + if ((UInt32)pos < cbs) + { + if (delta > (UInt32)pos) + return NULL; + cbs = (UInt32)pos; + } + #endif + + if (delta >= cbs) + { + CLzRef *ptr1 = son + ((size_t)_cyclicBufferPos << 1); + *d++ = 0; + ptr1[0] = kEmptyHashValue; + ptr1[1] = kEmptyHashValue; + } +else +{ + UInt32 *_distances = ++d; + + CLzRef *ptr0 = son + ((size_t)_cyclicBufferPos << 1) + 1; + CLzRef *ptr1 = son + ((size_t)_cyclicBufferPos << 1); + + UInt32 cutValue = _cutValue; + const Byte *len0 = cur, *len1 = cur; + const Byte *maxLen = cur + _maxLen; + + // if (cutValue == 0) { *ptr0 = *ptr1 = kEmptyHashValue; } else + for (LOG_ITER(g_NumIters_Tree++);;) + { + LOG_ITER(g_NumIters_Loop++); + { + // SPEC code + CLzRef *pair = son + ((size_t)((ptrdiff_t)_cyclicBufferPos - (ptrdiff_t)delta + + (ptrdiff_t)(UInt32)(_cyclicBufferPos < delta ? cbs : 0) + ) << 1); + + const ptrdiff_t diff = (ptrdiff_t)0 - (ptrdiff_t)delta; + const Byte *len = (len0 < len1 ? len0 : len1); + + #ifdef USE_SON_PREFETCH + const UInt32 pair0 = *pair; + #endif + + if (len[diff] == len[0]) + { + if (++len != lenLimit && len[diff] == len[0]) + while (++len != lenLimit) + { + LOG_ITER(g_NumIters_Bytes++); + if (len[diff] != len[0]) + break; + } + if (maxLen < len) + { + maxLen = len; + *d++ = (UInt32)(len - cur); + *d++ = delta - 1; + + if (len == lenLimit) + { + const UInt32 pair1 = pair[1]; + *ptr1 = + #ifdef USE_SON_PREFETCH + pair0; + #else + pair[0]; + #endif + *ptr0 = pair1; + + _distances[-1] = (UInt32)(d - _distances); + + #ifdef USE_LONG_MATCH_OPT + + if (hash == size || *hash != delta || lenLimit[diff] != lenLimit[0] || d >= limit) + break; + + { + for (;;) + { + *d++ = 2; + *d++ = (UInt32)(lenLimit - cur); + *d++ = delta - 1; + cur++; + lenLimit++; + // SPEC + _cyclicBufferPos++; + { + // SPEC code + CLzRef *dest = son + ((size_t)(_cyclicBufferPos) << 1); + const CLzRef *src = dest + ((diff + + (ptrdiff_t)(UInt32)((_cyclicBufferPos < delta) ? cbs : 0)) << 1); + // CLzRef *ptr = son + ((size_t)(pos) << 1) - CYC_TO_POS_OFFSET * 2; + #if 0 + *(UInt64 *)(void *)dest = *((const UInt64 *)(const void *)src); + #else + const UInt32 p0 = src[0]; + const UInt32 p1 = src[1]; + dest[0] = p0; + dest[1] = p1; + #endif + } + pos++; + hash++; + if (hash == size || *hash != delta || lenLimit[diff] != lenLimit[0] || d >= limit) + break; + } // for() end for long matches + } + #endif + + break; // break from TREE iterations + } + } + } + { + const UInt32 curMatch = (UInt32)pos - delta; // (UInt32)(pos + diff); + if (len[diff] < len[0]) + { + delta = pair[1]; + *ptr1 = curMatch; + ptr1 = pair + 1; + len1 = len; + if (delta >= curMatch) + return NULL; + } + else + { + delta = *pair; + *ptr0 = curMatch; + ptr0 = pair; + len0 = len; + if (delta >= curMatch) + return NULL; + } + delta = (UInt32)pos - delta; + + if (--cutValue == 0 || delta >= cbs) + { + *ptr0 = *ptr1 = kEmptyHashValue; + _distances[-1] = (UInt32)(d - _distances); + break; + } + } + } + } // for (tree iterations) +} + pos++; + _cyclicBufferPos++; + cur++; + } + while (d < limit); + *posRes = (UInt32)pos; + return d; +} + + + +/* +typedef UInt32 uint32plus; // size_t + +UInt32 * MY_FAST_CALL GetMatchesSpecN_3(uint32plus lenLimit, size_t pos, const Byte *cur, CLzRef *son, + UInt32 _cutValue, UInt32 *d, uint32plus _maxLen, const UInt32 *hash, const UInt32 *limit, const UInt32 *size, + size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, + UInt32 *posRes) +{ + do // while (hash != size) + { + UInt32 delta; + + #ifndef cbs + UInt32 cbs; + #endif + + if (hash == size) + break; + + delta = *hash++; + + if (delta == 0) + return NULL; + + #ifndef cbs + cbs = _cyclicBufferSize; + if ((UInt32)pos < cbs) + { + if (delta > (UInt32)pos) + return NULL; + cbs = (UInt32)pos; + } + #endif + + if (delta >= cbs) + { + CLzRef *ptr1 = son + ((size_t)_cyclicBufferPos << 1); + *d++ = 0; + ptr1[0] = kEmptyHashValue; + ptr1[1] = kEmptyHashValue; + } +else +{ + CLzRef *ptr0 = son + ((size_t)_cyclicBufferPos << 1) + 1; + CLzRef *ptr1 = son + ((size_t)_cyclicBufferPos << 1); + UInt32 *_distances = ++d; + uint32plus len0 = 0, len1 = 0; + UInt32 cutValue = _cutValue; + uint32plus maxLen = _maxLen; + // lenLimit++; // const Byte *lenLimit = cur + _lenLimit; + + for (LOG_ITER(g_NumIters_Tree++);;) + { + LOG_ITER(g_NumIters_Loop++); + { + // const ptrdiff_t diff = (ptrdiff_t)0 - (ptrdiff_t)delta; + CLzRef *pair = son + ((size_t)((ptrdiff_t)_cyclicBufferPos - delta + + (ptrdiff_t)(UInt32)(_cyclicBufferPos < delta ? cbs : 0) + ) << 1); + const Byte *pb = cur - delta; + uint32plus len = (len0 < len1 ? len0 : len1); + + #ifdef USE_SON_PREFETCH + const UInt32 pair0 = *pair; + #endif + + if (pb[len] == cur[len]) + { + if (++len != lenLimit && pb[len] == cur[len]) + while (++len != lenLimit) + if (pb[len] != cur[len]) + break; + if (maxLen < len) + { + maxLen = len; + *d++ = (UInt32)len; + *d++ = delta - 1; + if (len == lenLimit) + { + { + const UInt32 pair1 = pair[1]; + *ptr0 = pair1; + *ptr1 = + #ifdef USE_SON_PREFETCH + pair0; + #else + pair[0]; + #endif + } + + _distances[-1] = (UInt32)(d - _distances); + + #ifdef USE_LONG_MATCH_OPT + + if (hash == size || *hash != delta || pb[lenLimit] != cur[lenLimit] || d >= limit) + break; + + { + const ptrdiff_t diff = (ptrdiff_t)0 - (ptrdiff_t)delta; + for (;;) + { + *d++ = 2; + *d++ = (UInt32)lenLimit; + *d++ = delta - 1; + _cyclicBufferPos++; + { + CLzRef *dest = son + ((size_t)_cyclicBufferPos << 1); + const CLzRef *src = dest + ((diff + + (ptrdiff_t)(UInt32)(_cyclicBufferPos < delta ? cbs : 0)) << 1); + #if 0 + *(UInt64 *)(void *)dest = *((const UInt64 *)(const void *)src); + #else + const UInt32 p0 = src[0]; + const UInt32 p1 = src[1]; + dest[0] = p0; + dest[1] = p1; + #endif + } + hash++; + pos++; + cur++; + pb++; + if (hash == size || *hash != delta || pb[lenLimit] != cur[lenLimit] || d >= limit) + break; + } + } + #endif + + break; + } + } + } + { + const UInt32 curMatch = (UInt32)pos - delta; + if (pb[len] < cur[len]) + { + delta = pair[1]; + *ptr1 = curMatch; + ptr1 = pair + 1; + len1 = len; + } + else + { + delta = *pair; + *ptr0 = curMatch; + ptr0 = pair; + len0 = len; + } + + { + if (delta >= curMatch) + return NULL; + delta = (UInt32)pos - delta; + if (delta >= cbs + // delta >= _cyclicBufferSize || delta >= pos + || --cutValue == 0) + { + *ptr0 = *ptr1 = kEmptyHashValue; + _distances[-1] = (UInt32)(d - _distances); + break; + } + } + } + } + } // for (tree iterations) +} + pos++; + _cyclicBufferPos++; + cur++; + } + while (d < limit); + *posRes = (UInt32)pos; + return d; +} +*/ diff --git a/deps/LZMA-SDK/C/LzmaEnc.c b/deps/LZMA-SDK/C/LzmaEnc.c index 86dcb1963..ca9154aef 100644 --- a/deps/LZMA-SDK/C/LzmaEnc.c +++ b/deps/LZMA-SDK/C/LzmaEnc.c @@ -1,5 +1,5 @@ /* LzmaEnc.c -- LZMA Encoder -2021-04-01: Igor Pavlov : Public domain */ +2022-07-15: Igor Pavlov : Public domain */ #include "Precomp.h" @@ -12,6 +12,7 @@ #include #endif +#include "CpuArch.h" #include "LzmaEnc.h" #include "LzFind.h" @@ -36,8 +37,8 @@ void LzmaEnc_RestoreState(CLzmaEncHandle pp); static unsigned g_STAT_OFFSET = 0; #endif -#define kLzmaMaxHistorySize ((UInt32)3 << 29) -/* #define kLzmaMaxHistorySize ((UInt32)7 << 29) */ +/* for good normalization speed we still reserve 256 MB before 4 GB range */ +#define kLzmaMaxHistorySize ((UInt32)15 << 28) #define kNumTopBits 24 #define kTopValue ((UInt32)1 << kNumTopBits) @@ -78,13 +79,12 @@ void LzmaEncProps_Normalize(CLzmaEncProps *p) if (p->dictSize > p->reduceSize) { - unsigned i; - UInt32 reduceSize = (UInt32)p->reduceSize; - for (i = 11; i <= 30; i++) - { - if (reduceSize <= ((UInt32)2 << i)) { p->dictSize = ((UInt32)2 << i); break; } - if (reduceSize <= ((UInt32)3 << i)) { p->dictSize = ((UInt32)3 << i); break; } - } + UInt32 v = (UInt32)p->reduceSize; + const UInt32 kReduceMin = ((UInt32)1 << 12); + if (v < kReduceMin) + v = kReduceMin; + if (p->dictSize > v) + p->dictSize = v; } if (p->lc < 0) p->lc = 3; @@ -113,18 +113,85 @@ UInt32 LzmaEncProps_GetDictSize(const CLzmaEncProps *props2) return props.dictSize; } -#if defined(_MSC_VER) && (_MSC_VER >= 1400) -/* BSR code is fast for some new CPUs */ -/* #define LZMA_LOG_BSR */ + +/* +x86/x64: + +BSR: + IF (SRC == 0) ZF = 1, DEST is undefined; + AMD : DEST is unchanged; + IF (SRC != 0) ZF = 0; DEST is index of top non-zero bit + BSR is slow in some processors + +LZCNT: + IF (SRC == 0) CF = 1, DEST is size_in_bits_of_register(src) (32 or 64) + IF (SRC != 0) CF = 0, DEST = num_lead_zero_bits + IF (DEST == 0) ZF = 1; + +LZCNT works only in new processors starting from Haswell. +if LZCNT is not supported by processor, then it's executed as BSR. +LZCNT can be faster than BSR, if supported. +*/ + +// #define LZMA_LOG_BSR + +#if defined(MY_CPU_ARM_OR_ARM64) /* || defined(MY_CPU_X86_OR_AMD64) */ + + #if (defined(__clang__) && (__clang_major__ >= 6)) \ + || (defined(__GNUC__) && (__GNUC__ >= 6)) + #define LZMA_LOG_BSR + #elif defined(_MSC_VER) && (_MSC_VER >= 1300) + // #if defined(MY_CPU_ARM_OR_ARM64) + #define LZMA_LOG_BSR + // #endif + #endif #endif +// #include + #ifdef LZMA_LOG_BSR -#define kDicLogSizeMaxCompress 32 +#if defined(__clang__) \ + || defined(__GNUC__) + +/* + C code: : (30 - __builtin_clz(x)) + gcc9/gcc10 for x64 /x86 : 30 - (bsr(x) xor 31) + clang10 for x64 : 31 + (bsr(x) xor -32) +*/ + + #define MY_clz(x) ((unsigned)__builtin_clz(x)) + // __lzcnt32 + // __builtin_ia32_lzcnt_u32 + +#else // #if defined(_MSC_VER) + + #ifdef MY_CPU_ARM_OR_ARM64 + + #define MY_clz _CountLeadingZeros + + #else // if defined(MY_CPU_X86_OR_AMD64) + + // #define MY_clz __lzcnt // we can use lzcnt (unsupported by old CPU) + // _BitScanReverse code is not optimal for some MSVC compilers + #define BSR2_RET(pos, res) { unsigned long zz; _BitScanReverse(&zz, (pos)); zz--; \ + res = (zz + zz) + (pos >> zz); } + + #endif // MY_CPU_X86_OR_AMD64 + +#endif // _MSC_VER + + +#ifndef BSR2_RET -#define BSR2_RET(pos, res) { unsigned long zz; _BitScanReverse(&zz, (pos)); res = (zz + zz) + ((pos >> (zz - 1)) & 1); } + #define BSR2_RET(pos, res) { unsigned zz = 30 - MY_clz(pos); \ + res = (zz + zz) + (pos >> zz); } -static unsigned GetPosSlot1(UInt32 pos) +#endif + + +unsigned GetPosSlot1(UInt32 pos); +unsigned GetPosSlot1(UInt32 pos) { unsigned res; BSR2_RET(pos, res); @@ -133,10 +200,10 @@ static unsigned GetPosSlot1(UInt32 pos) #define GetPosSlot2(pos, res) { BSR2_RET(pos, res); } #define GetPosSlot(pos, res) { if (pos < 2) res = pos; else BSR2_RET(pos, res); } -#else -#define kNumLogBits (9 + sizeof(size_t) / 2) -/* #define kNumLogBits (11 + sizeof(size_t) / 8 * 3) */ +#else // ! LZMA_LOG_BSR + +#define kNumLogBits (11 + sizeof(size_t) / 8 * 3) #define kDicLogSizeMaxCompress ((kNumLogBits - 1) * 2 + 7) @@ -183,7 +250,7 @@ static void LzmaEnc_FastPosInit(Byte *g_FastPos) #define GetPosSlot2(pos, res) { BSR2_RET(pos, res); } #define GetPosSlot(pos, res) { if (pos < kNumFullDistances) res = p->g_FastPos[pos & (kNumFullDistances - 1)]; else BSR2_RET(pos, res); } -#endif +#endif // LZMA_LOG_BSR #define LZMA_NUM_REPS 4 @@ -319,7 +386,7 @@ typedef UInt32 CProbPrice; typedef struct { void *matchFinderObj; - IMatchFinder matchFinder; + IMatchFinder2 matchFinder; unsigned optCur; unsigned optEnd; @@ -364,10 +431,14 @@ typedef struct // begin of CMatchFinderMt is used in LZ thread CMatchFinderMt matchFinderMt; // end of CMatchFinderMt is used in BT and HASH threads + // #else + // CMatchFinder matchFinderBase; #endif - CMatchFinder matchFinderBase; + + // we suppose that we have 8-bytes alignment after CMatchFinder + #ifndef _7ZIP_ST Byte pad[128]; #endif @@ -375,8 +446,10 @@ typedef struct // LZ thread CProbPrice ProbPrices[kBitModelTotal >> kNumMoveReducingBits]; - UInt32 matches[LZMA_MATCH_LEN_MAX * 2 + 2 + 1]; + // we want {len , dist} pairs to be 8-bytes aligned in matches array + UInt32 matches[LZMA_MATCH_LEN_MAX * 2 + 2]; + // we want 8-bytes alignment here UInt32 alignPrices[kAlignTableSize]; UInt32 posSlotPrices[kNumLenToPosStates][kDistTableSizeMax]; UInt32 distancesPrices[kNumLenToPosStates][kNumFullDistances]; @@ -405,12 +478,19 @@ typedef struct CSaveState saveState; + // BoolInt mf_Failure; #ifndef _7ZIP_ST Byte pad2[128]; #endif } CLzmaEnc; +#define MFB (p->matchFinderBase) +/* +#ifndef _7ZIP_ST +#define MFB (p->matchFinderMt.MatchFinder) +#endif +*/ #define COPY_ARR(dest, src, arr) memcpy(dest->arr, src->arr, sizeof(src->arr)); @@ -475,11 +555,21 @@ SRes LzmaEnc_SetProps(CLzmaEncHandle pp, const CLzmaEncProps *props2) if (props.lc > LZMA_LC_MAX || props.lp > LZMA_LP_MAX - || props.pb > LZMA_PB_MAX - || props.dictSize > ((UInt64)1 << kDicLogSizeMaxCompress) - || props.dictSize > kLzmaMaxHistorySize) + || props.pb > LZMA_PB_MAX) return SZ_ERROR_PARAM; + + if (props.dictSize > kLzmaMaxHistorySize) + props.dictSize = kLzmaMaxHistorySize; + + #ifndef LZMA_LOG_BSR + { + const UInt64 dict64 = props.dictSize; + if (dict64 > ((UInt64)1 << kDicLogSizeMaxCompress)) + return SZ_ERROR_PARAM; + } + #endif + p->dictSize = props.dictSize; { unsigned fb = (unsigned)props.fb; @@ -494,7 +584,7 @@ SRes LzmaEnc_SetProps(CLzmaEncHandle pp, const CLzmaEncProps *props2) p->pb = (unsigned)props.pb; p->fastMode = (props.algo == 0); // p->_maxMode = True; - p->matchFinderBase.btMode = (Byte)(props.btMode ? 1 : 0); + MFB.btMode = (Byte)(props.btMode ? 1 : 0); { unsigned numHashBytes = 4; if (props.btMode) @@ -504,10 +594,10 @@ SRes LzmaEnc_SetProps(CLzmaEncHandle pp, const CLzmaEncProps *props2) } if (props.numHashBytes >= 5) numHashBytes = 5; - p->matchFinderBase.numHashBytes = numHashBytes; + MFB.numHashBytes = numHashBytes; } - p->matchFinderBase.cutValue = props.mc; + MFB.cutValue = props.mc; p->writeEndMark = (BoolInt)props.writeEndMark; @@ -531,7 +621,7 @@ SRes LzmaEnc_SetProps(CLzmaEncHandle pp, const CLzmaEncProps *props2) void LzmaEnc_SetDataSize(CLzmaEncHandle pp, UInt64 expectedDataSiize) { CLzmaEnc *p = (CLzmaEnc *)pp; - p->matchFinderBase.expectedDataSize = expectedDataSiize; + MFB.expectedDataSize = expectedDataSiize; } @@ -578,12 +668,11 @@ static int RangeEnc_Alloc(CRangeEnc *p, ISzAllocPtr alloc) static void RangeEnc_Free(CRangeEnc *p, ISzAllocPtr alloc) { ISzAlloc_Free(alloc, p->bufBase); - p->bufBase = 0; + p->bufBase = NULL; } static void RangeEnc_Init(CRangeEnc *p) { - /* Stream.Init(); */ p->range = 0xFFFFFFFF; p->cache = 0; p->low = 0; @@ -597,12 +686,12 @@ static void RangeEnc_Init(CRangeEnc *p) MY_NO_INLINE static void RangeEnc_FlushStream(CRangeEnc *p) { - size_t num; - if (p->res != SZ_OK) - return; - num = (size_t)(p->buf - p->bufBase); - if (num != ISeqOutStream_Write(p->outStream, p->bufBase, num)) - p->res = SZ_ERROR_WRITE; + const size_t num = (size_t)(p->buf - p->bufBase); + if (p->res == SZ_OK) + { + if (num != ISeqOutStream_Write(p->outStream, p->bufBase, num)) + p->res = SZ_ERROR_WRITE; + } p->processed += num; p->buf = p->bufBase; } @@ -1007,7 +1096,11 @@ static unsigned ReadMatchDistances(CLzmaEnc *p, unsigned *numPairsRes) p->additionalOffset++; p->numAvail = p->matchFinder.GetNumAvailableBytes(p->matchFinderObj); - numPairs = p->matchFinder.GetMatches(p->matchFinderObj, p->matches); + { + const UInt32 *d = p->matchFinder.GetMatches(p->matchFinderObj, p->matches); + // if (!d) { p->mf_Failure = True; *numPairsRes = 0; return 0; } + numPairs = (unsigned)(d - p->matches); + } *numPairsRes = numPairs; #ifdef SHOW_STAT @@ -1023,7 +1116,7 @@ static unsigned ReadMatchDistances(CLzmaEnc *p, unsigned *numPairsRes) if (numPairs == 0) return 0; { - unsigned len = p->matches[(size_t)numPairs - 2]; + const unsigned len = p->matches[(size_t)numPairs - 2]; if (len != p->numFastBytes) return len; { @@ -1033,7 +1126,7 @@ static unsigned ReadMatchDistances(CLzmaEnc *p, unsigned *numPairsRes) { const Byte *p1 = p->matchFinder.GetPointerToCurrentPos(p->matchFinderObj) - 1; const Byte *p2 = p1 + len; - ptrdiff_t dif = (ptrdiff_t)-1 - (ptrdiff_t)p->matches[(size_t)numPairs - 1]; + const ptrdiff_t dif = (ptrdiff_t)-1 - (ptrdiff_t)p->matches[(size_t)numPairs - 1]; const Byte *lim = p1 + numAvail; for (; p2 != lim && *p2 == p2[dif]; p2++) {} @@ -1189,6 +1282,8 @@ static unsigned GetOptimum(CLzmaEnc *p, UInt32 position) repLens[i] = len; if (len > repLens[repMaxIndex]) repMaxIndex = i; + if (len == LZMA_MATCH_LEN_MAX) // 21.03 : optimization + break; } if (repLens[repMaxIndex] >= p->numFastBytes) @@ -1201,10 +1296,12 @@ static unsigned GetOptimum(CLzmaEnc *p, UInt32 position) } matches = p->matches; + #define MATCHES matches + // #define MATCHES p->matches if (mainLen >= p->numFastBytes) { - p->backRes = matches[(size_t)numPairs - 1] + LZMA_NUM_REPS; + p->backRes = MATCHES[(size_t)numPairs - 1] + LZMA_NUM_REPS; MOVE_POS(p, mainLen - 1) return mainLen; } @@ -1298,13 +1395,13 @@ static unsigned GetOptimum(CLzmaEnc *p, UInt32 position) if (len < 2) len = 2; else - while (len > matches[offs]) + while (len > MATCHES[offs]) offs += 2; for (; ; len++) { COptimal *opt; - UInt32 dist = matches[(size_t)offs + 1]; + UInt32 dist = MATCHES[(size_t)offs + 1]; UInt32 price = normalMatchPrice + GET_PRICE_LEN(&p->lenEnc, posState, len); unsigned lenToPosState = GetLenToPosState(len); @@ -1328,7 +1425,7 @@ static unsigned GetOptimum(CLzmaEnc *p, UInt32 position) opt->extra = 0; } - if (len == matches[offs]) + if (len == MATCHES[offs]) { offs += 2; if (offs == numPairs) @@ -1749,8 +1846,8 @@ static unsigned GetOptimum(CLzmaEnc *p, UInt32 position) if (newLen > numAvail) { newLen = numAvail; - for (numPairs = 0; newLen > matches[numPairs]; numPairs += 2); - matches[numPairs] = (UInt32)newLen; + for (numPairs = 0; newLen > MATCHES[numPairs]; numPairs += 2); + MATCHES[numPairs] = (UInt32)newLen; numPairs += 2; } @@ -1769,9 +1866,9 @@ static unsigned GetOptimum(CLzmaEnc *p, UInt32 position) } offs = 0; - while (startLen > matches[offs]) + while (startLen > MATCHES[offs]) offs += 2; - dist = matches[(size_t)offs + 1]; + dist = MATCHES[(size_t)offs + 1]; // if (dist >= kNumFullDistances) GetPosSlot2(dist, posSlot); @@ -1798,7 +1895,7 @@ static unsigned GetOptimum(CLzmaEnc *p, UInt32 position) } } - if (len == matches[offs]) + if (len == MATCHES[offs]) { // if (p->_maxMode) { // MATCH : LIT : REP_0 @@ -1863,7 +1960,7 @@ static unsigned GetOptimum(CLzmaEnc *p, UInt32 position) offs += 2; if (offs == numPairs) break; - dist = matches[(size_t)offs + 1]; + dist = MATCHES[(size_t)offs + 1]; // if (dist >= kNumFullDistances) GetPosSlot2(dist, posSlot); } @@ -2081,8 +2178,23 @@ static SRes CheckErrors(CLzmaEnc *p) return p->result; if (p->rc.res != SZ_OK) p->result = SZ_ERROR_WRITE; - if (p->matchFinderBase.result != SZ_OK) + + #ifndef _7ZIP_ST + if ( + // p->mf_Failure || + (p->mtMode && + ( // p->matchFinderMt.failure_LZ_LZ || + p->matchFinderMt.failure_LZ_BT)) + ) + { + p->result = MY_HRES_ERROR__INTERNAL_ERROR; + // printf("\nCheckErrors p->matchFinderMt.failureLZ\n"); + } + #endif + + if (MFB.result != SZ_OK) p->result = SZ_ERROR_READ; + if (p->result != SZ_OK) p->finished = True; return p->result; @@ -2223,11 +2335,11 @@ MY_NO_INLINE static void FillDistancesPrices(CLzmaEnc *p) static void LzmaEnc_Construct(CLzmaEnc *p) { RangeEnc_Construct(&p->rc); - MatchFinder_Construct(&p->matchFinderBase); + MatchFinder_Construct(&MFB); #ifndef _7ZIP_ST + p->matchFinderMt.MatchFinder = &MFB; MatchFinderMt_Construct(&p->matchFinderMt); - p->matchFinderMt.MatchFinder = &p->matchFinderBase; #endif { @@ -2243,7 +2355,6 @@ static void LzmaEnc_Construct(CLzmaEnc *p) LzmaEnc_InitPriceTables(p->ProbPrices); p->litProbs = NULL; p->saveState.litProbs = NULL; - } CLzmaEncHandle LzmaEnc_Create(ISzAllocPtr alloc) @@ -2269,7 +2380,7 @@ static void LzmaEnc_Destruct(CLzmaEnc *p, ISzAllocPtr alloc, ISzAllocPtr allocBi MatchFinderMt_Destruct(&p->matchFinderMt, allocBig); #endif - MatchFinder_Free(&p->matchFinderBase, allocBig); + MatchFinder_Free(&MFB, allocBig); LzmaEnc_FreeLits(p, alloc); RangeEnc_Free(&p->rc, alloc); } @@ -2287,6 +2398,12 @@ static SRes LzmaEnc_CodeOneBlock(CLzmaEnc *p, UInt32 maxPackSize, UInt32 maxUnpa UInt32 nowPos32, startPos32; if (p->needInit) { + #ifndef _7ZIP_ST + if (p->mtMode) + { + RINOK(MatchFinderMt_InitMt(&p->matchFinderMt)); + } + #endif p->matchFinder.Init(p->matchFinderObj); p->needInit = 0; } @@ -2582,11 +2699,13 @@ static SRes LzmaEnc_CodeOneBlock(CLzmaEnc *p, UInt32 maxPackSize, UInt32 maxUnpa static SRes LzmaEnc_Alloc(CLzmaEnc *p, UInt32 keepWindowSize, ISzAllocPtr alloc, ISzAllocPtr allocBig) { UInt32 beforeSize = kNumOpts; + UInt32 dictSize; + if (!RangeEnc_Alloc(&p->rc, alloc)) return SZ_ERROR_MEM; #ifndef _7ZIP_ST - p->mtMode = (p->multiThread && !p->fastMode && (p->matchFinderBase.btMode != 0)); + p->mtMode = (p->multiThread && !p->fastMode && (MFB.btMode != 0)); #endif { @@ -2605,30 +2724,50 @@ static SRes LzmaEnc_Alloc(CLzmaEnc *p, UInt32 keepWindowSize, ISzAllocPtr alloc, } } - p->matchFinderBase.bigHash = (Byte)(p->dictSize > kBigHashDicLimit ? 1 : 0); + MFB.bigHash = (Byte)(p->dictSize > kBigHashDicLimit ? 1 : 0); + + + dictSize = p->dictSize; + if (dictSize == ((UInt32)2 << 30) || + dictSize == ((UInt32)3 << 30)) + { + /* 21.03 : here we reduce the dictionary for 2 reasons: + 1) we don't want 32-bit back_distance matches in decoder for 2 GB dictionary. + 2) we want to elimate useless last MatchFinder_Normalize3() for corner cases, + where data size is aligned for 1 GB: 5/6/8 GB. + That reducing must be >= 1 for such corner cases. */ + dictSize -= 1; + } + + if (beforeSize + dictSize < keepWindowSize) + beforeSize = keepWindowSize - dictSize; - if (beforeSize + p->dictSize < keepWindowSize) - beforeSize = keepWindowSize - p->dictSize; + /* in worst case we can look ahead for + max(LZMA_MATCH_LEN_MAX, numFastBytes + 1 + numFastBytes) bytes. + we send larger value for (keepAfter) to MantchFinder_Create(): + (numFastBytes + LZMA_MATCH_LEN_MAX + 1) + */ #ifndef _7ZIP_ST if (p->mtMode) { - RINOK(MatchFinderMt_Create(&p->matchFinderMt, p->dictSize, beforeSize, p->numFastBytes, - LZMA_MATCH_LEN_MAX - + 1 /* 18.04 */ + RINOK(MatchFinderMt_Create(&p->matchFinderMt, dictSize, beforeSize, + p->numFastBytes, LZMA_MATCH_LEN_MAX + 1 /* 18.04 */ , allocBig)); p->matchFinderObj = &p->matchFinderMt; - p->matchFinderBase.bigHash = (Byte)( - (p->dictSize > kBigHashDicLimit && p->matchFinderBase.hashMask >= 0xFFFFFF) ? 1 : 0); + MFB.bigHash = (Byte)( + (p->dictSize > kBigHashDicLimit && MFB.hashMask >= 0xFFFFFF) ? 1 : 0); MatchFinderMt_CreateVTable(&p->matchFinderMt, &p->matchFinder); } else #endif { - if (!MatchFinder_Create(&p->matchFinderBase, p->dictSize, beforeSize, p->numFastBytes, LZMA_MATCH_LEN_MAX, allocBig)) + if (!MatchFinder_Create(&MFB, dictSize, beforeSize, + p->numFastBytes, LZMA_MATCH_LEN_MAX + 1 /* 21.03 */ + , allocBig)) return SZ_ERROR_MEM; - p->matchFinderObj = &p->matchFinderBase; - MatchFinder_CreateVTable(&p->matchFinderBase, &p->matchFinder); + p->matchFinderObj = &MFB; + MatchFinder_CreateVTable(&MFB, &p->matchFinder); } return SZ_OK; @@ -2700,6 +2839,8 @@ static void LzmaEnc_Init(CLzmaEnc *p) p->pbMask = ((unsigned)1 << p->pb) - 1; p->lpMask = ((UInt32)0x100 << p->lp) - ((unsigned)0x100 >> p->lc); + + // p->mf_Failure = False; } @@ -2742,7 +2883,7 @@ static SRes LzmaEnc_Prepare(CLzmaEncHandle pp, ISeqOutStream *outStream, ISeqInS ISzAllocPtr alloc, ISzAllocPtr allocBig) { CLzmaEnc *p = (CLzmaEnc *)pp; - p->matchFinderBase.stream = inStream; + MFB.stream = inStream; p->needInit = 1; p->rc.outStream = outStream; return LzmaEnc_AllocAndInit(p, 0, alloc, allocBig); @@ -2753,16 +2894,16 @@ SRes LzmaEnc_PrepareForLzma2(CLzmaEncHandle pp, ISzAllocPtr alloc, ISzAllocPtr allocBig) { CLzmaEnc *p = (CLzmaEnc *)pp; - p->matchFinderBase.stream = inStream; + MFB.stream = inStream; p->needInit = 1; return LzmaEnc_AllocAndInit(p, keepWindowSize, alloc, allocBig); } static void LzmaEnc_SetInputBuf(CLzmaEnc *p, const Byte *src, SizeT srcLen) { - p->matchFinderBase.directInput = 1; - p->matchFinderBase.bufferBase = (Byte *)src; - p->matchFinderBase.directInputRem = srcLen; + MFB.directInput = 1; + MFB.bufferBase = (Byte *)src; + MFB.directInputRem = srcLen; } SRes LzmaEnc_MemPrepare(CLzmaEncHandle pp, const Byte *src, SizeT srcLen, @@ -2804,9 +2945,12 @@ static size_t SeqOutStreamBuf_Write(const ISeqOutStream *pp, const void *data, s size = p->rem; p->overflow = True; } - memcpy(p->data, data, size); - p->rem -= size; - p->data += size; + if (size != 0) + { + memcpy(p->data, data, size); + p->rem -= size; + p->data += size; + } return size; } @@ -2826,6 +2970,7 @@ const Byte *LzmaEnc_GetCurBuf(CLzmaEncHandle pp) } +// (desiredPackSize == 0) is not allowed SRes LzmaEnc_CodeOneMemBlock(CLzmaEncHandle pp, BoolInt reInit, Byte *dest, size_t *destLen, UInt32 desiredPackSize, UInt32 *unpackSize) { @@ -2846,14 +2991,10 @@ SRes LzmaEnc_CodeOneMemBlock(CLzmaEncHandle pp, BoolInt reInit, if (reInit) LzmaEnc_Init(p); LzmaEnc_InitPrices(p); - - nowPos64 = p->nowPos64; RangeEnc_Init(&p->rc); p->rc.outStream = &outStream.vt; - - if (desiredPackSize == 0) - return SZ_ERROR_OUTPUT_EOF; - + nowPos64 = p->nowPos64; + res = LzmaEnc_CodeOneBlock(p, desiredPackSize, *unpackSize); *unpackSize = (UInt32)(p->nowPos64 - nowPos64); @@ -2895,7 +3036,7 @@ static SRes LzmaEnc_Encode2(CLzmaEnc *p, ICompressProgress *progress) LzmaEnc_Finish(p); /* - if (res == SZ_OK && !Inline_MatchFinder_IsFinishedOK(&p->matchFinderBase)) + if (res == SZ_OK && !Inline_MatchFinder_IsFinishedOK(&MFB)) res = SZ_ERROR_FAIL; } */ @@ -2914,29 +3055,37 @@ SRes LzmaEnc_Encode(CLzmaEncHandle pp, ISeqOutStream *outStream, ISeqInStream *i SRes LzmaEnc_WriteProperties(CLzmaEncHandle pp, Byte *props, SizeT *size) { - CLzmaEnc *p = (CLzmaEnc *)pp; - unsigned i; - UInt32 dictSize = p->dictSize; if (*size < LZMA_PROPS_SIZE) return SZ_ERROR_PARAM; *size = LZMA_PROPS_SIZE; - props[0] = (Byte)((p->pb * 5 + p->lp) * 9 + p->lc); - - if (dictSize >= ((UInt32)1 << 22)) - { - const UInt32 kDictMask = ((UInt32)1 << 20) - 1; - if (dictSize < (UInt32)0xFFFFFFFF - kDictMask) - dictSize = (dictSize + kDictMask) & ~kDictMask; - } - else for (i = 11; i <= 30; i++) { - if (dictSize <= ((UInt32)2 << i)) { dictSize = ((UInt32)2 << i); break; } - if (dictSize <= ((UInt32)3 << i)) { dictSize = ((UInt32)3 << i); break; } - } + const CLzmaEnc *p = (const CLzmaEnc *)pp; + const UInt32 dictSize = p->dictSize; + UInt32 v; + props[0] = (Byte)((p->pb * 5 + p->lp) * 9 + p->lc); + + // we write aligned dictionary value to properties for lzma decoder + if (dictSize >= ((UInt32)1 << 21)) + { + const UInt32 kDictMask = ((UInt32)1 << 20) - 1; + v = (dictSize + kDictMask) & ~kDictMask; + if (v < dictSize) + v = dictSize; + } + else + { + unsigned i = 11 * 2; + do + { + v = (UInt32)(2 + (i & 1)) << (i >> 1); + i++; + } + while (v < dictSize); + } - for (i = 0; i < 4; i++) - props[1 + i] = (Byte)(dictSize >> (8 * i)); - return SZ_OK; + SetUi32(props + 1, v); + return SZ_OK; + } } diff --git a/deps/LZMA-SDK/C/MtCoder.c b/deps/LZMA-SDK/C/MtCoder.c index 85444f484..e39d9cb19 100644 --- a/deps/LZMA-SDK/C/MtCoder.c +++ b/deps/LZMA-SDK/C/MtCoder.c @@ -1,5 +1,5 @@ /* MtCoder.c -- Multi-thread Coder -2021-02-09 : Igor Pavlov : Public domain */ +2021-12-21 : Igor Pavlov : Public domain */ #include "Precomp.h" @@ -44,7 +44,7 @@ static WRes ArEvent_OptCreate_And_Reset(CEvent *p) } -static THREAD_FUNC_RET_TYPE THREAD_FUNC_CALL_TYPE ThreadFunc(void *pp); +static THREAD_FUNC_DECL ThreadFunc(void *pp); static SRes MtCoderThread_CreateAndStart(CMtCoderThread *t) @@ -335,7 +335,7 @@ static SRes ThreadFunc2(CMtCoderThread *t) } -static THREAD_FUNC_RET_TYPE THREAD_FUNC_CALL_TYPE ThreadFunc(void *pp) +static THREAD_FUNC_DECL ThreadFunc(void *pp) { CMtCoderThread *t = (CMtCoderThread *)pp; for (;;) @@ -495,12 +495,7 @@ SRes MtCoder_Code(CMtCoder *p) { RINOK_THREAD(ArEvent_OptCreate_And_Reset(&p->readEvent)); - - if (Semaphore_IsCreated(&p->blocksSemaphore)) - { - RINOK_THREAD(Semaphore_Close(&p->blocksSemaphore)); - } - RINOK_THREAD(Semaphore_Create(&p->blocksSemaphore, numBlocksMax, numBlocksMax)); + RINOK_THREAD(Semaphore_OptCreateInit(&p->blocksSemaphore, numBlocksMax, numBlocksMax)); } for (i = 0; i < MTCODER__BLOCKS_MAX - 1; i++) diff --git a/deps/LZMA-SDK/C/MtDec.c b/deps/LZMA-SDK/C/MtDec.c index 24441b3a7..854087b92 100644 --- a/deps/LZMA-SDK/C/MtDec.c +++ b/deps/LZMA-SDK/C/MtDec.c @@ -1,5 +1,5 @@ /* MtDec.c -- Multi-thread Decoder -2021-02-27 : Igor Pavlov : Public domain */ +2021-12-21 : Igor Pavlov : Public domain */ #include "Precomp.h" @@ -102,7 +102,7 @@ typedef struct __CMtDecBufLink CMtDecBufLink; -static THREAD_FUNC_RET_TYPE THREAD_FUNC_CALL_TYPE ThreadFunc(void *pp); +static THREAD_FUNC_DECL ThreadFunc(void *pp); static WRes MtDecThread_CreateEvents(CMtDecThread *t) @@ -836,7 +836,7 @@ static WRes ThreadFunc2(CMtDecThread *t) #endif -static THREAD_FUNC_RET_TYPE THREAD_FUNC_CALL_TYPE ThreadFunc1(void *pp) +static THREAD_FUNC_DECL ThreadFunc1(void *pp) { WRes res; @@ -862,7 +862,7 @@ static THREAD_FUNC_RET_TYPE THREAD_FUNC_CALL_TYPE ThreadFunc1(void *pp) return (THREAD_FUNC_RET_TYPE)(UINT_PTR)res; } -static MY_NO_INLINE THREAD_FUNC_RET_TYPE THREAD_FUNC_CALL_TYPE ThreadFunc(void *pp) +static MY_NO_INLINE THREAD_FUNC_DECL ThreadFunc(void *pp) { #ifdef USE_ALLOCA CMtDecThread *t = (CMtDecThread *)pp; diff --git a/deps/LZMA-SDK/C/Threads.c b/deps/LZMA-SDK/C/Threads.c index 402abab01..6eb45b08a 100644 --- a/deps/LZMA-SDK/C/Threads.c +++ b/deps/LZMA-SDK/C/Threads.c @@ -1,11 +1,11 @@ /* Threads.c -- multithreading library -2021-04-25 : Igor Pavlov : Public domain */ +2021-12-21 : Igor Pavlov : Public domain */ #include "Precomp.h" #ifdef _WIN32 -#ifndef UNDER_CE +#ifndef USE_THREADS_CreateThread #include #endif @@ -63,10 +63,10 @@ WRes Thread_Create(CThread *p, THREAD_FUNC_TYPE func, LPVOID param) { /* Windows Me/98/95: threadId parameter may not be NULL in _beginthreadex/CreateThread functions */ - #ifdef UNDER_CE + #ifdef USE_THREADS_CreateThread DWORD threadId; - *p = CreateThread(0, 0, func, param, 0, &threadId); + *p = CreateThread(NULL, 0, func, param, 0, &threadId); #else @@ -82,7 +82,7 @@ WRes Thread_Create(CThread *p, THREAD_FUNC_TYPE func, LPVOID param) WRes Thread_Create_With_Affinity(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, CAffinityMask affinity) { - #ifdef UNDER_CE + #ifdef USE_THREADS_CreateThread UNUSED_VAR(affinity) return Thread_Create(p, func, param); @@ -150,6 +150,17 @@ WRes Semaphore_Create(CSemaphore *p, UInt32 initCount, UInt32 maxCount) return HandleToWRes(*p); } +WRes Semaphore_OptCreateInit(CSemaphore *p, UInt32 initCount, UInt32 maxCount) +{ + // if (Semaphore_IsCreated(p)) + { + WRes wres = Semaphore_Close(p); + if (wres != 0) + return wres; + } + return Semaphore_Create(p, initCount, maxCount); +} + static WRes Semaphore_Release(CSemaphore *p, LONG releaseCount, LONG *previousCount) { return BOOLToWRes(ReleaseSemaphore(*p, releaseCount, previousCount)); } WRes Semaphore_ReleaseN(CSemaphore *p, UInt32 num) @@ -158,7 +169,9 @@ WRes Semaphore_Release1(CSemaphore *p) { return Semaphore_ReleaseN(p, 1); } WRes CriticalSection_Init(CCriticalSection *p) { - /* InitializeCriticalSection can raise only STATUS_NO_MEMORY exception */ + /* InitializeCriticalSection() can raise exception: + Windows XP, 2003 : can raise a STATUS_NO_MEMORY exception + Windows Vista+ : no exceptions */ #ifdef _MSC_VER __try #endif @@ -167,7 +180,7 @@ WRes CriticalSection_Init(CCriticalSection *p) /* InitializeCriticalSectionAndSpinCount(p, 0); */ } #ifdef _MSC_VER - __except (EXCEPTION_EXECUTE_HANDLER) { return 1; } + __except (EXCEPTION_EXECUTE_HANDLER) { return ERROR_NOT_ENOUGH_MEMORY; } #endif return 0; } @@ -406,6 +419,27 @@ WRes Semaphore_Create(CSemaphore *p, UInt32 initCount, UInt32 maxCount) return 0; } + +WRes Semaphore_OptCreateInit(CSemaphore *p, UInt32 initCount, UInt32 maxCount) +{ + if (Semaphore_IsCreated(p)) + { + /* + WRes wres = Semaphore_Close(p); + if (wres != 0) + return wres; + */ + if (initCount > maxCount || maxCount < 1) + return EINVAL; + // return EINVAL; // for debug + p->_count = initCount; + p->_maxCount = maxCount; + return 0; + } + return Semaphore_Create(p, initCount, maxCount); +} + + WRes Semaphore_ReleaseN(CSemaphore *p, UInt32 releaseCount) { UInt32 newCount; diff --git a/deps/LZMA-SDK/C/Threads.h b/deps/LZMA-SDK/C/Threads.h index c555c8b97..e9493afff 100644 --- a/deps/LZMA-SDK/C/Threads.h +++ b/deps/LZMA-SDK/C/Threads.h @@ -1,21 +1,25 @@ /* Threads.h -- multithreading library -2021-04-25 : Igor Pavlov : Public domain */ +2021-12-21 : Igor Pavlov : Public domain */ #ifndef __7Z_THREADS_H #define __7Z_THREADS_H #ifdef _WIN32 -#include +#include #else -#if !defined(__APPLE__) && !defined(_AIX) +#if defined(__linux__) +#if !defined(__APPLE__) && !defined(_AIX) && !defined(__ANDROID__) #ifndef _7ZIP_AFFINITY_DISABLE #define _7ZIP_AFFINITY_SUPPORTED +// #pragma message(" ==== _7ZIP_AFFINITY_SUPPORTED") // #define _GNU_SOURCE #endif #endif +#endif #include + #endif #include "7zTypes.h" @@ -34,8 +38,14 @@ typedef HANDLE CThread; #define Thread_Close(p) HandlePtr_Close(p) // #define Thread_Wait(p) Handle_WaitObject(*(p)) +#ifdef UNDER_CE + // if (USE_THREADS_CreateThread is defined), we use _beginthreadex() + // if (USE_THREADS_CreateThread is not definned), we use CreateThread() + #define USE_THREADS_CreateThread +#endif + typedef - #ifdef UNDER_CE + #ifdef USE_THREADS_CreateThread DWORD #else unsigned @@ -86,7 +96,30 @@ typedef UInt64 CCpuSet; #define THREAD_FUNC_CALL_TYPE MY_STD_CALL -#define THREAD_FUNC_DECL THREAD_FUNC_RET_TYPE THREAD_FUNC_CALL_TYPE + +#if defined(_WIN32) && defined(__GNUC__) +/* GCC compiler for x86 32-bit uses the rule: + the stack is 16-byte aligned before CALL instruction for function calling. + But only root function main() contains instructions that + set 16-byte alignment for stack pointer. And another functions + just keep alignment, if it was set in some parent function. + + The problem: + if we create new thread in MinGW (GCC) 32-bit x86 via _beginthreadex() or CreateThread(), + the root function of thread doesn't set 16-byte alignment. + And stack frames in all child functions also will be unaligned in that case. + + Here we set (force_align_arg_pointer) attribute for root function of new thread. + Do we need (force_align_arg_pointer) also for another systems? */ + + #define THREAD_FUNC_ATTRIB_ALIGN_ARG __attribute__((force_align_arg_pointer)) + // #define THREAD_FUNC_ATTRIB_ALIGN_ARG // for debug : bad alignment in SSE functions +#else + #define THREAD_FUNC_ATTRIB_ALIGN_ARG +#endif + +#define THREAD_FUNC_DECL THREAD_FUNC_ATTRIB_ALIGN_ARG THREAD_FUNC_RET_TYPE THREAD_FUNC_CALL_TYPE + typedef THREAD_FUNC_RET_TYPE (THREAD_FUNC_CALL_TYPE * THREAD_FUNC_TYPE)(void *); WRes Thread_Create(CThread *p, THREAD_FUNC_TYPE func, LPVOID param); WRes Thread_Create_With_Affinity(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, CAffinityMask affinity); @@ -122,6 +155,7 @@ typedef HANDLE CSemaphore; #define Semaphore_Close(p) HandlePtr_Close(p) #define Semaphore_Wait(p) Handle_WaitObject(*(p)) WRes Semaphore_Create(CSemaphore *p, UInt32 initCount, UInt32 maxCount); +WRes Semaphore_OptCreateInit(CSemaphore *p, UInt32 initCount, UInt32 maxCount); WRes Semaphore_ReleaseN(CSemaphore *p, UInt32 num); WRes Semaphore_Release1(CSemaphore *p); @@ -172,6 +206,7 @@ typedef struct _CSemaphore #define Semaphore_IsCreated(p) ((p)->_created) WRes Semaphore_Create(CSemaphore *p, UInt32 initCount, UInt32 maxCount); +WRes Semaphore_OptCreateInit(CSemaphore *p, UInt32 initCount, UInt32 maxCount); WRes Semaphore_ReleaseN(CSemaphore *p, UInt32 num); #define Semaphore_Release1(p) Semaphore_ReleaseN(p, 1) WRes Semaphore_Wait(CSemaphore *p); diff --git a/deps/LZMA-SDK/C/Util/Lzma/LzmaUtil.c b/deps/LZMA-SDK/C/Util/Lzma/LzmaUtil.c index 6b4293e33..4f05f1e73 100644 --- a/deps/LZMA-SDK/C/Util/Lzma/LzmaUtil.c +++ b/deps/LZMA-SDK/C/Util/Lzma/LzmaUtil.c @@ -1,5 +1,5 @@ /* LzmaUtil.c -- Test application for LZMA compression -2021-02-15 : Igor Pavlov : Public domain */ +2021-11-01 : Igor Pavlov : Public domain */ #include "../../Precomp.h" @@ -12,6 +12,7 @@ #include "../../Alloc.h" #include "../../7zFile.h" #include "../../7zVersion.h" +#include "../../LzFind.h" #include "../../LzmaDec.h" #include "../../LzmaEnc.h" @@ -195,6 +196,8 @@ static int main2(int numArgs, const char *args[], char *rs) int encodeMode; BoolInt useOutFile = False; + LzFindPrepare(); + FileSeqInStream_CreateVTable(&inStream); File_Construct(&inStream.file); inStream.wres = 0; @@ -276,7 +279,7 @@ static int main2(int numArgs, const char *args[], char *rs) int MY_CDECL main(int numArgs, const char *args[]) { - char rs[800] = { 0 }; + char rs[1000] = { 0 }; int res = main2(numArgs, args, rs); fputs(rs, stdout); return res; diff --git a/deps/LZMA-SDK/C/Util/Lzma/LzmaUtil.dsp b/deps/LZMA-SDK/C/Util/Lzma/LzmaUtil.dsp index eedde07d8..44835d598 100644 --- a/deps/LZMA-SDK/C/Util/Lzma/LzmaUtil.dsp +++ b/deps/LZMA-SDK/C/Util/Lzma/LzmaUtil.dsp @@ -134,6 +134,10 @@ SOURCE=..\..\LzFindMt.h # End Source File # Begin Source File +SOURCE=..\..\LzFindOpt.c +# End Source File +# Begin Source File + SOURCE=..\..\LzHash.h # End Source File # Begin Source File diff --git a/deps/LZMA-SDK/C/Util/Lzma/makefile b/deps/LZMA-SDK/C/Util/Lzma/makefile index 3b825f21a..9d971f666 100644 --- a/deps/LZMA-SDK/C/Util/Lzma/makefile +++ b/deps/LZMA-SDK/C/Util/Lzma/makefile @@ -8,8 +8,10 @@ LIB_OBJS = \ C_OBJS = \ $O\Alloc.obj \ + $O\CpuArch.obj \ $O\LzFind.obj \ $O\LzFindMt.obj \ + $O\LzFindOpt.obj \ $O\LzmaDec.obj \ $O\LzmaEnc.obj \ $O\7zFile.obj \ diff --git a/deps/LZMA-SDK/C/Util/Lzma/makefile.gcc b/deps/LZMA-SDK/C/Util/Lzma/makefile.gcc index 89b3e11f7..6ce77a0f2 100644 --- a/deps/LZMA-SDK/C/Util/Lzma/makefile.gcc +++ b/deps/LZMA-SDK/C/Util/Lzma/makefile.gcc @@ -8,8 +8,10 @@ OBJS = \ $O/7zFile.o \ $O/7zStream.o \ $O/Alloc.o \ + $O/CpuArch.o \ $O/LzFind.o \ $O/LzFindMt.o \ + $O/LzFindOpt.o \ $O/LzmaDec.o \ $O/LzmaEnc.o \ $O/LzmaUtil.o \ diff --git a/deps/LZMA-SDK/C/Util/LzmaLib/LzmaLib.dsp b/deps/LZMA-SDK/C/Util/LzmaLib/LzmaLib.dsp index 0d4c981c4..c267bbef6 100644 --- a/deps/LZMA-SDK/C/Util/LzmaLib/LzmaLib.dsp +++ b/deps/LZMA-SDK/C/Util/LzmaLib/LzmaLib.dsp @@ -136,6 +136,10 @@ SOURCE=..\..\LzFindMt.h # End Source File # Begin Source File +SOURCE=..\..\LzFindOpt.c +# End Source File +# Begin Source File + SOURCE=..\..\LzHash.h # End Source File # Begin Source File diff --git a/deps/LZMA-SDK/C/Util/LzmaLib/makefile b/deps/LZMA-SDK/C/Util/LzmaLib/makefile index e0f311471..bcb7496be 100644 --- a/deps/LZMA-SDK/C/Util/LzmaLib/makefile +++ b/deps/LZMA-SDK/C/Util/LzmaLib/makefile @@ -11,8 +11,10 @@ LIB_OBJS = \ C_OBJS = \ $O\Alloc.obj \ + $O\CpuArch.obj \ $O\LzFind.obj \ $O\LzFindMt.obj \ + $O\LzFindOpt.obj \ $O\LzmaDec.obj \ $O\LzmaEnc.obj \ $O\LzmaLib.obj \ diff --git a/deps/LZMA-SDK/C/XzDec.c b/deps/LZMA-SDK/C/XzDec.c index d345f68c1..49329f16a 100644 --- a/deps/LZMA-SDK/C/XzDec.c +++ b/deps/LZMA-SDK/C/XzDec.c @@ -1,5 +1,5 @@ /* XzDec.c -- Xz Decode -2021-04-01 : Igor Pavlov : Public domain */ +2021-09-04 : Igor Pavlov : Public domain */ #include "Precomp.h" @@ -773,7 +773,8 @@ static BoolInt Xz_CheckFooter(CXzStreamFlags flags, UInt64 indexSize, const Byte #define READ_VARINT_AND_CHECK(buf, pos, size, res) \ { unsigned s = Xz_ReadVarInt(buf + pos, size - pos, res); \ - if (s == 0) return SZ_ERROR_ARCHIVE; pos += s; } + if (s == 0) return SZ_ERROR_ARCHIVE; \ + pos += s; } static BoolInt XzBlock_AreSupportedFilters(const CXzBlock *p) diff --git a/deps/LZMA-SDK/C/XzIn.c b/deps/LZMA-SDK/C/XzIn.c index 54d81c4a4..07201b842 100644 --- a/deps/LZMA-SDK/C/XzIn.c +++ b/deps/LZMA-SDK/C/XzIn.c @@ -1,5 +1,5 @@ /* XzIn.c - Xz input -2021-04-01 : Igor Pavlov : Public domain */ +2021-09-04 : Igor Pavlov : Public domain */ #include "Precomp.h" @@ -26,7 +26,8 @@ SRes Xz_ReadHeader(CXzStreamFlags *p, ISeqInStream *inStream) #define READ_VARINT_AND_CHECK(buf, pos, size, res) \ { unsigned s = Xz_ReadVarInt(buf + pos, size - pos, res); \ - if (s == 0) return SZ_ERROR_ARCHIVE; pos += s; } + if (s == 0) return SZ_ERROR_ARCHIVE; \ + pos += s; } SRes XzBlock_ReadHeader(CXzBlock *p, ISeqInStream *inStream, BoolInt *isIndex, UInt32 *headerSizeRes) { diff --git a/deps/LZMA-SDK/C/var_clang_x64.mak b/deps/LZMA-SDK/C/var_clang_x64.mak index d9013e1cd..b35f0cca4 100644 --- a/deps/LZMA-SDK/C/var_clang_x64.mak +++ b/deps/LZMA-SDK/C/var_clang_x64.mak @@ -9,4 +9,3 @@ USE_ASM=1 CC=$(CROSS_COMPILE)clang CXX=$(CROSS_COMPILE)clang++ USE_CLANG=1 - diff --git a/deps/LZMA-SDK/C/var_clang_x86.mak b/deps/LZMA-SDK/C/var_clang_x86.mak index 9ab916a70..d18f0ae12 100644 --- a/deps/LZMA-SDK/C/var_clang_x86.mak +++ b/deps/LZMA-SDK/C/var_clang_x86.mak @@ -9,4 +9,3 @@ USE_ASM=1 CC=$(CROSS_COMPILE)clang CXX=$(CROSS_COMPILE)clang++ USE_CLANG=1 - diff --git a/deps/LZMA-SDK/C/var_gcc_x86.mak b/deps/LZMA-SDK/C/var_gcc_x86.mak index 9eada64e1..3adf4a84a 100644 --- a/deps/LZMA-SDK/C/var_gcc_x86.mak +++ b/deps/LZMA-SDK/C/var_gcc_x86.mak @@ -8,4 +8,3 @@ MY_ARCH=-m32 USE_ASM=1 CC=$(CROSS_COMPILE)gcc CXX=$(CROSS_COMPILE)g++ - diff --git a/deps/LZMA-SDK/C/warn_gcc.mak b/deps/LZMA-SDK/C/warn_gcc.mak index 3ae796480..8fbdb28f6 100644 --- a/deps/LZMA-SDK/C/warn_gcc.mak +++ b/deps/LZMA-SDK/C/warn_gcc.mak @@ -49,5 +49,3 @@ CFLAGS_WARN_GCC_PPMD_UNALIGNED = \ CFLAGS_WARN = $(CFLAGS_WARN_GCC_9) \ # $(CFLAGS_WARN_GCC_PPMD_UNALIGNED) - - \ No newline at end of file diff --git a/deps/LZMA-SDK/DOC/lzma-history.txt b/deps/LZMA-SDK/DOC/lzma-history.txt index 3fc19fd8b..9c5a2416c 100644 --- a/deps/LZMA-SDK/DOC/lzma-history.txt +++ b/deps/LZMA-SDK/DOC/lzma-history.txt @@ -1,6 +1,42 @@ HISTORY of the LZMA SDK ----------------------- +21.07 2021-12-26 +------------------------- +- New switches: -spm and -im!{file_path} to exclude directories from processing + for specified paths that don't contain path separator character at the end of path. +- The sorting order of files in archives was slightly changed to be more consistent + for cases where the name of some directory is the same as the prefix part of the name + of another directory or file. + + +21.06 2021-11-24 +------------------------- +- Bug in LZMA encoder in file LzmaEnc.c was fixed: + LzmaEnc_MemEncode(), LzmaEncode() and LzmaCompress() could work incorrectly, + if size value for output buffer is smaller than size required for all compressed data. + LzmaEnc_Encode() could work incorrectly, + if callback ISeqOutStream::Write() doesn't write all compressed data. + NCompress::NLzma::CEncoder::Code() could work incorrectly, + if callback ISequentialOutStream::Write() returns error code. +- Bug in versions 21.00-21.05 was fixed: + 7-Zip didn't set attributes of directories during archive extracting. + + +21.04 beta 2021-11-02 +------------------------- +- 7-Zip now reduces the number of working CPU threads for compression, + if RAM size is not enough for compression with big LZMA2 dictionary. +- 7-Zip now can create and check "file.sha256" text files that contain the list + of file names and SHA-256 checksums in format compatible with sha256sum program. + + +21.03 beta 2021-07-20 +------------------------- +- The maximum dictionary size for LZMA/LZMA2 compressing was increased to 4 GB (3840 MiB). +- Minor speed optimizations in LZMA/LZMA2 compressing. + + 21.02 alpha 2021-05-06 ------------------------- - The command line version of 7-Zip for macOS was released. diff --git a/deps/LZMA-SDK/DOC/lzma-sdk.txt b/deps/LZMA-SDK/DOC/lzma-sdk.txt index b11716938..d54aad328 100644 --- a/deps/LZMA-SDK/DOC/lzma-sdk.txt +++ b/deps/LZMA-SDK/DOC/lzma-sdk.txt @@ -1,4 +1,4 @@ -LZMA SDK 21.02 +LZMA SDK 22.01 -------------- LZMA SDK provides the documentation, samples, header files, @@ -62,14 +62,61 @@ LZMA SDK Contents UNIX/Linux version ------------------ -To compile C++ version of file->file LZMA encoding, go to directory -CPP/7zip/Bundles/LzmaCon -and call make to recompile it: - make -f makefile.gcc clean all - -In some UNIX/Linux versions you must compile LZMA with static libraries. -To compile with static libraries, you can use -LIB = -lm -static +There are several otpions to compile 7-Zip with different compilers: gcc and clang. +Also 7-Zip code contains two versions for some critical parts of code: in C and in Assembeler. +So if you compile the version with Assembeler code, you will get faster 7-Zip binary. + +7-Zip's assembler code uses the following syntax for different platforms: + +1) x86 and x86-64 (AMD64): MASM syntax. + There are 2 programs that supports MASM syntax in Linux. +' 'Asmc Macro Assembler and JWasm. But JWasm now doesn't support some + cpu instructions used in 7-Zip. + So you must install Asmc Macro Assembler in Linux, if you want to compile fastest version + of 7-Zip x86 and x86-64: + https://github.com/nidud/asmc + +2) arm64: GNU assembler for ARM64 with preprocessor. + That systax of that arm64 assembler code in 7-Zip is supported by GCC and CLANG for ARM64. + +There are different binaries that can be compiled from 7-Zip source. +There are 2 main files in folder for compiling: + makefile - that can be used for compiling Windows version of 7-Zip with nmake command + makefile.gcc - that can be used for compiling Linux/macOS versions of 7-Zip with make command + +At first you must change the current folder to folder that contains `makefile.gcc`: + + cd CPP/7zip/Bundles/Alone7z + +Then you can compile `makefile.gcc` with the command: + + make -j -f makefile.gcc + +Also there are additional "*.mak" files in folder "CPP/7zip/" that can be used to compile +7-Zip binaries with optimized code and optimzing options. + +To compile with GCC without assembler: + cd CPP/7zip/Bundles/Alone7z + make -j -f ../../cmpl_gcc.mak + +To compile with CLANG without assembler: + make -j -f ../../cmpl_clang.mak + +To compile 7-Zip for x86-64 with asmc assembler: + make -j -f ../../cmpl_gcc_x64.mak + +To compile 7-Zip for arm64 with assembler: + make -j -f ../../cmpl_gcc_arm64.mak + +To compile 7-Zip for arm64 for macOS: + make -j -f ../../cmpl_mac_arm64.mak + +Also you can change some compiler options in the mak files: + cmpl_gcc.mak + var_gcc.mak + warn_gcc.mak + + Also you can use p7zip (port of 7-Zip for POSIX systems like Unix or Linux): diff --git a/docs/changes.txt b/docs/changes.txt index a61dbe080..78deba30e 100644 --- a/docs/changes.txt +++ b/docs/changes.txt @@ -77,6 +77,7 @@ - Apple Driver: Updated requirements to use Apple OpenCL API to macOS 13.0 - use - Backend Checks: Describe workaround in error message when detecting more than 64 backend devices - Brain: Added sanity check and corresponding error message for invalid --brain-port values +- Dependencies: Updated LZMA SDK to 22.01 - Modules: Added support for non-zero IVs for -m 6800 (Lastpass). Also added `tools/lastpass2hashcat.py` - Open Document Format: Added support for small documents with content length < 1024 - Status Code: Add specific return code for self-test fail (-11)