diff --git a/.gitmodules b/.gitmodules
index 57051b12e11a16a5cbf0bd82b9ef1155ec8eecb0..8b181c16d5a4ab57b8bfe963e9175e76a4990926 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -27,3 +27,7 @@
 	path = third_party/lz4
 	url = https://github.com/tarantool/lz4.git
 	ignore = dirty
+[submodule "third_party/zstd"]
+	path = third_party/zstd
+	url = https://github.com/tarantool/zstd.git
+	ignore = dirty
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4ade6188ccaa9cec1a9a62aef872964c309743e2..ff7769d359c75b8dadbc9236b641048ce6b9aec7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -391,6 +391,16 @@ else()
+# zstd
+# Debian: the libzstd-dev package does not ship zstd_static.h
+# Fedora: zstd package not found
+# => always use the bundled version
 # OpenSSL
diff --git a/cmake/BuildZSTD.cmake b/cmake/BuildZSTD.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..31b218fee21a6dcf380955846e4d814e2b133933
--- /dev/null
+++ b/cmake/BuildZSTD.cmake
@@ -0,0 +1,16 @@
+    add_library(zstd STATIC
+        third_party/zstd/lib/zstd_compress.c
+        third_party/zstd/lib/zstd_decompress.c
+        third_party/zstd/lib/zdict.c
+        third_party/zstd/lib/zbuff.c
+        third_party/zstd/lib/fse.c
+        third_party/zstd/lib/huff0.c
+        third_party/zstd/lib/divsufsort.c
+    )
+    set(ZSTD_LIBRARIES zstd)
+    set(ZSTD_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/third_party/zstd/lib)
+    find_package_message(ZSTD "Using bundled ZSTD"
+    add_dependencies(build_bundled_libs zstd)
diff --git a/src/box/CMakeLists.txt b/src/box/CMakeLists.txt
index 82d26a05efeb2b3e09aac3bee58a19285c63c0d5..afec422f42c0032b9b0b5c1955f397040681fc70 100644
--- a/src/box/CMakeLists.txt
+++ b/src/box/CMakeLists.txt
@@ -16,7 +16,7 @@ add_custom_target(box_generate_lua_sources
     DEPENDS ${lua_sources})
+include_directories(${LZ4_INCLUDE_DIRS} ${ZSTD_INCLUDE_DIRS})
 add_library(box STATIC
@@ -82,5 +82,5 @@ add_library(box STATIC
-target_link_libraries(box ${LZ4_LIBRARIES})
+target_link_libraries(box ${LZ4_LIBRARIES} ${ZSTD_LIBRARIES})
 add_dependencies(box build_bundled_libs)
diff --git a/src/box/phia.c b/src/box/phia.c
index f9c26d11ce1c3f1827ffdc93eae180f3262b2c20..8f6f20bca0cadb753fcab911d39718b91caf54b6 100644
--- a/src/box/phia.c
+++ b/src/box/phia.c
@@ -55,13 +55,10 @@
 #include <fcntl.h>
 #include <dirent.h>
 #include <errno.h>
-/* zstd */
-#ifdef __AVX2__
-#include <immintrin.h>
 #include <lz4.h>
 #include <lz4frame.h>
+#include <zstd_static.h>
 #include "crc32.h"
@@ -3395,3942 +3392,67 @@ uint32_t ss_timestamp(void)
 	return time(NULL);
-/* zstd git commit: 765207c54934d478488c236749b01c7d6fc63d70 */
-    zstd - standard compression library
-    Copyright (C) 2014-2015, Yann Collet.
-    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
-    Redistribution and use in source and binary forms, with or without
-    modification, are permitted provided that the following conditions are
-    met:
-    * Redistributions of source code must retain the above copyright
-    notice, this list of conditions and the following disclaimer.
-    * Redistributions in binary form must reproduce the above
-    copyright notice, this list of conditions and the following disclaimer
-    in the documentation and/or other materials provided with the
-    distribution.
-    You can contact the author at :
-    - zstd source repository : https://github.com/Cyan4973/zstd
-    - ztsd public forum : https://groups.google.com/forum/#!forum/lz4c
-    FSE : Finite State Entropy coder
-    Copyright (C) 2013-2015, Yann Collet.
-    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
-    Redistribution and use in source and binary forms, with or without
-    modification, are permitted provided that the following conditions are
-    met:
-    * Redistributions of source code must retain the above copyright
-    notice, this list of conditions and the following disclaimer.
-    * Redistributions in binary form must reproduce the above
-    copyright notice, this list of conditions and the following disclaimer
-    in the documentation and/or other materials provided with the
-    distribution.
-    You can contact the author at :
-    - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
-    - Public forum : https://groups.google.com/forum/#!forum/lz4c
-/* >>>>> zstd.h */
-*  Version
-#define ZSTD_VERSION_MAJOR    0    /* for breaking interface changes  */
-#define ZSTD_VERSION_MINOR    0    /* for new (non-breaking) interface capabilities */
-#define ZSTD_VERSION_RELEASE  2    /* for tweaks, bug-fixes, or development */
-unsigned ZSTD_versionNumber (void);
-*  Simple one-step functions
-size_t ZSTD_compress(   void* dst, size_t maxDstSize,
-                  const void* src, size_t srcSize);
-size_t ZSTD_decompress( void* dst, size_t maxOriginalSize,
-                  const void* src, size_t compressedSize);
-ZSTD_compress() :
-    Compresses 'srcSize' bytes from buffer 'src' into buffer 'dst', of maximum size 'dstSize'.
-    Destination buffer should be sized to handle worst cases situations (input data not compressible).
-    Worst case size evaluation is provided by function ZSTD_compressBound().
-    return : the number of bytes written into buffer 'dst'
-             or an error code if it fails (which can be tested using ZSTD_isError())
-ZSTD_decompress() :
-    compressedSize : is obviously the source size
-    maxOriginalSize : is the size of the 'dst' buffer, which must be already allocated.
-                      It must be equal or larger than originalSize, otherwise decompression will fail.
-    return : the number of bytes decompressed into destination buffer (originalSize)
-             or an errorCode if it fails (which can be tested using ZSTD_isError())
-*  Tool functions
-size_t      ZSTD_compressBound(size_t srcSize);   /* maximum compressed size */
-/* Error Management */
-unsigned    ZSTD_isError(size_t code);         /* tells if a return value is an error code */
-const char* ZSTD_getErrorName(size_t code);    /* provides error code string (useful for debugging) */
-/* <<<<< zstd.h EOF */
-/* >>>>> zstd_static.h */
-*  Streaming functions
-typedef void* ZSTD_cctx_t;
-ZSTD_cctx_t ZSTD_createCCtx(void);
-size_t      ZSTD_freeCCtx(ZSTD_cctx_t cctx);
-size_t ZSTD_compressBegin(ZSTD_cctx_t cctx, void* dst, size_t maxDstSize);
-size_t ZSTD_compressContinue(ZSTD_cctx_t cctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize);
-size_t ZSTD_compressEnd(ZSTD_cctx_t cctx, void* dst, size_t maxDstSize);
-typedef void* ZSTD_dctx_t;
-ZSTD_dctx_t ZSTD_createDCtx(void);
-size_t      ZSTD_freeDCtx(ZSTD_dctx_t dctx);
-size_t ZSTD_nextSrcSizeToDecompress(ZSTD_dctx_t dctx);
-size_t ZSTD_decompressContinue(ZSTD_dctx_t dctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize);
-  Use above functions alternatively.
-  ZSTD_nextSrcSizeToDecompress() tells how much bytes to provide as input to ZSTD_decompressContinue().
-  This value is expected to be provided, precisely, as 'srcSize'.
-  Otherwise, compression will fail (result is an error code, which can be tested using ZSTD_isError() )
-  ZSTD_decompressContinue() result is the number of bytes regenerated within 'dst'.
-  It can be zero, which is not an error; it just means ZSTD_decompressContinue() has decoded some header.
-*  Error management
-        ITEM(ZSTD_ERROR_wrongMagicNumber) \
-        ITEM(ZSTD_ERROR_wrongSrcSize) ITEM(ZSTD_ERROR_maxDstSize_tooSmall) \
-        ITEM(ZSTD_ERROR_wrongLBlockSize) \
-        ITEM(ZSTD_ERROR_maxCode)
-typedef enum { ZSTD_LIST_ERRORS(ZSTD_GENERATE_ENUM) } ZSTD_errorCodes;   /* exposed list of errors; static linking only */
-/* <<<<< zstd_static.h EOF */
-/* >>>>> fse.h */
-*  FSE simple functions
-size_t FSE_compress(void* dst, size_t maxDstSize,
-              const void* src, size_t srcSize);
-size_t FSE_decompress(void* dst, size_t maxDstSize,
-                const void* cSrc, size_t cSrcSize);
-    Compress content of buffer 'src', of size 'srcSize', into destination buffer 'dst'.
-    'dst' buffer must be already allocated, and sized to handle worst case situations.
-    Worst case size evaluation is provided by FSE_compressBound().
-    return : size of compressed data
-    Special values : if result == 0, data is uncompressible => Nothing is stored within cSrc !!
-                     if result == 1, data is one constant element x srcSize times. Use RLE compression.
-                     if FSE_isError(result), it's an error code.
-    Decompress FSE data from buffer 'cSrc', of size 'cSrcSize',
-    into already allocated destination buffer 'dst', of size 'maxDstSize'.
-    ** Important ** : This function doesn't decompress uncompressed nor RLE data !
-    return : size of regenerated data (<= maxDstSize)
-             or an error code, which can be tested using FSE_isError()
-size_t FSE_decompressRLE(void* dst, size_t originalSize,
-                   const void* cSrc, size_t cSrcSize);
-    Decompress specific RLE corner case (equivalent to memset()).
-    cSrcSize must be == 1. originalSize must be exact.
-    return : size of regenerated data (==originalSize)
-             or an error code, which can be tested using FSE_isError()
-Note : there is no function provided for uncompressed data, as it's just a simple memcpy()
-*  Tool functions
-size_t FSE_compressBound(size_t size);       /* maximum compressed size */
-/* Error Management */
-unsigned    FSE_isError(size_t code);        /* tells if a return value is an error code */
-const char* FSE_getErrorName(size_t code);   /* provides error code string (useful for debugging) */
-*  FSE advanced functions
-    Same as FSE_compress(), but allows the selection of 'maxSymbolValue' and 'tableLog'
-    Both parameters can be defined as '0' to mean : use default value
-    return : size of compressed data
-             or -1 if there is an error
-size_t FSE_compress2 (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog);
-   FSE detailed API
-int FSE_compress(char* dest, const char* source, int inputSize) does the following:
-1. count symbol occurrence from source[] into table count[]
-2. normalize counters so that sum(count[]) == Power_of_2 (2^tableLog)
-3. save normalized counters to memory buffer using writeHeader()
-4. build encoding table 'CTable' from normalized counters
-5. encode the data stream using encoding table
-int FSE_decompress(char* dest, int originalSize, const char* compressed) performs:
-1. read normalized counters with readHeader()
-2. build decoding table 'DTable' from normalized counters
-3. decode the data stream using decoding table
-The following API allows triggering specific sub-functions.
-/* *** COMPRESSION *** */
-size_t FSE_count(unsigned* count, const unsigned char* src, size_t srcSize, unsigned* maxSymbolValuePtr);
-unsigned FSE_optimalTableLog(unsigned tableLog, size_t srcSize, unsigned maxSymbolValue);
-size_t FSE_normalizeCount(short* normalizedCounter, unsigned tableLog, const unsigned* count, size_t total, unsigned maxSymbolValue);
-size_t FSE_headerBound(unsigned maxSymbolValue, unsigned tableLog);
-size_t FSE_writeHeader (void* headerBuffer, size_t headerBufferSize, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog);
-void*  FSE_createCTable (unsigned tableLog, unsigned maxSymbolValue);
-void   FSE_freeCTable (void* CTable);
-size_t FSE_buildCTable(void* CTable, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog);
-size_t FSE_compress_usingCTable (void* dst, size_t dstSize, const void* src, size_t srcSize, const void* CTable);
-The first step is to count all symbols. FSE_count() provides one quick way to do this job.
-Result will be saved into 'count', a table of unsigned int, which must be already allocated, and have '*maxSymbolValuePtr+1' cells.
-'source' is a table of char of size 'sourceSize'. All values within 'src' MUST be <= *maxSymbolValuePtr
-*maxSymbolValuePtr will be updated, with its real value (necessarily <= original value)
-FSE_count() will return the number of occurrence of the most frequent symbol.
-If there is an error, the function will return an ErrorCode (which can be tested using FSE_isError()).
-The next step is to normalize the frequencies.
-FSE_normalizeCount() will ensure that sum of frequencies is == 2 ^'tableLog'.
-It also guarantees a minimum of 1 to any Symbol which frequency is >= 1.
-You can use input 'tableLog'==0 to mean "use default tableLog value".
-If you are unsure of which tableLog value to use, you can optionally call FSE_optimalTableLog(),
-which will provide the optimal valid tableLog given sourceSize, maxSymbolValue, and a user-defined maximum (0 means "default").
-The result of FSE_normalizeCount() will be saved into a table,
-called 'normalizedCounter', which is a table of signed short.
-'normalizedCounter' must be already allocated, and have at least 'maxSymbolValue+1' cells.
-The return value is tableLog if everything proceeded as expected.
-It is 0 if there is a single symbol within distribution.
-If there is an error(typically, invalid tableLog value), the function will return an ErrorCode (which can be tested using FSE_isError()).
-'normalizedCounter' can be saved in a compact manner to a memory area using FSE_writeHeader().
-'header' buffer must be already allocated.
-For guaranteed success, buffer size must be at least FSE_headerBound().
-The result of the function is the number of bytes written into 'header'.
-If there is an error, the function will return an ErrorCode (which can be tested using FSE_isError()) (for example, buffer size too small).
-'normalizedCounter' can then be used to create the compression tables 'CTable'.
-The space required by 'CTable' must be already allocated. Its size is provided by FSE_sizeof_CTable().
-'CTable' must be aligned of 4 bytes boundaries.
-You can then use FSE_buildCTable() to fill 'CTable'.
-In both cases, if there is an error, the function will return an ErrorCode (which can be tested using FSE_isError()).
-'CTable' can then be used to compress 'source', with FSE_compress_usingCTable().
-Similar to FSE_count(), the convention is that 'source' is assumed to be a table of char of size 'sourceSize'
-The function returns the size of compressed data (without header), or -1 if failed.
-/* *** DECOMPRESSION *** */
-size_t FSE_readHeader (short* normalizedCounter, unsigned* maxSymbolValuePtr, unsigned* tableLogPtr, const void* headerBuffer, size_t hbSize);
-void*  FSE_createDTable(unsigned tableLog);
-void   FSE_freeDTable(void* DTable);
-size_t FSE_buildDTable (void* DTable, const short* const normalizedCounter, unsigned maxSymbolValue, unsigned tableLog);
-size_t FSE_decompress_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const void* DTable, size_t fastMode);
-If the block is RLE compressed, or uncompressed, use the relevant specific functions.
-The first step is to obtain the normalized frequencies of symbols.
-This can be performed by reading a header with FSE_readHeader().
-'normalizedCounter' must be already allocated, and have at least '*maxSymbolValuePtr+1' cells of short.
-In practice, that means it's necessary to know 'maxSymbolValue' beforehand,
-or size the table to handle worst case situations (typically 256).
-FSE_readHeader will provide 'tableLog' and 'maxSymbolValue' stored into the header.
-The result of FSE_readHeader() is the number of bytes read from 'header'.
-The following values have special meaning :
-return 2 : there is only a single symbol value. The value is provided into the second byte of header.
-return 1 : data is uncompressed
-If there is an error, the function will return an error code, which can be tested using FSE_isError().
-The next step is to create the decompression tables 'DTable' from 'normalizedCounter'.
-This is performed by the function FSE_buildDTable().
-The space required by 'DTable' must be already allocated and properly aligned.
-One can create a DTable using FSE_createDTable().
-The function will return 1 if DTable is compatible with fastMode, 0 otherwise.
-If there is an error, the function will return an error code, which can be tested using FSE_isError().
-'DTable' can then be used to decompress 'compressed', with FSE_decompress_usingDTable().
-Only trigger fastMode if it was authorized by result of FSE_buildDTable(), otherwise decompression will fail.
-cSrcSize must be correct, otherwise decompression will fail.
-FSE_decompress_usingDTable() result will tell how many bytes were regenerated.
-If there is an error, the function will return an error code, which can be tested using FSE_isError().
-*  FSE streaming compression API
-typedef struct
-    size_t bitContainer;
-    int    bitPos;
-    char*  startPtr;
-    char*  ptr;
-} FSE_CStream_t;
-typedef struct
-    ptrdiff_t   value;
-    const void* stateTable;
-    const void* symbolTT;
-    unsigned    stateLog;
-} FSE_CState_t;
-void   FSE_initCStream(FSE_CStream_t* bitC, void* dstBuffer);
-void   FSE_initCState(FSE_CState_t* CStatePtr, const void* CTable);
-void   FSE_encodeByte(FSE_CStream_t* bitC, FSE_CState_t* CStatePtr, unsigned char symbol);
-void   FSE_addBits(FSE_CStream_t* bitC, size_t value, unsigned nbBits);
-void   FSE_flushBits(FSE_CStream_t* bitC);
-void   FSE_flushCState(FSE_CStream_t* bitC, const FSE_CState_t* CStatePtr);
-size_t FSE_closeCStream(FSE_CStream_t* bitC);
-These functions are inner components of FSE_compress_usingCTable().
-They allow creation of custom streams, mixing multiple tables and bit sources.
-A key property to keep in mind is that encoding and decoding are done **in reverse direction**.
-So the first symbol you will encode is the last you will decode, like a lifo stack.
-You will need a few variables to track your CStream. They are :
-void* CTable;           // Provided by FSE_buildCTable()
-FSE_CStream_t bitC;     // bitStream tracking structure
-FSE_CState_t state;     // State tracking structure
-The first thing to do is to init the bitStream, and the state.
-    FSE_initCStream(&bitC, dstBuffer);
-    FSE_initState(&state, CTable);
-You can then encode your input data, byte after byte.
-FSE_encodeByte() outputs a maximum of 'tableLog' bits at a time.
-Remember decoding will be done in reverse direction.
-    FSE_encodeByte(&bitStream, &state, symbol);
-At any time, you can add any bit sequence.
-Note : maximum allowed nbBits is 25, for compatibility with 32-bits decoders
-    FSE_addBits(&bitStream, bitField, nbBits);
-The above methods don't commit data to memory, they just store it into local register, for speed.
-Local register size is 64-bits on 64-bits systems, 32-bits on 32-bits systems (size_t).
-Writing data to memory is a manual operation, performed by the flushBits function.
-    FSE_flushBits(&bitStream);
-Your last FSE encoding operation shall be to flush your last state value(s).
-    FSE_flushState(&bitStream, &state);
-You must then close the bitStream if you opened it with FSE_initCStream().
-It's possible to embed some user-info into the header, as an optionalId [0-31].
-The function returns the size in bytes of CStream.
-If there is an error, it returns an errorCode (which can be tested using FSE_isError()).
-    size_t size = FSE_closeCStream(&bitStream, optionalId);
-*  FSE streaming decompression API
-//typedef unsigned int bitD_t;
-typedef size_t bitD_t;
-typedef struct
-    bitD_t   bitContainer;
-    unsigned bitsConsumed;
-    const char* ptr;
-    const char* start;
-} FSE_DStream_t;
-typedef struct
-    bitD_t      state;
-    const void* table;
-} FSE_DState_t;
-size_t FSE_initDStream(FSE_DStream_t* bitD, const void* srcBuffer, size_t srcSize);
-void   FSE_initDState(FSE_DState_t* DStatePtr, FSE_DStream_t* bitD, const void* DTable);
-unsigned char FSE_decodeSymbol(FSE_DState_t* DStatePtr, FSE_DStream_t* bitD);
-bitD_t        FSE_readBits(FSE_DStream_t* bitD, unsigned nbBits);
-unsigned int  FSE_reloadDStream(FSE_DStream_t* bitD);
-unsigned FSE_endOfDStream(const FSE_DStream_t* bitD);
-unsigned FSE_endOfDState(const FSE_DState_t* DStatePtr);
-Let's now decompose FSE_decompress_usingDTable() into its unitary elements.
-You will decode FSE-encoded symbols from the bitStream,
-and also any other bitFields you put in, **in reverse order**.
-You will need a few variables to track your bitStream. They are :
-FSE_DStream_t DStream;    // Stream context
-FSE_DState_t DState;      // State context. Multiple ones are possible
-const void* DTable;       // Decoding table, provided by FSE_buildDTable()
-U32 tableLog;             // Provided by FSE_readHeader()
-The first thing to do is to init the bitStream.
-    errorCode = FSE_initDStream(&DStream, &optionalId, srcBuffer, srcSize);
-You should then retrieve your initial state(s) (multiple ones are possible) :
-    errorCode = FSE_initDState(&DState, &DStream, DTable, tableLog);
-You can then decode your data, symbol after symbol.
-For information the maximum number of bits read by FSE_decodeSymbol() is 'tableLog'.
-Keep in mind that symbols are decoded in reverse order, like a lifo stack (last in, first out).
-    unsigned char symbol = FSE_decodeSymbol(&DState, &DStream);
-You can retrieve any bitfield you eventually stored into the bitStream (in reverse order)
-Note : maximum allowed nbBits is 25
-    unsigned int bitField = FSE_readBits(&DStream, nbBits);
-All above operations only read from local register (which size is controlled by bitD_t==32 bits).
-Reading data from memory is manually performed by the reload method.
-    endSignal = FSE_reloadDStream(&DStream);
-FSE_reloadDStream() result tells if there is still some more data to read from DStream.
-0 : there is still some data left into the DStream.
-1 Dstream reached end of buffer, but is not yet fully extracted. It will not load data from memory any more.
-2 Dstream reached its exact end, corresponding in general to decompression completed.
-3 Dstream went too far. Decompression result is corrupted.
-When reaching end of buffer(1), progress slowly if you decode multiple symbols per loop,
-to properly detect the exact end of stream.
-After each decoded symbol, check if DStream is fully consumed using this simple test :
-    FSE_reloadDStream(&DStream) >= 2
-When it's done, verify decompression is fully completed, by checking both DStream and the relevant states.
-Checking if DStream has reached its end is performed by :
-    FSE_endOfDStream(&DStream);
-Check also the states. There might be some entropy left there, still able to decode some high probability symbol.
-    FSE_endOfDState(&DState);
-/* <<<<< fse.h EOF */
-/* >>>>> fse_static.h */
-*  Tool functions
-#define FSE_COMPRESSBOUND(size) (size + (size>>7) + FSE_MAX_HEADERSIZE)   /* Macro can be useful for static allocation */
-*  Static allocation
-/* You can statically allocate a CTable as a table of U32 using below macro */
-#define FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue)   (1 + (1<<(maxTableLog-1)) + ((maxSymbolValue+1)*2))
-#define FSE_DTABLE_SIZE_U32(maxTableLog)                   ((1<<maxTableLog)+1)
-*  Error Management
-        ITEM(FSE_ERROR_tableLog_tooLarge) ITEM(FSE_ERROR_maxSymbolValue_tooLarge) \
-        ITEM(FSE_ERROR_dstSize_tooSmall) ITEM(FSE_ERROR_srcSize_wrong)\
-        ITEM(FSE_ERROR_corruptionDetected) \
-        ITEM(FSE_ERROR_maxCode)
-typedef enum { FSE_LIST_ERRORS(FSE_GENERATE_ENUM) } FSE_errorCodes;  /* enum is exposed, to detect & handle specific errors; compare function result to -enum value */
-*  FSE advanced API
-size_t FSE_countFast(unsigned* count, const unsigned char* src, size_t srcSize, unsigned* maxSymbolValuePtr);
-/* same as FSE_count(), but won't check if input really respect that all values within src are <= *maxSymbolValuePtr */
-size_t FSE_buildCTable_raw (void* CTable, unsigned nbBits);
-/* create a fake CTable, designed to not compress an input where each element uses nbBits */
-size_t FSE_buildCTable_rle (void* CTable, unsigned char symbolValue);
-/* create a fake CTable, designed to compress a single identical value */
-size_t FSE_buildDTable_raw (void* DTable, unsigned nbBits);
-/* create a fake DTable, designed to read an uncompressed bitstream where each element uses nbBits */
-size_t FSE_buildDTable_rle (void* DTable, unsigned char symbolValue);
-/* create a fake DTable, designed to always generate the same symbolValue */
-*  FSE streaming API
-bitD_t FSE_readBitsFast(FSE_DStream_t* bitD, unsigned nbBits);
-/* faster, but works only if nbBits >= 1 (otherwise, result will be corrupted) */
-unsigned char FSE_decodeSymbolFast(FSE_DState_t* DStatePtr, FSE_DStream_t* bitD);
-/* faster, but works only if nbBits >= 1 (otherwise, result will be corrupted) */
-/* <<<<< fse_static.h EOF */
-/* >>>>> fse.c */
-*  Tuning parameters
-*  Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.)
-*  Increasing memory usage improves compression ratio
-*  Reduced memory usage can improve speed, due to cache effect
-*  Recommended max value is 14, for 16KB, which nicely fits into Intel x86 L1 cache */
-*  Maximum symbol value authorized.
-*  Required for proper stack allocation */
-*  Generic function type & suffix (C template emulation)
-#endif   /* !FSE_COMMONDEFS_ONLY */
-*  Compiler specifics
-#  define GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
-#  ifdef __GNUC__
-#    define FORCE_INLINE static inline __attribute__((always_inline))
-#  else
-#    define FORCE_INLINE static inline
-#  endif
-*  Basic Types
-#ifndef ZTYPES
-#define ZTYPES 1
-#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* C99 */
-typedef  uint8_t BYTE;
-typedef uint16_t U16;
-typedef  int16_t S16;
-typedef uint32_t U32;
-typedef  int32_t S32;
-typedef uint64_t U64;
-typedef  int64_t S64;
-typedef unsigned char       BYTE;
-typedef unsigned short      U16;
-typedef   signed short      S16;
-typedef unsigned int        U32;
-typedef   signed int        S32;
-typedef unsigned long long  U64;
-typedef   signed long long  S64;
-#endif   /* MEM_ACCESS_MODULE */
-*  Memory I/O
-static unsigned FSE_isLittleEndian(void)
-    const union { U32 i; BYTE c[4]; } one = { 1 };   /* don't use static : performance detrimental  */
-    return one.c[0];
-static U32 FSE_read32(const void* memPtr)
-    U32 val32;
-    memcpy(&val32, memPtr, 4);
-    return val32;
-static U32 FSE_readLE32(const void* memPtr)
-    if (FSE_isLittleEndian())
-        return FSE_read32(memPtr);
-    else
-    {
-        const BYTE* p = (const BYTE*)memPtr;
-        return (U32)((U32)p[0] + ((U32)p[1]<<8) + ((U32)p[2]<<16) + ((U32)p[3]<<24));
-    }
-static void FSE_writeLE32(void* memPtr, U32 val32)
-    if (FSE_isLittleEndian())
-    {
-        memcpy(memPtr, &val32, 4);
-    }
-    else
-    {
-        BYTE* p = (BYTE*)memPtr;
-        p[0] = (BYTE)val32;
-        p[1] = (BYTE)(val32>>8);
-        p[2] = (BYTE)(val32>>16);
-        p[3] = (BYTE)(val32>>24);
-    }
-static U64 FSE_read64(const void* memPtr)
-    U64 val64;
-    memcpy(&val64, memPtr, 8);
-    return val64;
-static U64 FSE_readLE64(const void* memPtr)
-    if (FSE_isLittleEndian())
-        return FSE_read64(memPtr);
-    else
-    {
-        const BYTE* p = (const BYTE*)memPtr;
-        return (U64)((U64)p[0] + ((U64)p[1]<<8) + ((U64)p[2]<<16) + ((U64)p[3]<<24)
-                     + ((U64)p[4]<<32) + ((U64)p[5]<<40) + ((U64)p[6]<<48) + ((U64)p[7]<<56));
-    }
-static void FSE_writeLE64(void* memPtr, U64 val64)
-    if (FSE_isLittleEndian())
-    {
-        memcpy(memPtr, &val64, 8);
-    }
-    else
-    {
-        BYTE* p = (BYTE*)memPtr;
-        p[0] = (BYTE)val64;
-        p[1] = (BYTE)(val64>>8);
-        p[2] = (BYTE)(val64>>16);
-        p[3] = (BYTE)(val64>>24);
-        p[4] = (BYTE)(val64>>32);
-        p[5] = (BYTE)(val64>>40);
-        p[6] = (BYTE)(val64>>48);
-        p[7] = (BYTE)(val64>>56);
-    }
-static size_t FSE_readLEST(const void* memPtr)
-    if (sizeof(size_t)==4)
-        return (size_t)FSE_readLE32(memPtr);
-    else
-        return (size_t)FSE_readLE64(memPtr);
-static void FSE_writeLEST(void* memPtr, size_t val)
-    if (sizeof(size_t)==4)
-        FSE_writeLE32(memPtr, (U32)val);
-    else
-        FSE_writeLE64(memPtr, (U64)val);
-*  Constants
-*  Error Management
-#define FSE_STATIC_ASSERT(c) { enum { FSE_static_assert = 1/(int)(!!(c)) }; }   /* use only *after* variable declarations */
-*  Complex types
-typedef struct
-    int  deltaFindState;
-    U16  maxState;
-    BYTE minBitsOut;
-    /* one byte padding */
-} FSE_symbolCompressionTransform;
-typedef struct
-    U32 fakeTable[FSE_CTABLE_SIZE_U32(FSE_MAX_TABLELOG, FSE_MAX_SYMBOL_VALUE)];   /* compatible with FSE_compressU16() */
-} CTable_max_t;
-*  Internal functions
-FORCE_INLINE unsigned FSE_highbit32 (register U32 val)
-#   if defined(_MSC_VER)   /* Visual */
-    unsigned long r;
-    _BitScanReverse ( &r, val );
-    return (unsigned) r;
-#   elif defined(__GNUC__) && (GCC_VERSION >= 304)   /* GCC Intrinsic */
-    return 31 - __builtin_clz (val);
-#   else   /* Software version */
-    static const unsigned DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 };
-    U32 v = val;
-    unsigned r;
-    v |= v >> 1;
-    v |= v >> 2;
-    v |= v >> 4;
-    v |= v >> 8;
-    v |= v >> 16;
-    r = DeBruijnClz[ (U32) (v * 0x07C4ACDDU) >> 27];
-    return r;
-#   endif
-unsigned FSE_isError(size_t code) { return (code > (size_t)(-FSE_ERROR_maxCode)); }
-static const char* FSE_errorStrings[] = { FSE_LIST_ERRORS(FSE_GENERATE_STRING) };
-const char* FSE_getErrorName(size_t code)
-    static const char* codeError = "Unspecified error code";
-    if (FSE_isError(code)) return FSE_errorStrings[-(int)(code)];
-    return codeError;
-static short FSE_abs(short a)
-    return a<0? -a : a;
-*  Header bitstream management
-size_t FSE_headerBound(unsigned maxSymbolValue, unsigned tableLog)
-    size_t maxHeaderSize = (((maxSymbolValue+1) * tableLog) >> 3) + 1;
-    return maxSymbolValue ? maxHeaderSize : FSE_MAX_HEADERSIZE;
-static size_t FSE_writeHeader_generic (void* header, size_t headerBufferSize,
-                                       const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog,
-                                       unsigned safeWrite)
-    BYTE* const ostart = (BYTE*) header;
-    BYTE* out = ostart;
-    BYTE* const oend = ostart + headerBufferSize;
-    int nbBits;
-    const int tableSize = 1 << tableLog;
-    int remaining;
-    int threshold;
-    U32 bitStream;
-    int bitCount;
-    unsigned charnum = 0;
-    int previous0 = 0;
-    bitStream = 0;
-    bitCount  = 0;
-    /* Table Size */
-    bitStream += (tableLog-FSE_MIN_TABLELOG) << bitCount;
-    bitCount  += 4;
-    /* Init */
-    remaining = tableSize+1;   /* +1 for extra accuracy */
-    threshold = tableSize;
-    nbBits = tableLog+1;
-    while (remaining>1)   /* stops at 1 */
-    {
-        if (previous0)
-        {
-            unsigned start = charnum;
-            while (!normalizedCounter[charnum]) charnum++;
-            while (charnum >= start+24)
-            {
-                start+=24;
-                bitStream += 0xFFFF<<bitCount;
-                if ((!safeWrite) && (out > oend-2)) return (size_t)-FSE_ERROR_GENERIC;   /* Buffer overflow */
-                out[0] = (BYTE)bitStream;
-                out[1] = (BYTE)(bitStream>>8);
-                out+=2;
-                bitStream>>=16;
-            }
-            while (charnum >= start+3)
-            {
-                start+=3;
-                bitStream += 3 << bitCount;
-                bitCount += 2;
-            }
-            bitStream += (charnum-start) << bitCount;
-            bitCount += 2;
-            if (bitCount>16)
-            {
-                if ((!safeWrite) && (out > oend - 2)) return (size_t)-FSE_ERROR_GENERIC;   /* Buffer overflow */
-                out[0] = (BYTE)bitStream;
-                out[1] = (BYTE)(bitStream>>8);
-                out += 2;
-                bitStream >>= 16;
-                bitCount -= 16;
-            }
-        }
-        {
-            short count = normalizedCounter[charnum++];
-            const short max = (short)((2*threshold-1)-remaining);
-            remaining -= FSE_abs(count);
-            if (remaining<0) return (size_t)-FSE_ERROR_GENERIC;
-            count++;   /* +1 for extra accuracy */
-            if (count>=threshold) count += max;   /* [0..max[ [max..threshold[ (...) [threshold+max 2*threshold[ */
-            bitStream += count << bitCount;
-            bitCount  += nbBits;
-            bitCount  -= (count<max);
-            previous0 = (count==1);
-            while (remaining<threshold) nbBits--, threshold>>=1;
-        }
-        if (bitCount>16)
-        {
-            if ((!safeWrite) && (out > oend - 2)) return (size_t)-FSE_ERROR_GENERIC;   /* Buffer overflow */
-            out[0] = (BYTE)bitStream;
-            out[1] = (BYTE)(bitStream>>8);
-            out += 2;
-            bitStream >>= 16;
-            bitCount -= 16;
-        }
-    }
-    /* flush remaining bitStream */
-    if ((!safeWrite) && (out > oend - 2)) return (size_t)-FSE_ERROR_GENERIC;   /* Buffer overflow */
-    out[0] = (BYTE)bitStream;
-    out[1] = (BYTE)(bitStream>>8);
-    out+= (bitCount+7) /8;
-    if (charnum > maxSymbolValue + 1) return (size_t)-FSE_ERROR_GENERIC;   /* Too many symbols written (a bit too late?) */
-    return (out-ostart);
-size_t FSE_writeHeader (void* header, size_t headerBufferSize, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
-    if (tableLog > FSE_MAX_TABLELOG) return (size_t)-FSE_ERROR_GENERIC;   /* Unsupported */
-    if (tableLog < FSE_MIN_TABLELOG) return (size_t)-FSE_ERROR_GENERIC;   /* Unsupported */
-    if (headerBufferSize < FSE_headerBound(maxSymbolValue, tableLog))
-        return FSE_writeHeader_generic(header, headerBufferSize, normalizedCounter, maxSymbolValue, tableLog, 0);
-    return FSE_writeHeader_generic(header, headerBufferSize, normalizedCounter, maxSymbolValue, tableLog, 1);
-size_t FSE_readHeader (short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr,
-                 const void* headerBuffer, size_t hbSize)
-    const BYTE* const istart = (const BYTE*) headerBuffer;
-    const BYTE* ip = istart;
-    int nbBits;
-    int remaining;
-    int threshold;
-    U32 bitStream;
-    int bitCount;
-    unsigned charnum = 0;
-    int previous0 = 0;
-    bitStream = FSE_readLE32(ip);
-    nbBits = (bitStream & 0xF) + FSE_MIN_TABLELOG;   /* extract tableLog */
-    if (nbBits > FSE_TABLELOG_ABSOLUTE_MAX) return (size_t)-FSE_ERROR_tableLog_tooLarge;
-    bitStream >>= 4;
-    bitCount = 4;
-    *tableLogPtr = nbBits;
-    remaining = (1<<nbBits)+1;
-    threshold = 1<<nbBits;
-    nbBits++;
-    while ((remaining>1) && (charnum<=*maxSVPtr))
-    {
-        if (previous0)
-        {
-            unsigned n0 = charnum;
-            while ((bitStream & 0xFFFF) == 0xFFFF)
-            {
-                n0+=24;
-                ip+=2;
-                bitStream = FSE_readLE32(ip) >> bitCount;
-            }
-            while ((bitStream & 3) == 3)
-            {
-                n0+=3;
-                bitStream>>=2;
-                bitCount+=2;
-            }
-            n0 += bitStream & 3;
-            bitCount += 2;
-            if (n0 > *maxSVPtr) return (size_t)-FSE_ERROR_GENERIC;
-            while (charnum < n0) normalizedCounter[charnum++] = 0;
-            ip += bitCount>>3;
-            bitCount &= 7;
-            bitStream = FSE_readLE32(ip) >> bitCount;
-        }
-        {
-            const short max = (short)((2*threshold-1)-remaining);
-            short count;
-            if ((bitStream & (threshold-1)) < (U32)max)
-            {
-                count = (short)(bitStream & (threshold-1));
-                bitCount   += nbBits-1;
-            }
-            else
-            {
-                count = (short)(bitStream & (2*threshold-1));
-                if (count >= threshold) count -= max;
-                bitCount   += nbBits;
-            }
-            count--;   /* extra accuracy */
-            remaining -= FSE_abs(count);
-            normalizedCounter[charnum++] = count;
-            previous0 = !count;
-            while (remaining < threshold)
-            {
-                nbBits--;
-                threshold >>= 1;
-            }
-            ip += bitCount>>3;
-            bitCount &= 7;
-            bitStream = FSE_readLE32(ip) >> bitCount;
-        }
-    }
-    if (remaining != 1) return (size_t)-FSE_ERROR_GENERIC;
-    *maxSVPtr = charnum-1;
-    ip += bitCount>0;
-    if ((size_t)(ip-istart) >= hbSize) return (size_t)-FSE_ERROR_srcSize_wrong;   /* arguably a bit late , tbd */
-    return ip-istart;
-*  FSE Compression Code
-CTable is a variable size structure which contains :
-    U16 tableLog;
-    U16 maxSymbolValue;
-    U16 nextStateNumber[1 << tableLog];                         // This size is variable
-    FSE_symbolCompressionTransform symbolTT[maxSymbolValue+1];  // This size is variable
-Allocation is manual, since C standard does not support variable-size structures.
-size_t FSE_sizeof_CTable (unsigned maxSymbolValue, unsigned tableLog)
-    size_t size;
-    FSE_STATIC_ASSERT((size_t)FSE_CTABLE_SIZE_U32(FSE_MAX_TABLELOG, FSE_MAX_SYMBOL_VALUE)*4 >= sizeof(CTable_max_t));   /* A compilation error here means FSE_CTABLE_SIZE_U32 is not large enough */
-    if (tableLog > FSE_MAX_TABLELOG) return (size_t)-FSE_ERROR_GENERIC;
-    size = FSE_CTABLE_SIZE_U32 (tableLog, maxSymbolValue) * sizeof(U32);
-    return size;
-void* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog)
-    size_t size;
-    size = FSE_CTABLE_SIZE_U32 (tableLog, maxSymbolValue) * sizeof(U32);
-    return malloc(size);
-void  FSE_freeCTable (void* CTable)
-    free(CTable);
-unsigned FSE_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue)
-    U32 tableLog = maxTableLog;
-    if (tableLog==0) tableLog = FSE_DEFAULT_TABLELOG;
-    if ((FSE_highbit32((U32)(srcSize - 1)) - 2) < tableLog) tableLog = FSE_highbit32((U32)(srcSize - 1)) - 2;   /* Accuracy can be reduced */
-    if ((FSE_highbit32(maxSymbolValue+1)+1) > tableLog) tableLog = FSE_highbit32(maxSymbolValue+1)+1;   /* Need a minimum to safely represent all symbol values */
-    if (tableLog < FSE_MIN_TABLELOG) tableLog = FSE_MIN_TABLELOG;
-    if (tableLog > FSE_MAX_TABLELOG) tableLog = FSE_MAX_TABLELOG;
-    return tableLog;
-typedef struct
-    U32 id;
-    U32 count;
-} rank_t;
-int FSE_compareRankT(const void* r1, const void* r2)
-    const rank_t* R1 = (const rank_t*)r1;
-    const rank_t* R2 = (const rank_t*)r2;
-    return 2 * (R1->count < R2->count) - 1;
-#if 0
-static size_t FSE_adjustNormSlow(short* norm, int pointsToRemove, const unsigned* count, U32 maxSymbolValue)
-    rank_t rank[FSE_MAX_SYMBOL_VALUE+2];
-    U32 s;
-    /* Init */
-    for (s=0; s<=maxSymbolValue; s++)
-    {
-        rank[s].id = s;
-        rank[s].count = count[s];
-        if (norm[s] <= 1) rank[s].count = 0;
-    }
-    rank[maxSymbolValue+1].id = 0;
-    rank[maxSymbolValue+1].count = 0;   /* ensures comparison ends here in worst case */
-    /* Sort according to count */
-    qsort(rank, maxSymbolValue+1, sizeof(rank_t), FSE_compareRankT);
-    while(pointsToRemove)
-    {
-        int newRank = 1;
-        rank_t savedR;
-        if (norm[rank[0].id] == 1)
-            return (size_t)-FSE_ERROR_GENERIC;
-        norm[rank[0].id]--;
-        pointsToRemove--;
-        rank[0].count -= (rank[0].count + 6) >> 3;
-        if (norm[rank[0].id] == 1)
-            rank[0].count=0;
-        savedR = rank[0];
-        while (rank[newRank].count > savedR.count)
-        {
-            rank[newRank-1] = rank[newRank];
-            newRank++;
-        }
-        rank[newRank-1] = savedR;
-    }
-    return 0;
-/* Secondary normalization method.
-   To be used when primary method fails. */
-static size_t FSE_normalizeM2(short* norm, U32 tableLog, const unsigned* count, size_t total, U32 maxSymbolValue)
-    U32 s;
-    U32 distributed = 0;
-    U32 ToDistribute;
-    /* Init */
-    U32 lowThreshold = (U32)(total >> tableLog);
-    U32 lowOne = (U32)((total * 3) >> (tableLog + 1));
-    for (s=0; s<=maxSymbolValue; s++)
-    {
-        if (count[s] == 0)
-        {
-            norm[s]=0;
-            continue;
-        }
-        if (count[s] <= lowThreshold)
-        {
-            norm[s] = -1;
-            distributed++;
-            total -= count[s];
-            continue;
-        }
-        if (count[s] <= lowOne)
-        {
-            norm[s] = 1;
-            distributed++;
-            total -= count[s];
-            continue;
-        }
-        norm[s]=-2;
-    }
-    ToDistribute = (1 << tableLog) - distributed;
-    if ((total / ToDistribute) > lowOne)
-    {
-        /* risk of rounding to zero */
-        lowOne = (U32)((total * 3) / (ToDistribute * 2));
-        for (s=0; s<=maxSymbolValue; s++)
-        {
-            if ((norm[s] == -2) && (count[s] <= lowOne))
-            {
-                norm[s] = 1;
-                distributed++;
-                total -= count[s];
-                continue;
-            }
-        }
-        ToDistribute = (1 << tableLog) - distributed;
-    }
-    if (distributed == maxSymbolValue+1)
-    {
-        /* all values are pretty poor;
-           probably incompressible data (should have already been detected);
-           find max, then give all remaining points to max */
-        U32 maxV = 0, maxC =0;
-        for (s=0; s<=maxSymbolValue; s++)
-            if (count[s] > maxC) maxV=s, maxC=count[s];
-        norm[maxV] += ToDistribute;
-        return 0;
-    }
-    {
-        U64 const vStepLog = 62 - tableLog;
-        U64 const mid = (1ULL << (vStepLog-1)) - 1;
-        U64 const rStep = ((((U64)1<<vStepLog) * ToDistribute) + mid) / total;   /* scale on remaining */
-        U64 tmpTotal = mid;
-        for (s=0; s<=maxSymbolValue; s++)
-        {
-            if (norm[s]==-2)
-            {
-                U64 end = tmpTotal + (count[s] * rStep);
-                U32 sStart = (U32)(tmpTotal >> vStepLog);
-                U32 sEnd = (U32)(end >> vStepLog);
-                U32 weight = sEnd - sStart;
-                if (weight < 1)
-                    return (size_t)-FSE_ERROR_GENERIC;
-                norm[s] = weight;
-                tmpTotal = end;
-            }
-        }
-    }
-    return 0;
-size_t FSE_normalizeCount (short* normalizedCounter, unsigned tableLog,
-                           const unsigned* count, size_t total,
-                           unsigned maxSymbolValue)
-    /* Sanity checks */
-    if (tableLog==0) tableLog = FSE_DEFAULT_TABLELOG;
-    if (tableLog < FSE_MIN_TABLELOG) return (size_t)-FSE_ERROR_GENERIC;   /* Unsupported size */
-    if (tableLog > FSE_MAX_TABLELOG) return (size_t)-FSE_ERROR_GENERIC;   /* Unsupported size */
-    if ((1U<<tableLog) <= maxSymbolValue) return (size_t)-FSE_ERROR_GENERIC;   /* Too small tableLog, compression potentially impossible */
-    {
-        U32 const rtbTable[] = {     0, 473195, 504333, 520860, 550000, 700000, 750000, 830000 };
-        U64 const scale = 62 - tableLog;
-        U64 const step = ((U64)1<<62) / total;   /* <== here, one division ! */
-        U64 const vStep = 1ULL<<(scale-20);
-        int stillToDistribute = 1<<tableLog;
-        unsigned s;
-        unsigned largest=0;
-        short largestP=0;
-        U32 lowThreshold = (U32)(total >> tableLog);
-        for (s=0; s<=maxSymbolValue; s++)
-        {
-            if (count[s] == total) return 0;
-            if (count[s] == 0)
-            {
-                normalizedCounter[s]=0;
-                continue;
-            }
-            if (count[s] <= lowThreshold)
-            {
-                normalizedCounter[s] = -1;
-                stillToDistribute--;
-            }
-            else
-            {
-                short proba = (short)((count[s]*step) >> scale);
-                if (proba<8)
-                {
-                    U64 restToBeat = vStep * rtbTable[proba];
-                    proba += (count[s]*step) - ((U64)proba<<scale) > restToBeat;
-                }
-                if (proba > largestP)
-                {
-                    largestP=proba;
-                    largest=s;
-                }
-                normalizedCounter[s] = proba;
-                stillToDistribute -= proba;
-            }
-        }
-        if (-stillToDistribute >= (normalizedCounter[largest] >> 1))
-        {
-            /* corner case, need another normalization method */
-            size_t errorCode = FSE_normalizeM2(normalizedCounter, tableLog, count, total, maxSymbolValue);
-            if (FSE_isError(errorCode)) return errorCode;
-        }
-        else normalizedCounter[largest] += (short)stillToDistribute;
-    }
-#if 0
-    {   /* Print Table (debug) */
-        U32 s;
-        U32 nTotal = 0;
-        for (s=0; s<=maxSymbolValue; s++)
-            printf("%3i: %4i \n", s, normalizedCounter[s]);
-        for (s=0; s<=maxSymbolValue; s++)
-            nTotal += abs(normalizedCounter[s]);
-        if (nTotal != (1U<<tableLog))
-            printf("Warning !!! Total == %u != %u !!!", nTotal, 1U<<tableLog);
-        getchar();
-    }
-    return tableLog;
-/* fake CTable, for raw (uncompressed) input */
-size_t FSE_buildCTable_raw (void* CTable, unsigned nbBits)
-    const unsigned tableSize = 1 << nbBits;
-    const unsigned tableMask = tableSize - 1;
-    const unsigned maxSymbolValue = tableMask;
-    U16* tableU16 = ( (U16*) CTable) + 2;
-    FSE_symbolCompressionTransform* symbolTT = (FSE_symbolCompressionTransform*) ((((U32*)CTable)+1) + (tableSize>>1));
-    unsigned s;
-    /* Sanity checks */
-    if (nbBits < 1) return (size_t)-FSE_ERROR_GENERIC;             /* min size */
-    if (((size_t)CTable) & 3) return (size_t)-FSE_ERROR_GENERIC;   /* Must be allocated of 4 bytes boundaries */
-    /* header */
-    tableU16[-2] = (U16) nbBits;
-    tableU16[-1] = (U16) maxSymbolValue;
-    /* Build table */
-    for (s=0; s<tableSize; s++)
-        tableU16[s] = (U16)(tableSize + s);
-    /* Build Symbol Transformation Table */
-    for (s=0; s<=maxSymbolValue; s++)
-    {
-        symbolTT[s].minBitsOut = (BYTE)nbBits;
-        symbolTT[s].deltaFindState = s-1;
-        symbolTT[s].maxState = (U16)( (tableSize*2) - 1);   /* ensures state <= maxState */
-    }
-    return 0;
-/* fake CTable, for rle (100% always same symbol) input */
-size_t FSE_buildCTable_rle (void* CTable, BYTE symbolValue)
-    const unsigned tableSize = 1;
-    U16* tableU16 = ( (U16*) CTable) + 2;
-    FSE_symbolCompressionTransform* symbolTT = (FSE_symbolCompressionTransform*) ((U32*)CTable + 2);
-    /* safety checks */
-    if (((size_t)CTable) & 3) return (size_t)-FSE_ERROR_GENERIC;   /* Must be 4 bytes aligned */
-    /* header */
-    tableU16[-2] = (U16) 0;
-    tableU16[-1] = (U16) symbolValue;
-    /* Build table */
-    tableU16[0] = 0;
-    tableU16[1] = 0;   /* just in case */
-    /* Build Symbol Transformation Table */
-    {
-        symbolTT[symbolValue].minBitsOut = 0;
-        symbolTT[symbolValue].deltaFindState = 0;
-        symbolTT[symbolValue].maxState = (U16)(2*tableSize-1);   /* ensures state <= maxState */
-    }
-    return 0;
-void FSE_initCStream(FSE_CStream_t* bitC, void* start)
-    bitC->bitContainer = 0;
-    bitC->bitPos = 0;   /* reserved for unusedBits */
-    bitC->startPtr = (char*)start;
-    bitC->ptr = bitC->startPtr;
+typedef struct sszstdfilter sszstdfilter; /* shorthand for the zstd filter's private state struct */
-void FSE_initCState(FSE_CState_t* statePtr, const void* CTable)
-    const U32 tableLog = ( (U16*) CTable) [0];
-    statePtr->value = (ptrdiff_t)1<<tableLog;
-    statePtr->stateTable = ((const U16*) CTable) + 2;
-    statePtr->symbolTT = (const U32*)CTable + 1 + (tableLog ? (1<<(tableLog-1)) : 1);
-    statePtr->stateLog = tableLog;
+struct sszstdfilter { /* private state of the zstd filter, reached through ssfilter->priv */
+	void *ctx; /* ZSTD_createCCtx() result when f->op is SS_FINPUT, NULL when it is SS_FOUTPUT (see ss_zstdfilter_init) */
+} sspacked;
-void FSE_addBits(FSE_CStream_t* bitC, size_t value, unsigned nbBits)
-    static const unsigned mask[] = { 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, 0x1FF, 0x3FF, 0x7FF, 0xFFF, 0x1FFF, 0x3FFF, 0x7FFF, 0xFFFF, 0x1FFFF, 0x3FFFF, 0x7FFFF, 0xFFFFF, 0x1FFFFF, 0x3FFFFF, 0x7FFFFF,  0xFFFFFF, 0x1FFFFFF };   /* up to 25 bits */
-    bitC->bitContainer |= (value & mask[nbBits]) << bitC->bitPos;
-    bitC->bitPos += nbBits;
+static const size_t ZSTD_blockHeaderSize = 3; /* NOTE(review): not referenced by the filter hooks visible here; presumably the zstd block header size in bytes -- confirm it is needed */
-void FSE_encodeByte(FSE_CStream_t* bitC, FSE_CState_t* statePtr, BYTE symbol)
+static int /* init hook: returns 0 on success, -1 when the zstd context cannot be created */
+ss_zstdfilter_init(ssfilter *f, va_list args ssunused)
-    const FSE_symbolCompressionTransform* const symbolTT = (const FSE_symbolCompressionTransform*) statePtr->symbolTT;
-    const U16* const stateTable = (const U16*) statePtr->stateTable;
-    int nbBitsOut  = symbolTT[symbol].minBitsOut;
-    nbBitsOut -= (int)((symbolTT[symbol].maxState - statePtr->value) >> 31);
-    FSE_addBits(bitC, statePtr->value, nbBitsOut);
-    statePtr->value = stateTable[ (statePtr->value >> nbBitsOut) + symbolTT[symbol].deltaFindState];
+	sszstdfilter *z = (sszstdfilter*)f->priv; /* filter-private state is stored in f->priv */
+	switch (f->op) {
+	case SS_FINPUT:
+		z->ctx = ZSTD_createCCtx(); /* SS_FINPUT needs a zstd compression context */
+		if (ssunlikely(z->ctx == NULL))
+			return -1; /* context creation failed */
+		break;
+	case SS_FOUTPUT:
+		z->ctx = NULL; /* no context is created for SS_FOUTPUT; leave the field in a defined state */
+		break;
+	}
+	return 0;
-void FSE_flushBits(FSE_CStream_t* bitC)
+static int /* free hook: releases what ss_zstdfilter_init created; always returns 0 */
+ss_zstdfilter_free(ssfilter *f)
-    size_t nbBytes = bitC->bitPos >> 3;
-    FSE_writeLEST(bitC->ptr, bitC->bitContainer);
-    bitC->bitPos &= 7;
-    bitC->ptr += nbBytes;
-    bitC->bitContainer >>= nbBytes*8;
+	sszstdfilter *z = (sszstdfilter*)f->priv; /* filter-private state is stored in f->priv */
+	switch (f->op) {
+	case SS_FINPUT:
+		ZSTD_freeCCtx(z->ctx); /* return value is ignored; z->ctx is left dangling afterwards */
+		break;
+	case SS_FOUTPUT:
+		break; /* init stored NULL for this op, so there is nothing to release */
+	}
+	return 0;
-void FSE_flushCState(FSE_CStream_t* bitC, const FSE_CState_t* statePtr)
+static int /* start hook: returns 0 on success, -1 when zstd reports an error */
+ss_zstdfilter_start(ssfilter *f, ssbuf *dest)
-    FSE_addBits(bitC, statePtr->value, statePtr->stateLog);
-    FSE_flushBits(bitC);
+	(void)dest; /* dest is not used by this hook */
+	sszstdfilter *z = (sszstdfilter*)f->priv; /* filter-private state is stored in f->priv */
+	size_t sz; /* zstd result code, checked with ZSTD_isError() */
+	switch (f->op) {
+	case SS_FINPUT:; /* empty statement: in C11 a declaration may not directly follow a case label */
+		int compressionLevel = 3; /* fast */
+		sz = ZSTD_compressBegin(z->ctx, compressionLevel); /* begin compression on the context created in init */
+		if (ssunlikely(ZSTD_isError(sz)))
+			return -1; /* zstd reported an error */
+		break;
+	case SS_FOUTPUT:
+		/* do nothing */
+		break;
+	}
+	return 0;
-size_t FSE_closeCStream(FSE_CStream_t* bitC)
-    char* endPtr;
-    FSE_addBits(bitC, 1, 1);
-    FSE_flushBits(bitC);
-    endPtr = bitC->ptr;
-    endPtr += bitC->bitPos > 0;
-    return (endPtr - bitC->startPtr);
-size_t FSE_compress_usingCTable (void* dst, size_t dstSize,
-                           const void* src, size_t srcSize,
-                           const void* CTable)
-    const BYTE* const istart = (const BYTE*) src;
-    const BYTE* ip;
-    const BYTE* const iend = istart + srcSize;
-    FSE_CStream_t bitC;
-    FSE_CState_t CState1, CState2;
-    /* init */
-    (void)dstSize;   /* objective : ensure it fits into dstBuffer (Todo) */
-    FSE_initCStream(&bitC, dst);
-    FSE_initCState(&CState1, CTable);
-    CState2 = CState1;
-    ip=iend;
-    /* join to even */
-    if (srcSize & 1)
-    {
-        FSE_encodeByte(&bitC, &CState1, *--ip);
-        FSE_flushBits(&bitC);
-    }
-    /* join to mod 4 */
-    if ((sizeof(size_t)*8 > FSE_MAX_TABLELOG*4+7 ) && (srcSize & 2))   /* test bit 2 */
-    {
-        FSE_encodeByte(&bitC, &CState2, *--ip);
-        FSE_encodeByte(&bitC, &CState1, *--ip);
-        FSE_flushBits(&bitC);
-    }
-    /* 2 or 4 encoding per loop */
-    while (ip>istart)
-    {
-        FSE_encodeByte(&bitC, &CState2, *--ip);
-        if (sizeof(size_t)*8 < FSE_MAX_TABLELOG*2+7 )   /* this test must be static */
-            FSE_flushBits(&bitC);
-        FSE_encodeByte(&bitC, &CState1, *--ip);
-        if (sizeof(size_t)*8 > FSE_MAX_TABLELOG*4+7 )   /* this test must be static */
-        {
-            FSE_encodeByte(&bitC, &CState2, *--ip);
-            FSE_encodeByte(&bitC, &CState1, *--ip);
-        }
-        FSE_flushBits(&bitC);
-    }
-    FSE_flushCState(&bitC, &CState2);
-    FSE_flushCState(&bitC, &CState1);
-    return FSE_closeCStream(&bitC);
-size_t FSE_compressBound(size_t size) { return FSE_COMPRESSBOUND(size); }
-size_t FSE_compress2 (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog)
-    const BYTE* const istart = (const BYTE*) src;
-    const BYTE* ip = istart;
-    BYTE* const ostart = (BYTE*) dst;
-    BYTE* op = ostart;
-    BYTE* const oend = ostart + dstSize;
-    U32   count[FSE_MAX_SYMBOL_VALUE+1];
-    S16   norm[FSE_MAX_SYMBOL_VALUE+1];
-    CTable_max_t CTable;
-    size_t errorCode;
-    /* early out */
-    if (dstSize < FSE_compressBound(srcSize)) return (size_t)-FSE_ERROR_dstSize_tooSmall;
-    if (srcSize <= 1) return srcSize;  /* Uncompressed or RLE */
-    if (!maxSymbolValue) maxSymbolValue = FSE_MAX_SYMBOL_VALUE;
-    if (!tableLog) tableLog = FSE_DEFAULT_TABLELOG;
-    /* Scan input and build symbol stats */
-    errorCode = FSE_count (count, ip, srcSize, &maxSymbolValue);
-    if (FSE_isError(errorCode)) return errorCode;
-    if (errorCode == srcSize) return 1;
-    if (errorCode < (srcSize >> 7)) return 0;   /* Heuristic : not compressible enough */
-    tableLog = FSE_optimalTableLog(tableLog, srcSize, maxSymbolValue);
-    errorCode = FSE_normalizeCount (norm, tableLog, count, srcSize, maxSymbolValue);
-    if (FSE_isError(errorCode)) return errorCode;
-    /* Write table description header */
-    errorCode = FSE_writeHeader (op, FSE_MAX_HEADERSIZE, norm, maxSymbolValue, tableLog);
-    if (FSE_isError(errorCode)) return errorCode;
-    op += errorCode;
-    /* Compress */
-    errorCode = FSE_buildCTable (&CTable, norm, maxSymbolValue, tableLog);
-    if (FSE_isError(errorCode)) return errorCode;
-    op += FSE_compress_usingCTable(op, oend - op, ip, srcSize, &CTable);
-    /* check compressibility */
-    if ( (size_t)(op-ostart) >= srcSize-1 )
-        return 0;
-    return op-ostart;
-size_t FSE_compress (void* dst, size_t dstSize, const void* src, size_t srcSize)
-    return FSE_compress2(dst, dstSize, src, (U32)srcSize, FSE_MAX_SYMBOL_VALUE, FSE_DEFAULT_TABLELOG);
-*  Decompression (Byte symbols)
-typedef struct
-    U16  newState;
-    BYTE symbol;
-    BYTE nbBits;
-} FSE_decode_t;   /* size == U32 */
-/* Specific corner case : RLE compression */
-size_t FSE_decompressRLE(void* dst, size_t originalSize,
-                   const void* cSrc, size_t cSrcSize)
-    if (cSrcSize != 1) return (size_t)-FSE_ERROR_srcSize_wrong;
-    memset(dst, *(BYTE*)cSrc, originalSize);
-    return originalSize;
-size_t FSE_buildDTable_rle (void* DTable, BYTE symbolValue)
-    U32* const base32 = (U32*)DTable;
-    FSE_decode_t* const cell = (FSE_decode_t*)(base32 + 1);
-    /* Sanity check */
-    if (((size_t)DTable) & 3) return (size_t)-FSE_ERROR_GENERIC;   /* Must be allocated of 4 bytes boundaries */
-    base32[0] = 0;
-    cell->newState = 0;
-    cell->symbol = symbolValue;
-    cell->nbBits = 0;
-    return 0;
-size_t FSE_buildDTable_raw (void* DTable, unsigned nbBits)
-    U32* const base32 = (U32*)DTable;
-    FSE_decode_t* dinfo = (FSE_decode_t*)(base32 + 1);
-    const unsigned tableSize = 1 << nbBits;
-    const unsigned tableMask = tableSize - 1;
-    const unsigned maxSymbolValue = tableMask;
-    unsigned s;
-    /* Sanity checks */
-    if (nbBits < 1) return (size_t)-FSE_ERROR_GENERIC;             /* min size */
-    if (((size_t)DTable) & 3) return (size_t)-FSE_ERROR_GENERIC;   /* Must be allocated of 4 bytes boundaries */
-    /* Build Decoding Table */
-    base32[0] = nbBits;
-    for (s=0; s<=maxSymbolValue; s++)
-    {
-        dinfo[s].newState = 0;
-        dinfo[s].symbol = (BYTE)s;
-        dinfo[s].nbBits = (BYTE)nbBits;
-    }
-    return 0;
-/* FSE_initDStream
- * Initialize a FSE_DStream_t.
- * srcBuffer must point at the beginning of an FSE block.
- * The function result is the size of the FSE_block (== srcSize).
- * If srcSize is too small, the function will return an errorCode;
- */
-size_t FSE_initDStream(FSE_DStream_t* bitD, const void* srcBuffer, size_t srcSize)
-    if (srcSize < 1) return (size_t)-FSE_ERROR_srcSize_wrong;
-    if (srcSize >=  sizeof(bitD_t))
-    {
-        U32 contain32;
-        bitD->start = (char*)srcBuffer;
-        bitD->ptr   = (char*)srcBuffer + srcSize - sizeof(bitD_t);
-        bitD->bitContainer = FSE_readLEST(bitD->ptr);
-        contain32 = ((BYTE*)srcBuffer)[srcSize-1];
-        if (contain32 == 0) return (size_t)-FSE_ERROR_GENERIC;   /* stop bit not present */
-        bitD->bitsConsumed = 8 - FSE_highbit32(contain32);
-    }
-    else
-    {
-        U32 contain32;
-        bitD->start = (char*)srcBuffer;
-        bitD->ptr   = bitD->start;
-        bitD->bitContainer = *(BYTE*)(bitD->start);
-        switch(srcSize)
-        {
-            case 7: bitD->bitContainer += (bitD_t)(((BYTE*)(bitD->start))[6]) << (sizeof(bitD_t)*8 - 16);
-            case 6: bitD->bitContainer += (bitD_t)(((BYTE*)(bitD->start))[5]) << (sizeof(bitD_t)*8 - 24);
-            case 5: bitD->bitContainer += (bitD_t)(((BYTE*)(bitD->start))[4]) << (sizeof(bitD_t)*8 - 32);
-            case 4: bitD->bitContainer += (bitD_t)(((BYTE*)(bitD->start))[3]) << 24;
-            case 3: bitD->bitContainer += (bitD_t)(((BYTE*)(bitD->start))[2]) << 16;
-            case 2: bitD->bitContainer += (bitD_t)(((BYTE*)(bitD->start))[1]) <<  8;
-            default:;
-        }
-        contain32 = ((BYTE*)srcBuffer)[srcSize-1];
-        if (contain32 == 0) return (size_t)-FSE_ERROR_GENERIC;   /* stop bit not present */
-        bitD->bitsConsumed = 8 - FSE_highbit32(contain32);
-        bitD->bitsConsumed += (U32)(sizeof(bitD_t) - srcSize)*8;
-    }
-    return srcSize;
-/* FSE_readBits
- * Read next n bits from the bitContainer.
- * Use the fast variant *only* if n > 0.
- * Note : for this function to work properly on 32-bits, don't read more than maxNbBits==25
- * return : value extracted.
- */
-bitD_t FSE_readBits(FSE_DStream_t* bitD, U32 nbBits)
-    bitD_t value = ((bitD->bitContainer << bitD->bitsConsumed) >> 1) >> (((sizeof(bitD_t)*8)-1)-nbBits);
-    bitD->bitsConsumed += nbBits;
-    return value;
-bitD_t FSE_readBitsFast(FSE_DStream_t* bitD, U32 nbBits)   /* only if nbBits >= 1 */
-    bitD_t value = (bitD->bitContainer << bitD->bitsConsumed) >> ((sizeof(bitD_t)*8)-nbBits);
-    bitD->bitsConsumed += nbBits;
-    return value;
-unsigned FSE_reloadDStream(FSE_DStream_t* bitD)
-    if (bitD->ptr >= bitD->start + sizeof(bitD_t))
-    {
-        bitD->ptr -= bitD->bitsConsumed >> 3;
-        bitD->bitsConsumed &= 7;
-        bitD->bitContainer = FSE_readLEST(bitD->ptr);
-        return 0;
-    }
-    if (bitD->ptr == bitD->start)
-    {
-        if (bitD->bitsConsumed < sizeof(bitD_t)*8) return 1;
-        if (bitD->bitsConsumed == sizeof(bitD_t)*8) return 2;
-        return 3;
-    }
-    {
-        U32 nbBytes = bitD->bitsConsumed >> 3;
-        if (bitD->ptr - nbBytes < bitD->start)
-            nbBytes = (U32)(bitD->ptr - bitD->start);  /* note : necessarily ptr > start */
-        bitD->ptr -= nbBytes;
-        bitD->bitsConsumed -= nbBytes*8;
-        bitD->bitContainer = FSE_readLEST(bitD->ptr);   /* note : necessarily srcSize > sizeof(bitD) */
-        return (bitD->ptr == bitD->start);
-    }
-void FSE_initDState(FSE_DState_t* DStatePtr, FSE_DStream_t* bitD, const void* DTable)
-    const U32* const base32 = (const U32*)DTable;
-    DStatePtr->state = FSE_readBits(bitD, base32[0]);
-    FSE_reloadDStream(bitD);
-    DStatePtr->table = base32 + 1;
-BYTE FSE_decodeSymbol(FSE_DState_t* DStatePtr, FSE_DStream_t* bitD)
-    const FSE_decode_t DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state];
-    const U32  nbBits = DInfo.nbBits;
-    BYTE symbol = DInfo.symbol;
-    bitD_t lowBits = FSE_readBits(bitD, nbBits);
-    DStatePtr->state = DInfo.newState + lowBits;
-    return symbol;
-BYTE FSE_decodeSymbolFast(FSE_DState_t* DStatePtr, FSE_DStream_t* bitD)
-    const FSE_decode_t DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state];
-    const U32 nbBits = DInfo.nbBits;
-    BYTE symbol = DInfo.symbol;
-    bitD_t lowBits = FSE_readBitsFast(bitD, nbBits);
-    DStatePtr->state = DInfo.newState + lowBits;
-    return symbol;
-/* FSE_endOfDStream
-   Tells if bitD has reached end of bitStream or not */
-unsigned FSE_endOfDStream(const FSE_DStream_t* bitD)
-    return FSE_reloadDStream((FSE_DStream_t*)bitD)==2;
-unsigned FSE_endOfDState(const FSE_DState_t* statePtr)
-    return statePtr->state == 0;
-FORCE_INLINE size_t FSE_decompress_usingDTable_generic(
-          void* dst, size_t maxDstSize,
-    const void* cSrc, size_t cSrcSize,
-    const void* DTable, unsigned fast)
-    BYTE* const ostart = (BYTE*) dst;
-    BYTE* op = ostart;
-    BYTE* const omax = op + maxDstSize;
-    BYTE* const olimit = omax-3;
-    FSE_DStream_t bitD;
-    FSE_DState_t state1, state2;
-    size_t errorCode;
-    /* Init */
-    errorCode = FSE_initDStream(&bitD, cSrc, cSrcSize);   /* replaced last arg by maxCompressed Size */
-    if (FSE_isError(errorCode)) return errorCode;
-    FSE_initDState(&state1, &bitD, DTable);
-    FSE_initDState(&state2, &bitD, DTable);
-    /* 2 symbols per loop */
-    while (!FSE_reloadDStream(&bitD) && (op<olimit))
-    {
-        *op++ = fast ? FSE_decodeSymbolFast(&state1, &bitD) : FSE_decodeSymbol(&state1, &bitD);
-        if (FSE_MAX_TABLELOG*2+7 > sizeof(bitD_t)*8)    /* This test must be static */
-            FSE_reloadDStream(&bitD);
-        *op++ = fast ? FSE_decodeSymbolFast(&state2, &bitD) : FSE_decodeSymbol(&state2, &bitD);
-        if (FSE_MAX_TABLELOG*4+7 < sizeof(bitD_t)*8)    /* This test must be static */
-        {
-            *op++ = fast ? FSE_decodeSymbolFast(&state1, &bitD) : FSE_decodeSymbol(&state1, &bitD);
-            *op++ = fast ? FSE_decodeSymbolFast(&state2, &bitD) : FSE_decodeSymbol(&state2, &bitD);
-        }
-    }
-    /* tail */
-    while (1)
-    {
-        if ( (FSE_reloadDStream(&bitD)>2) || (op==omax) || (FSE_endOfDState(&state1) && FSE_endOfDStream(&bitD)) )
-            break;
-        *op++ = fast ? FSE_decodeSymbolFast(&state1, &bitD) : FSE_decodeSymbol(&state1, &bitD);
-        if ( (FSE_reloadDStream(&bitD)>2) || (op==omax) || (FSE_endOfDState(&state2) && FSE_endOfDStream(&bitD)) )
-            break;
-        *op++ = fast ? FSE_decodeSymbolFast(&state2, &bitD) : FSE_decodeSymbol(&state2, &bitD);
-    }
-    /* end ? */
-    if (FSE_endOfDStream(&bitD) && FSE_endOfDState(&state1) && FSE_endOfDState(&state2) )
-        return op-ostart;
-    if (op==omax) return (size_t)-FSE_ERROR_dstSize_tooSmall;   /* dst buffer is full, but cSrc unfinished */
-    return (size_t)-FSE_ERROR_corruptionDetected;
-size_t FSE_decompress_usingDTable(void* dst, size_t originalSize,
-                            const void* cSrc, size_t cSrcSize,
-                            const void* DTable, size_t fastMode)
-    /* select fast mode (static) */
-    if (fastMode) return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, DTable, 1);
-    return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, DTable, 0);
-size_t FSE_decompress(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize)
-    const BYTE* const istart = (const BYTE*)cSrc;
-    const BYTE* ip = istart;
-    short counting[FSE_MAX_SYMBOL_VALUE+1];
-    unsigned maxSymbolValue = FSE_MAX_SYMBOL_VALUE;
-    unsigned tableLog;
-    size_t errorCode, fastMode;
-    if (cSrcSize<2) return (size_t)-FSE_ERROR_srcSize_wrong;   /* too small input size */
-    /* normal FSE decoding mode */
-    errorCode = FSE_readHeader (counting, &maxSymbolValue, &tableLog, istart, cSrcSize);
-    if (FSE_isError(errorCode)) return errorCode;
-    if (errorCode >= cSrcSize) return (size_t)-FSE_ERROR_srcSize_wrong;   /* too small input size */
-    ip += errorCode;
-    cSrcSize -= errorCode;
-    fastMode = FSE_buildDTable (DTable, counting, maxSymbolValue, tableLog);
-    if (FSE_isError(fastMode)) return fastMode;
-    /* always return, even if it is an error code */
-    return FSE_decompress_usingDTable (dst, maxDstSize, ip, cSrcSize, DTable, fastMode);
-#endif   /* FSE_COMMONDEFS_ONLY */
-  2nd part of the file
-  designed to be included
-  for type-specific functions (template equivalent in C)
-  Objective is to write such functions only once, for better maintenance
-/* safety checks */
-#  error "FSE_FUNCTION_EXTENSION must be defined"
-#  error "FSE_FUNCTION_TYPE must be defined"
-/* Function names */
-#define FSE_CAT(X,Y) X##Y
-/* Function templates */
-size_t FSE_FUNCTION_NAME(FSE_count_generic, FSE_FUNCTION_EXTENSION) (unsigned* count, const FSE_FUNCTION_TYPE* source, size_t sourceSize, unsigned* maxSymbolValuePtr, unsigned safe)
-    const FSE_FUNCTION_TYPE* ip = source;
-    const FSE_FUNCTION_TYPE* const iend = ip+sourceSize;
-    unsigned maxSymbolValue = *maxSymbolValuePtr;
-    unsigned max=0;
-    int s;
-    U32 Counting1[FSE_MAX_SYMBOL_VALUE+1] = { 0 };
-    U32 Counting2[FSE_MAX_SYMBOL_VALUE+1] = { 0 };
-    U32 Counting3[FSE_MAX_SYMBOL_VALUE+1] = { 0 };
-    U32 Counting4[FSE_MAX_SYMBOL_VALUE+1] = { 0 };
-    /* safety checks */
-    if (!sourceSize)
-    {
-        memset(count, 0, (maxSymbolValue + 1) * sizeof(FSE_FUNCTION_TYPE));
-        *maxSymbolValuePtr = 0;
-        return 0;
-    }
-    if (maxSymbolValue > FSE_MAX_SYMBOL_VALUE) return (size_t)-FSE_ERROR_GENERIC;   /* maxSymbolValue too large : unsupported */
-    if (!maxSymbolValue) maxSymbolValue = FSE_MAX_SYMBOL_VALUE;            /* 0 == default */
-    if ((safe) || (sizeof(FSE_FUNCTION_TYPE)>1))
-    {
-        /* check input values, to avoid count table overflow */
-        while (ip < iend-3)
-        {
-            if (*ip>maxSymbolValue) return (size_t)-FSE_ERROR_GENERIC; Counting1[*ip++]++;
-            if (*ip>maxSymbolValue) return (size_t)-FSE_ERROR_GENERIC; Counting2[*ip++]++;
-            if (*ip>maxSymbolValue) return (size_t)-FSE_ERROR_GENERIC; Counting3[*ip++]++;
-            if (*ip>maxSymbolValue) return (size_t)-FSE_ERROR_GENERIC; Counting4[*ip++]++;
-        }
-    }
-    else
-    {
-        U32 cached = FSE_read32(ip); ip += 4;
-        while (ip < iend-15)
-        {
-            U32 c = cached; cached = FSE_read32(ip); ip += 4;
-            Counting1[(BYTE) c     ]++;
-            Counting2[(BYTE)(c>>8) ]++;
-            Counting3[(BYTE)(c>>16)]++;
-            Counting4[       c>>24 ]++;
-            c = cached; cached = FSE_read32(ip); ip += 4;
-            Counting1[(BYTE) c     ]++;
-            Counting2[(BYTE)(c>>8) ]++;
-            Counting3[(BYTE)(c>>16)]++;
-            Counting4[       c>>24 ]++;
-            c = cached; cached = FSE_read32(ip); ip += 4;
-            Counting1[(BYTE) c     ]++;
-            Counting2[(BYTE)(c>>8) ]++;
-            Counting3[(BYTE)(c>>16)]++;
-            Counting4[       c>>24 ]++;
-            c = cached; cached = FSE_read32(ip); ip += 4;
-            Counting1[(BYTE) c     ]++;
-            Counting2[(BYTE)(c>>8) ]++;
-            Counting3[(BYTE)(c>>16)]++;
-            Counting4[       c>>24 ]++;
-        }
-        ip-=4;
-    }
-    /* finish last symbols */
-    while (ip<iend) { if ((safe) && (*ip>maxSymbolValue)) return (size_t)-FSE_ERROR_GENERIC; Counting1[*ip++]++; }
-    for (s=0; s<=(int)maxSymbolValue; s++)
-    {
-        count[s] = Counting1[s] + Counting2[s] + Counting3[s] + Counting4[s];
-        if (count[s] > max) max = count[s];
-    }
-    while (!count[maxSymbolValue]) maxSymbolValue--;
-    *maxSymbolValuePtr = maxSymbolValue;
-    return (int)max;
-/* hidden fast variant (unsafe) */
-size_t FSE_FUNCTION_NAME(FSE_countFast, FSE_FUNCTION_EXTENSION) (unsigned* count, const FSE_FUNCTION_TYPE* source, size_t sourceSize, unsigned* maxSymbolValuePtr)
-    return FSE_FUNCTION_NAME(FSE_count_generic, FSE_FUNCTION_EXTENSION) (count, source, sourceSize, maxSymbolValuePtr, 0);
-size_t FSE_FUNCTION_NAME(FSE_count, FSE_FUNCTION_EXTENSION) (unsigned* count, const FSE_FUNCTION_TYPE* source, size_t sourceSize, unsigned* maxSymbolValuePtr)
-    if ((sizeof(FSE_FUNCTION_TYPE)==1) && (*maxSymbolValuePtr >= 255))
-    {
-        *maxSymbolValuePtr = 255;
-        return FSE_FUNCTION_NAME(FSE_count_generic, FSE_FUNCTION_EXTENSION) (count, source, sourceSize, maxSymbolValuePtr, 0);
-    }
-    return FSE_FUNCTION_NAME(FSE_count_generic, FSE_FUNCTION_EXTENSION) (count, source, sourceSize, maxSymbolValuePtr, 1);
-static U32 FSE_tableStep(U32 tableSize) { return (tableSize>>1) + (tableSize>>3) + 3; }
-(void* CTable, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
-    const unsigned tableSize = 1 << tableLog;
-    const unsigned tableMask = tableSize - 1;
-    U16* tableU16 = ( (U16*) CTable) + 2;
-    FSE_symbolCompressionTransform* symbolTT = (FSE_symbolCompressionTransform*) (((U32*)CTable) + 1 + (tableLog ? tableSize>>1 : 1) );
-    const unsigned step = FSE_tableStep(tableSize);
-    unsigned cumul[FSE_MAX_SYMBOL_VALUE+2];
-    U32 position = 0;
-    U32 highThreshold = tableSize-1;
-    unsigned symbol;
-    unsigned i;
-    /* safety checks */
-    if (((size_t)CTable) & 3) return (size_t)-FSE_ERROR_GENERIC;   /* Must be allocated of 4 bytes boundaries */
-    /* header */
-    tableU16[-2] = (U16) tableLog;
-    tableU16[-1] = (U16) maxSymbolValue;
-    /* For explanations on how to distribute symbol values over the table :
-    *  http://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */
-    /* symbol start positions */
-    cumul[0] = 0;
-    for (i=1; i<=maxSymbolValue+1; i++)
-    {
-        if (normalizedCounter[i-1]==-1)   /* Low prob symbol */
-        {
-            cumul[i] = cumul[i-1] + 1;
-            tableSymbol[highThreshold--] = (FSE_FUNCTION_TYPE)(i-1);
-        }
-        else
-            cumul[i] = cumul[i-1] + normalizedCounter[i-1];
-    }
-    cumul[maxSymbolValue+1] = tableSize+1;
-    /* Spread symbols */
-    for (symbol=0; symbol<=maxSymbolValue; symbol++)
-    {
-        int nbOccurences;
-        for (nbOccurences=0; nbOccurences<normalizedCounter[symbol]; nbOccurences++)
-        {
-            tableSymbol[position] = (FSE_FUNCTION_TYPE)symbol;
-            position = (position + step) & tableMask;
-            while (position > highThreshold) position = (position + step) & tableMask;   /* Lowprob area */
-        }
-    }
-    if (position!=0) return (size_t)-FSE_ERROR_GENERIC;   /* Must have gone through all positions */
-    /* Build table */
-    for (i=0; i<tableSize; i++)
-    {
-        FSE_FUNCTION_TYPE s = tableSymbol[i];
-        tableU16[cumul[s]++] = (U16) (tableSize+i);   // Table U16 : sorted by symbol order; gives next state value
-    }
-    // Build Symbol Transformation Table
-    {
-        unsigned s;
-        unsigned total = 0;
-        for (s=0; s<=maxSymbolValue; s++)
-        {
-            switch (normalizedCounter[s])
-            {
-            case 0:
-                break;
-            case -1:
-            case 1:
-                symbolTT[s].minBitsOut = (BYTE)tableLog;
-                symbolTT[s].deltaFindState = total - 1;
-                total ++;
-                symbolTT[s].maxState = (U16)( (tableSize*2) - 1);   /* ensures state <= maxState */
-                break;
-            default :
-                symbolTT[s].minBitsOut = (BYTE)( (tableLog-1) - FSE_highbit32 (normalizedCounter[s]-1) );
-                symbolTT[s].deltaFindState = total - normalizedCounter[s];
-                total +=  normalizedCounter[s];
-                symbolTT[s].maxState = (U16)( (normalizedCounter[s] << (symbolTT[s].minBitsOut+1)) - 1);
-            }
-        }
-    }
-    return 0;
-void* FSE_FUNCTION_NAME(FSE_createDTable, FSE_FUNCTION_EXTENSION) (unsigned tableLog)
-    return malloc( ((size_t)1<<tableLog) * sizeof (FSE_DECODE_TYPE) );
-    free(DTable);
-(void* DTable, const short* const normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
-    U32* const base32 = (U32*)DTable;
-    FSE_DECODE_TYPE* const tableDecode = (FSE_DECODE_TYPE*) (base32+1);
-    const U32 tableSize = 1 << tableLog;
-    const U32 tableMask = tableSize-1;
-    const U32 step = FSE_tableStep(tableSize);
-    U16 symbolNext[FSE_MAX_SYMBOL_VALUE+1];
-    U32 position = 0;
-    U32 highThreshold = tableSize-1;
-    const S16 largeLimit= 1 << (tableLog-1);
-    U32 noLarge = 1;
-    U32 s;
-    /* Sanity Checks */
-    if (maxSymbolValue > FSE_MAX_SYMBOL_VALUE) return (size_t)-FSE_ERROR_maxSymbolValue_tooLarge;
-    if (tableLog > FSE_MAX_TABLELOG) return (size_t)-FSE_ERROR_tableLog_tooLarge;
-    /* Init, lay down lowprob symbols */
-    base32[0] = tableLog;
-    for (s=0; s<=maxSymbolValue; s++)
-    {
-        if (normalizedCounter[s]==-1)
-        {
-            tableDecode[highThreshold--].symbol = (FSE_FUNCTION_TYPE)s;
-            symbolNext[s] = 1;
-        }
-        else
-        {
-            if (normalizedCounter[s] >= largeLimit) noLarge=0;
-            symbolNext[s] = normalizedCounter[s];
-        }
-    }
-    /* Spread symbols */
-    for (s=0; s<=maxSymbolValue; s++)
-    {
-        int i;
-        for (i=0; i<normalizedCounter[s]; i++)
-        {
-            tableDecode[position].symbol = (FSE_FUNCTION_TYPE)s;
-            position = (position + step) & tableMask;
-            while (position > highThreshold) position = (position + step) & tableMask;   /* lowprob area */
-        }
-    }
-    if (position!=0) return (size_t)-FSE_ERROR_GENERIC;   /* position must reach all cells once, otherwise normalizedCounter is incorrect */
-    /* Build Decoding table */
-    {
-        U32 i;
-        for (i=0; i<tableSize; i++)
-        {
-            FSE_FUNCTION_TYPE symbol = tableDecode[i].symbol;
-            U16 nextState = symbolNext[symbol]++;
-            tableDecode[i].nbBits = (BYTE) (tableLog - FSE_highbit32 ((U32)nextState) );
-            tableDecode[i].newState = (U16) ( (nextState << tableDecode[i].nbBits) - tableSize);
-        }
-    }
-    return noLarge;
-/* <<<<< fse.c EOF */
-/* >>>>> zstd.c */
-*  Tuning parameters
-*  Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.)
-*  Increasing memory usage improves compression ratio
-*  Reduced memory usage can improve speed, due to cache effect */
-   CPU Feature Detection
- * Automated efficient unaligned memory access detection
- * Based on known hardware architectures
- * This list will be updated thanks to feedbacks
- */
-    || defined(__ARM_FEATURE_UNALIGNED) \
-    || defined(__i386__) || defined(__x86_64__) \
-    || defined(_M_IX86) || defined(_M_X64) \
-    || defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_8__) \
-    || (defined(_M_ARM) && (_M_ARM >= 7))
-*  Basic Types
-#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* C99 */
-typedef  uint8_t BYTE;
-typedef uint16_t U16;
-typedef  int16_t S16;
-typedef uint32_t U32;
-typedef  int32_t S32;
-typedef uint64_t U64;
-typedef unsigned char       BYTE;
-typedef unsigned short      U16;
-typedef   signed short      S16;
-typedef unsigned int        U32;
-typedef   signed int        S32;
-typedef unsigned long long  U64;
-#endif   /* MEM_ACCESS_MODULE */
-*  Constants
-static const U32 ZSTD_magicNumber = 0xFD2FB51C;   /* Initial (limited) frame format */
-#define KNUTH 2654435761
-#define BIT7 128
-#define BIT6  64
-#define BIT5  32
-#define BIT4  16
-#ifndef KB
-#define KB *(1 <<10)
-#ifndef MB
-#define MB *(1 <<20)
-#ifndef GB
-#define GB *(1U<<30)
-#define BLOCKSIZE (128 KB)                 /* define, for static allocation */
-static const U32 g_maxDistance = 4 * BLOCKSIZE;
-static const U32 g_maxLimit = 1 GB;
-static const U32 g_searchStrength = 8;
-#define MINMATCH 4
-#define MLbits   7
-#define LLbits   6
-#define Offbits  5
-#define MaxML  ((1<<MLbits )-1)
-#define MaxLL  ((1<<LLbits )-1)
-#define MaxOff ((1<<Offbits)-1)
-#define LitFSELog  11
-#define MLFSELog   10
-#define LLFSELog   10
-#define OffFSELog   9
-#define COMMAND_NOENTROPY 7   /* to remove */
-static const size_t ZSTD_blockHeaderSize = 3;
-static const size_t ZSTD_frameHeaderSize = 4;
-*  Memory operations
-static unsigned ZSTD_32bits(void) { return sizeof(void*)==4; }
-static unsigned ZSTD_64bits(void) { return sizeof(void*)==8; }
-static unsigned ZSTD_isLittleEndian(void)
-    const union { U32 i; BYTE c[4]; } one = { 1 };   /* don't use static : performance detrimental  */
-    return one.c[0];
-static U16    ZSTD_read16(const void* p) { return *(U16*)p; }
-static U32    ZSTD_read32(const void* p) { return *(U32*)p; }
-static size_t ZSTD_read_ARCH(const void* p) { return *(size_t*)p; }
-static void   ZSTD_copy4(void* dst, const void* src) { memcpy(dst, src, 4); }
-static void   ZSTD_copy8(void* dst, const void* src) { memcpy(dst, src, 8); }
-#define COPY8(d,s)    { ZSTD_copy8(d,s); d+=8; s+=8; }
-static void ZSTD_wildcopy(void* dst, const void* src, size_t length)
-    const BYTE* ip = (const BYTE*)src;
-    BYTE* op = (BYTE*)dst;
-    BYTE* const oend = op + length;
-    while (op < oend) COPY8(op, ip);
-static U32 ZSTD_readLE32(const void* memPtr)
-    if (ZSTD_isLittleEndian())
-        return ZSTD_read32(memPtr);
-    else
-    {
-        const BYTE* p = (const BYTE*)memPtr;
-        return (U32)((U32)p[0] + ((U32)p[1]<<8) + ((U32)p[2]<<16) + ((U32)p[3]<<24));
-    }
-static void ZSTD_writeLE32(void* memPtr, U32 val32)
-    if (ZSTD_isLittleEndian())
-    {
-        memcpy(memPtr, &val32, 4);
-    }
-    else
-    {
-        BYTE* p = (BYTE*)memPtr;
-        p[0] = (BYTE)val32;
-        p[1] = (BYTE)(val32>>8);
-        p[2] = (BYTE)(val32>>16);
-        p[3] = (BYTE)(val32>>24);
-    }
-static U32 ZSTD_readBE32(const void* memPtr)
-    const BYTE* p = (const BYTE*)memPtr;
-    return (U32)(((U32)p[0]<<24) + ((U32)p[1]<<16) + ((U32)p[2]<<8) + ((U32)p[3]<<0));
-static void ZSTD_writeBE32(void* memPtr, U32 value)
-    BYTE* const p = (BYTE* const) memPtr;
-    p[0] = (BYTE)(value>>24);
-    p[1] = (BYTE)(value>>16);
-    p[2] = (BYTE)(value>>8);
-    p[3] = (BYTE)(value>>0);
-static size_t ZSTD_writeProgressive(void* ptr, size_t value)
-    BYTE* const bStart = (BYTE* const)ptr;
-    BYTE* byte = bStart;
-    do
-    {
-        BYTE l = value & 127;
-        value >>= 7;
-        if (value) l += 128;
-        *byte++ = l;
-    } while (value);
-    return byte - bStart;
-static size_t ZSTD_readProgressive(size_t* result, const void* ptr)
-    const BYTE* const bStart = (const BYTE* const)ptr;
-    const BYTE* byte = bStart;
-    size_t r = 0;
-    U32 shift = 0;
-    do
-    {
-        r += (*byte & 127) << shift;
-        shift += 7;
-    } while (*byte++ & 128);
-    *result = r;
-    return byte - bStart;
-*  Local structures
-typedef enum { bt_compressed, bt_raw, bt_rle, bt_end } blockType_t;
-typedef struct
-    blockType_t blockType;
-    U32 origSize;
-} blockProperties_t;
-typedef struct {
-    void* buffer;
-    U32*  offsetStart;
-    U32*  offset;
-    BYTE* litStart;
-    BYTE* lit;
-    BYTE* litLengthStart;
-    BYTE* litLength;
-    BYTE* matchLengthStart;
-    BYTE* matchLength;
-    BYTE* dumpsStart;
-    BYTE* dumps;
-} seqStore_t;
-void ZSTD_resetSeqStore(seqStore_t* ssPtr)
-    ssPtr->offset = ssPtr->offsetStart;
-    ssPtr->lit = ssPtr->litStart;
-    ssPtr->litLength = ssPtr->litLengthStart;
-    ssPtr->matchLength = ssPtr->matchLengthStart;
-    ssPtr->dumps = ssPtr->dumpsStart;
-typedef struct
-    const BYTE* base;
-    U32 current;
-    U32 nextUpdate;
-    seqStore_t seqStore;
-#ifdef __AVX2__
-    __m256i hashTable[HASH_TABLESIZE>>3];
-    U32 hashTable[HASH_TABLESIZE];
-} cctxi_t;
-ZSTD_cctx_t ZSTD_createCCtx(void)
-    cctxi_t* ctx = (cctxi_t*) malloc( sizeof(cctxi_t) );
-    ctx->seqStore.buffer = malloc(WORKPLACESIZE);
-    ctx->seqStore.offsetStart = (U32*) (ctx->seqStore.buffer);
-    ctx->seqStore.litStart = (BYTE*) (ctx->seqStore.offsetStart + (BLOCKSIZE>>2));
-    ctx->seqStore.litLengthStart =  ctx->seqStore.litStart + BLOCKSIZE;
-    ctx->seqStore.matchLengthStart = ctx->seqStore.litLengthStart + (BLOCKSIZE>>2);
-    ctx->seqStore.dumpsStart = ctx->seqStore.matchLengthStart + (BLOCKSIZE>>2);
-    return (ZSTD_cctx_t)ctx;
-void ZSTD_resetCCtx(ZSTD_cctx_t cctx)
-    cctxi_t* ctx = (cctxi_t*)cctx;
-    ctx->base = NULL;
-    memset(ctx->hashTable, 0, HASH_TABLESIZE*4);
-size_t ZSTD_freeCCtx(ZSTD_cctx_t cctx)
-    cctxi_t* ctx = (cctxi_t*) (cctx);
-    free(ctx->seqStore.buffer);
-    free(ctx);
-    return 0;
-*  Error Management
-/* tells if a return value is an error code */
-unsigned ZSTD_isError(size_t code)
-    return (code > (size_t)(-ZSTD_ERROR_maxCode));
-static const char* ZSTD_errorStrings[] = { ZSTD_LIST_ERRORS(ZSTD_GENERATE_STRING) };
-/* provides error code string (useful for debugging) */
-const char* ZSTD_getErrorName(size_t code)
-    static const char* codeError = "Unspecified error code";
-    if (ZSTD_isError(code)) return ZSTD_errorStrings[-(int)(code)];
-    return codeError;
-*  Tool functions
-unsigned ZSTD_versionNumber (void) { return ZSTD_VERSION_NUMBER; }
-static unsigned ZSTD_highbit(U32 val)
-#   if defined(_MSC_VER)   /* Visual */
-    unsigned long r;
-    _BitScanReverse(&r, val);
-    return (unsigned)r;
-#   elif defined(__GNUC__) && (GCC_VERSION >= 304)   /* GCC Intrinsic */
-    return 31 - __builtin_clz(val);
-#   else   /* Software version */
-    static const int DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 };
-    U32 v = val;
-    int r;
-    v |= v >> 1;
-    v |= v >> 2;
-    v |= v >> 4;
-    v |= v >> 8;
-    v |= v >> 16;
-    r = DeBruijnClz[(U32)(v * 0x07C4ACDDU) >> 27];
-    return r;
-#   endif
-static unsigned ZSTD_NbCommonBytes (register size_t val)
-    if (ZSTD_isLittleEndian())
-    {
-        if (ZSTD_64bits())
-        {
-#       if defined(_MSC_VER) && defined(_WIN64) && !defined(LZ4_FORCE_SW_BITCOUNT)
-            unsigned long r = 0;
-            _BitScanForward64( &r, (U64)val );
-            return (int)(r>>3);
-#       elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT)
-            return (__builtin_ctzll((U64)val) >> 3);
-#       else
-            static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 };
-            return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58];
-#       endif
-        }
-        else /* 32 bits */
-        {
-#       if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT)
-            unsigned long r;
-            _BitScanForward( &r, (U32)val );
-            return (int)(r>>3);
-#       elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT)
-            return (__builtin_ctz((U32)val) >> 3);
-#       else
-            static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 };
-            return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27];
-#       endif
-        }
-    }
-    else   /* Big Endian CPU */
-    {
-        if (ZSTD_64bits())
-        {
-#       if defined(_MSC_VER) && defined(_WIN64) && !defined(LZ4_FORCE_SW_BITCOUNT)
-            unsigned long r = 0;
-            _BitScanReverse64( &r, val );
-            return (unsigned)(r>>3);
-#       elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT)
-            return (__builtin_clzll(val) >> 3);
-#       else
-            unsigned r;
-            const unsigned n32 = sizeof(size_t)*4;   /* calculate this way due to compiler complaining in 32-bits mode */
-            if (!(val>>n32)) { r=4; } else { r=0; val>>=n32; }
-            if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; }
-            r += (!val);
-            return r;
-#       endif
-        }
-        else /* 32 bits */
-        {
-#       if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT)
-            unsigned long r = 0;
-            _BitScanReverse( &r, (unsigned long)val );
-            return (unsigned)(r>>3);
-#       elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT)
-            return (__builtin_clz(val) >> 3);
-#       else
-            unsigned r;
-            if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; }
-            r += (!val);
-            return r;
-#       endif
-        }
-    }
-static unsigned ZSTD_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* pInLimit)
-    const BYTE* const pStart = pIn;
-    while ((pIn<pInLimit-(sizeof(size_t)-1)))
-    {
-        size_t diff = ZSTD_read_ARCH(pMatch) ^ ZSTD_read_ARCH(pIn);
-        if (!diff) { pIn+=sizeof(size_t); pMatch+=sizeof(size_t); continue; }
-        pIn += ZSTD_NbCommonBytes(diff);
-        return (unsigned)(pIn - pStart);
-    }
-    if (ZSTD_64bits()) if ((pIn<(pInLimit-3)) && (ZSTD_read32(pMatch) == ZSTD_read32(pIn))) { pIn+=4; pMatch+=4; }
-    if ((pIn<(pInLimit-1)) && (ZSTD_read16(pMatch) == ZSTD_read16(pIn))) { pIn+=2; pMatch+=2; }
-    if ((pIn<pInLimit) && (*pMatch == *pIn)) pIn++;
-    return (unsigned)(pIn - pStart);
-*  Compression
-size_t ZSTD_compressBound(size_t srcSize)   /* maximum compressed size */
-    return FSE_compressBound(srcSize) + 12;
-static size_t ZSTD_compressRle (void* dst, size_t maxDstSize, const void* src, size_t srcSize)
-    BYTE* const ostart = (BYTE* const)dst;
-    /* at this stage : dstSize >= FSE_compressBound(srcSize) > (ZSTD_blockHeaderSize+1) (checked by ZSTD_compressLiterals()) */
-    (void)maxDstSize;
-    ostart[ZSTD_blockHeaderSize] = *(BYTE*)src;
-    /* Build header */
-    ostart[0]  = (BYTE)(srcSize>>16);
-    ostart[1]  = (BYTE)(srcSize>>8);
-    ostart[2]  = (BYTE)srcSize;
-    ostart[0] += (BYTE)(bt_rle<<6);
-    return ZSTD_blockHeaderSize+1;
-static size_t ZSTD_noCompressBlock (void* dst, size_t maxDstSize, const void* src, size_t srcSize)
-    BYTE* const ostart = (BYTE* const)dst;
-    if (srcSize + ZSTD_blockHeaderSize > maxDstSize) return (size_t)-ZSTD_ERROR_maxDstSize_tooSmall;
-    memcpy(ostart + ZSTD_blockHeaderSize, src, srcSize);
-    /* Build header */
-    ostart[0] = (BYTE)(srcSize>>16);
-    ostart[1] = (BYTE)(srcSize>>8);
-    ostart[2] = (BYTE)srcSize;
-    ostart[0] += (BYTE)(bt_raw<<6);   /* is a raw (uncompressed) block */
-    return ZSTD_blockHeaderSize+srcSize;
-/* return : size of CStream in bits */
-static size_t ZSTD_compressLiterals_usingCTable(void* dst, size_t dstSize,
-                                          const void* src, size_t srcSize,
-                                          const void* CTable)
-    const BYTE* const istart = (const BYTE*)src;
-    const BYTE* ip = istart;
-    const BYTE* const iend = istart + srcSize;
-    FSE_CStream_t bitC;
-    FSE_CState_t CState1, CState2;
-    /* init */
-    (void)dstSize;   // objective : ensure it fits into dstBuffer (Todo)
-    FSE_initCStream(&bitC, dst);
-    FSE_initCState(&CState1, CTable);
-    CState2 = CState1;
-    /* Note : at this stage, srcSize > LITERALS_NOENTROPY (checked by ZSTD_compressLiterals()) */
-    // join to mod 2
-    if (srcSize & 1)
-    {
-        FSE_encodeByte(&bitC, &CState1, *ip++);
-        FSE_flushBits(&bitC);
-    }
-    // join to mod 4
-    if ((sizeof(size_t)*8 > LitFSELog*4+7 ) && (srcSize & 2))   // test bit 2
-    {
-        FSE_encodeByte(&bitC, &CState2, *ip++);
-        FSE_encodeByte(&bitC, &CState1, *ip++);
-        FSE_flushBits(&bitC);
-    }
-    // 2 or 4 encoding per loop
-    while (ip<iend)
-    {
-        FSE_encodeByte(&bitC, &CState2, *ip++);
-        if (sizeof(size_t)*8 < LitFSELog*2+7 )   // this test must be static
-            FSE_flushBits(&bitC);
-        FSE_encodeByte(&bitC, &CState1, *ip++);
-        if (sizeof(size_t)*8 > LitFSELog*4+7 )   // this test must be static
-        {
-            FSE_encodeByte(&bitC, &CState2, *ip++);
-            FSE_encodeByte(&bitC, &CState1, *ip++);
-        }
-        FSE_flushBits(&bitC);
-    }
-    FSE_flushCState(&bitC, &CState2);
-    FSE_flushCState(&bitC, &CState1);
-    return FSE_closeCStream(&bitC);
-size_t ZSTD_minGain(size_t srcSize)
-    return (srcSize >> 6) + 1;
-static size_t ZSTD_compressLiterals (void* dst, size_t dstSize,
-                                     const void* src, size_t srcSize)
-    const BYTE* const istart = (const BYTE*) src;
-    const BYTE* ip = istart;
-    BYTE* const ostart = (BYTE*) dst;
-    BYTE* op = ostart + ZSTD_blockHeaderSize;
-    BYTE* const oend = ostart + dstSize;
-    U32 maxSymbolValue = 256;
-    U32 tableLog = LitFSELog;
-    U32 count[256];
-    S16 norm[256];
-    U32 CTable[ FSE_CTABLE_SIZE_U32(LitFSELog, 256) ];
-    size_t errorCode;
-    const size_t minGain = ZSTD_minGain(srcSize);
-    /* early out */
-    if (dstSize < FSE_compressBound(srcSize)) return (size_t)-ZSTD_ERROR_maxDstSize_tooSmall;
-    /* Scan input and build symbol stats */
-    errorCode = FSE_count (count, ip, srcSize, &maxSymbolValue);
-    if (FSE_isError(errorCode)) return (size_t)-ZSTD_ERROR_GENERIC;
-    if (errorCode == srcSize) return 1;
-    //if (errorCode < ((srcSize * 7) >> 10)) return 0;
-    //if (errorCode < (srcSize >> 7)) return 0;
-    if (errorCode < (srcSize >> 6)) return 0;   /* heuristic : probably not compressible enough */
-    tableLog = FSE_optimalTableLog(tableLog, srcSize, maxSymbolValue);
-    errorCode = (int)FSE_normalizeCount (norm, tableLog, count, srcSize, maxSymbolValue);
-    if (FSE_isError(errorCode)) return (size_t)-ZSTD_ERROR_GENERIC;
-    /* Write table description header */
-    errorCode = FSE_writeHeader (op, FSE_MAX_HEADERSIZE, norm, maxSymbolValue, tableLog);
-    if (FSE_isError(errorCode)) return (size_t)-ZSTD_ERROR_GENERIC;
-    op += errorCode;
-    /* Compress */
-    errorCode = FSE_buildCTable (&CTable, norm, maxSymbolValue, tableLog);
-    if (FSE_isError(errorCode)) return (size_t)-ZSTD_ERROR_GENERIC;
-    errorCode = ZSTD_compressLiterals_usingCTable(op, oend - op, ip, srcSize, &CTable);
-    if (ZSTD_isError(errorCode)) return errorCode;
-    op += errorCode;
-    /* check compressibility */
-    if ( (size_t)(op-ostart) >= srcSize-minGain)
-        return 0;
-    /* Build header */
-    {
-        size_t totalSize;
-        totalSize  = op - ostart - ZSTD_blockHeaderSize;
-        ostart[0]  = (BYTE)(totalSize>>16);
-        ostart[1]  = (BYTE)(totalSize>>8);
-        ostart[2]  = (BYTE)totalSize;
-        ostart[0] += (BYTE)(bt_compressed<<6); /* is a block, is compressed */
-    }
-    return op-ostart;
-static size_t ZSTD_compressSequences(BYTE* dst, size_t maxDstSize,
-                                     const seqStore_t* seqStorePtr,
-                                     size_t lastLLSize, size_t srcSize)
-    FSE_CStream_t blockStream;
-    U32 count[256];
-    S16 norm[256];
-    size_t mostFrequent;
-    U32 max = 255;
-    U32 tableLog = 11;
-    U32 CTable_LitLength  [FSE_CTABLE_SIZE_U32(LLFSELog, MaxLL )];
-    U32 CTable_OffsetBits [FSE_CTABLE_SIZE_U32(OffFSELog,MaxOff)];
-    U32 CTable_MatchLength[FSE_CTABLE_SIZE_U32(MLFSELog, MaxML )];
-    U32 LLtype, Offtype, MLtype;
-    const BYTE* const op_lit_start = seqStorePtr->litStart;
-    const BYTE* op_lit = seqStorePtr->lit;
-    const BYTE* const op_litLength_start = seqStorePtr->litLengthStart;
-    const BYTE* op_litLength = seqStorePtr->litLength;
-    const U32*  op_offset = seqStorePtr->offset;
-    const BYTE* op_matchLength = seqStorePtr->matchLength;
-    const size_t nbSeq = op_litLength - op_litLength_start;
-    BYTE* op;
-    BYTE offsetBits_start[BLOCKSIZE / 4];
-    BYTE* offsetBitsPtr = offsetBits_start;
-    const size_t minGain = ZSTD_minGain(srcSize);
-    const size_t maxCSize = srcSize - minGain;
-    const size_t minSeqSize = 1 /*lastL*/ + 2 /*dHead*/ + 2 /*dumpsIn*/ + 5 /*SeqHead*/ + 3 /*SeqIn*/ + 1 /*margin*/ + ZSTD_blockHeaderSize;
-    const size_t maxLSize = maxCSize > minSeqSize ? maxCSize - minSeqSize : 0;
-    BYTE* seqHead;
-    /* init */
-    op = dst;
-    /* Encode literals */
-    {
-        size_t cSize;
-        size_t litSize = op_lit - op_lit_start;
-        if (litSize <= LITERAL_NOENTROPY) cSize = ZSTD_noCompressBlock (op, maxDstSize, op_lit_start, litSize);
-        else
-        {
-            cSize = ZSTD_compressLiterals(op, maxDstSize, op_lit_start, litSize);
-            if (cSize == 1) cSize = ZSTD_compressRle (op, maxDstSize, op_lit_start, litSize);
-            else if (cSize == 0)
-            {
-                if (litSize >= maxLSize) return 0;   /* block not compressible enough */
-                cSize = ZSTD_noCompressBlock (op, maxDstSize, op_lit_start, litSize);
-            }
-        }
-        if (ZSTD_isError(cSize)) return cSize;
-        op += cSize;
-    }
-    /* Encode Sequences */
-    /* seqHeader */
-    op += ZSTD_writeProgressive(op, lastLLSize);
-    seqHead = op;
-    /* dumps */
-    {
-        size_t dumpsLength = seqStorePtr->dumps - seqStorePtr->dumpsStart;
-        if (dumpsLength < 512)
-        {
-            op[0] = (BYTE)(dumpsLength >> 8);
-            op[1] = (BYTE)(dumpsLength);
-            op += 2;
-        }
-        else
-        {
-            op[0] = 2;
-            op[1] = (BYTE)(dumpsLength>>8);
-            op[2] = (BYTE)(dumpsLength);
-            op += 3;
-        }
-        memcpy(op, seqStorePtr->dumpsStart, dumpsLength);
-        op += dumpsLength;
-    }
-    /* Encoding table of Literal Lengths */
-    max = MaxLL;
-    mostFrequent = FSE_countFast(count, seqStorePtr->litLengthStart, nbSeq, &max);
-    if (mostFrequent == nbSeq)
-    {
-        *op++ = *(seqStorePtr->litLengthStart);
-        FSE_buildCTable_rle(CTable_LitLength, (BYTE)max);
-        LLtype = bt_rle;
-    }
-    else if ((nbSeq < 64) || (mostFrequent < (nbSeq >> (LLbits-1))))
-    {
-        FSE_buildCTable_raw(CTable_LitLength, LLbits);
-        LLtype = bt_raw;
-    }
-    else
-    {
-        tableLog = FSE_optimalTableLog(LLFSELog, nbSeq, max);
-        FSE_normalizeCount(norm, tableLog, count, nbSeq, max);
-        op += FSE_writeHeader(op, maxDstSize, norm, max, tableLog);
-        FSE_buildCTable(CTable_LitLength, norm, max, tableLog);
-        LLtype = bt_compressed;
-    }
-    /* Encoding table of Offsets */
-    {
-        /* create OffsetBits */
-        size_t i;
-        const U32* const op_offset_start = seqStorePtr->offsetStart;
-        max = MaxOff;
-        for (i=0; i<nbSeq; i++)
-        {
-            offsetBits_start[i] = (BYTE)ZSTD_highbit(op_offset_start[i]) + 1;
-            if (op_offset_start[i]==0) offsetBits_start[i]=0;
-        }
-        offsetBitsPtr += nbSeq;
-        mostFrequent = FSE_countFast(count, offsetBits_start, nbSeq, &max);
-    }
-    if (mostFrequent == nbSeq)
-    {
-        *op++ = *offsetBits_start;
-        FSE_buildCTable_rle(CTable_OffsetBits, (BYTE)max);
-        Offtype = bt_rle;
-    }
-    else if ((nbSeq < 64) || (mostFrequent < (nbSeq >> (Offbits-1))))
-    {
-        FSE_buildCTable_raw(CTable_OffsetBits, Offbits);
-        Offtype = bt_raw;
-    }
-    else
-    {
-        tableLog = FSE_optimalTableLog(OffFSELog, nbSeq, max);
-        FSE_normalizeCount(norm, tableLog, count, nbSeq, max);
-        op += FSE_writeHeader(op, maxDstSize, norm, max, tableLog);
-        FSE_buildCTable(CTable_OffsetBits, norm, max, tableLog);
-        Offtype = bt_compressed;
-    }
-    /* Encoding Table of MatchLengths */
-    max = MaxML;
-    mostFrequent = FSE_countFast(count, seqStorePtr->matchLengthStart, nbSeq, &max);
-    if (mostFrequent == nbSeq)
-    {
-        *op++ = *seqStorePtr->matchLengthStart;
-        FSE_buildCTable_rle(CTable_MatchLength, (BYTE)max);
-        MLtype = bt_rle;
-    }
-    else if ((nbSeq < 64) || (mostFrequent < (nbSeq >> (MLbits-1))))
-    {
-        FSE_buildCTable_raw(CTable_MatchLength, MLbits);
-        MLtype = bt_raw;
-    }
-    else
-    {
-        tableLog = FSE_optimalTableLog(MLFSELog, nbSeq, max);
-        FSE_normalizeCount(norm, tableLog, count, nbSeq, max);
-        op += FSE_writeHeader(op, maxDstSize, norm, max, tableLog);
-        FSE_buildCTable(CTable_MatchLength, norm, max, tableLog);
-        MLtype = bt_compressed;
-    }
-    seqHead[0] += (BYTE)((LLtype<<6) + (Offtype<<4) + (MLtype<<2));
-    /* Encoding */
-    {
-        FSE_CState_t stateMatchLength;
-        FSE_CState_t stateOffsetBits;
-        FSE_CState_t stateLitLength;
-        FSE_initCStream(&blockStream, op);
-        FSE_initCState(&stateMatchLength, CTable_MatchLength);
-        FSE_initCState(&stateOffsetBits, CTable_OffsetBits);
-        FSE_initCState(&stateLitLength, CTable_LitLength);
-        while (op_litLength > op_litLength_start)
-        {
-            BYTE matchLength = *(--op_matchLength);
-            U32  offset = *(--op_offset);
-            BYTE offCode = *(--offsetBitsPtr);                              /* 32b*/  /* 64b*/
-            U32 nbBits = (offCode-1) * (!!offCode);
-            BYTE litLength = *(--op_litLength);                             /* (7)*/  /* (7)*/
-            FSE_encodeByte(&blockStream, &stateMatchLength, matchLength);   /* 17 */  /* 17 */
-            if (ZSTD_32bits()) FSE_flushBits(&blockStream);                 /*  7 */
-            FSE_addBits(&blockStream, offset, nbBits);                      /* 32 */  /* 42 */
-            if (ZSTD_32bits()) FSE_flushBits(&blockStream);                 /*  7 */
-            FSE_encodeByte(&blockStream, &stateOffsetBits, offCode);        /* 16 */  /* 51 */
-            FSE_encodeByte(&blockStream, &stateLitLength, litLength);       /* 26 */  /* 61 */
-            FSE_flushBits(&blockStream);                                    /*  7 */  /*  7 */
-        }
-        FSE_flushCState(&blockStream, &stateMatchLength);
-        FSE_flushCState(&blockStream, &stateOffsetBits);
-        FSE_flushCState(&blockStream, &stateLitLength);
-    }
-    op += FSE_closeCStream(&blockStream);
-    /* check compressibility */
-    if ((size_t)(op-dst) >= maxCSize) return 0;
-    return op - dst;
-static void ZSTD_storeSeq(seqStore_t* seqStorePtr, size_t litLength, const BYTE* literals, size_t offset, size_t matchLength)
-    BYTE* op_lit = seqStorePtr->lit;
-    BYTE* const l_end = op_lit + litLength;
-    /* copy Literals */
-    while (op_lit<l_end) COPY8(op_lit, literals);
-    seqStorePtr->lit += litLength;
-    /* literal Length */
-    if (litLength >= MaxLL)
-    {
-        *(seqStorePtr->litLength++) = MaxLL;
-        if (litLength<255 + MaxLL)
-            *(seqStorePtr->dumps++) = (BYTE)(litLength - MaxLL);
-        else
-        {
-            *(seqStorePtr->dumps++) = 255;
-            ZSTD_writeLE32(seqStorePtr->dumps, (U32)litLength); seqStorePtr->dumps += 3;
-        }
-    }
-    else *(seqStorePtr->litLength++) = (BYTE)litLength;
-    /* match offset */
-    *(seqStorePtr->offset++) = (U32)offset;
-    /* match Length */
-    if (matchLength >= MaxML)
-    {
-        *(seqStorePtr->matchLength++) = MaxML;
-        if (matchLength < 255+MaxML)
-            *(seqStorePtr->dumps++) = (BYTE)(matchLength - MaxML);
-        else
-        {
-            *(seqStorePtr->dumps++) = 255;
-            ZSTD_writeLE32(seqStorePtr->dumps, (U32)matchLength); seqStorePtr->dumps+=3;
-        }
-    }
-    else *(seqStorePtr->matchLength++) = (BYTE)matchLength;
-//static const U32 hashMask = (1<<HASH_LOG)-1;
-//static const U64 prime5bytes =         889523592379ULL;
-//static const U64 prime6bytes =      227718039650203ULL;
-static const U64 prime7bytes =    58295818150454627ULL;
-//static const U64 prime8bytes = 14923729446516375013ULL;
-//static U32   ZSTD_hashPtr(const void* p) { return (U32) _bextr_u64(*(U64*)p * prime7bytes, (56-HASH_LOG), HASH_LOG); }
-//static U32   ZSTD_hashPtr(const void* p) { return ( (*(U64*)p * prime7bytes) << 8 >> (64-HASH_LOG)); }
-//static U32   ZSTD_hashPtr(const void* p) { return ( (*(U64*)p * prime7bytes) >> (56-HASH_LOG)) & ((1<<HASH_LOG)-1); }
-//static U32   ZSTD_hashPtr(const void* p) { return ( ((*(U64*)p & 0xFFFFFFFFFFFFFF) * prime7bytes) >> (64-HASH_LOG)); }
-//static U32   ZSTD_hashPtr(const void* p) { return ( (*(U64*)p * prime8bytes) >> (64-HASH_LOG)); }
-static U32   ZSTD_hashPtr(const void* p) { return ( (*(U64*)p * prime7bytes) >> (56-HASH_LOG)) & HASH_MASK; }
-//static U32   ZSTD_hashPtr(const void* p) { return ( (*(U64*)p * prime6bytes) >> (48-HASH_LOG)) & HASH_MASK; }
-//static U32   ZSTD_hashPtr(const void* p) { return ( (*(U64*)p * prime5bytes) >> (40-HASH_LOG)) & HASH_MASK; }
-//static U32   ZSTD_hashPtr(const void* p) { return ( (*(U32*)p * KNUTH) >> (32-HASH_LOG)); }
-static void  ZSTD_addPtr(U32* table, const BYTE* p, const BYTE* start) { table[ZSTD_hashPtr(p)] = (U32)(p-start); }
-static const BYTE* ZSTD_updateMatch(U32* table, const BYTE* p, const BYTE* start)
-    U32 h = ZSTD_hashPtr(p);
-    const BYTE* r;
-    r = table[h] + start;
-    //table[h] = (U32)(p - start);
-    ZSTD_addPtr(table, p, start);
-    return r;
-static int ZSTD_checkMatch(const BYTE* match, const BYTE* ip)
-    return ZSTD_read32(match) == ZSTD_read32(ip);
-static size_t ZSTD_compressBlock(void* cctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize)
-    cctxi_t* ctx = (cctxi_t*) cctx;
-    U32*  HashTable = (U32*)(ctx->hashTable);
-    seqStore_t* seqStorePtr = &(ctx->seqStore);
-    const BYTE* const base = ctx->base;
-    const BYTE* const istart = (const BYTE*)src;
-    const BYTE* ip = istart + 1;
-    const BYTE* anchor = istart;
-    const BYTE* const iend = istart + srcSize;
-    const BYTE* const ilimit = iend - 16;
-    size_t prevOffset=0, offset=0;
-    size_t lastLLSize;
-    /* init */
-    ZSTD_resetSeqStore(seqStorePtr);
-    /* Main Search Loop */
-    while (ip < ilimit)
-    {
-        const BYTE* match = (BYTE*) ZSTD_updateMatch(HashTable, ip, base);
-        if (!ZSTD_checkMatch(match,ip)) { ip += ((ip-anchor) >> g_searchStrength) + 1; continue; }
-        /* catch up */
-        while ((ip>anchor) && (match>base) && (ip[-1] == match[-1])) { ip--; match--; }
-        {
-            size_t litLength = ip-anchor;
-            size_t matchLength = ZSTD_count(ip+MINMATCH, match+MINMATCH, iend);
-            size_t offsetCode;
-            if (litLength) prevOffset = offset;
-            offsetCode = ip-match;
-            if (offsetCode == prevOffset) offsetCode = 0;
-            prevOffset = offset;
-            offset = ip-match;
-            ZSTD_storeSeq(seqStorePtr, litLength, anchor, offsetCode, matchLength);
-            /* Fill Table */
-            ZSTD_addPtr(HashTable, ip+1, base);
-            ip += matchLength + MINMATCH;
-            if (ip<=iend-8) ZSTD_addPtr(HashTable, ip-2, base);
-            anchor = ip;
-        }
-    }
-    /* Last Literals */
-    lastLLSize = iend - anchor;
-    memcpy(seqStorePtr->lit, anchor, lastLLSize);
-    seqStorePtr->lit += lastLLSize;
-    /* Finale compression stage */
-    return ZSTD_compressSequences((BYTE*)dst, maxDstSize,
-                                  seqStorePtr, lastLLSize, srcSize);
-size_t ZSTD_compressBegin(ZSTD_cctx_t ctx, void* dst, size_t maxDstSize)
-    /* Sanity check */
-    if (maxDstSize < ZSTD_frameHeaderSize) return (size_t)-ZSTD_ERROR_maxDstSize_tooSmall;
-    /* Init */
-    ZSTD_resetCCtx(ctx);
-    /* Write Header */
-    ZSTD_writeBE32(dst, ZSTD_magicNumber);
-    return ZSTD_frameHeaderSize;
-/* this should be auto-vectorized by compiler */
-static void ZSTD_scaleDownCtx(void* cctx, const U32 limit)
-    cctxi_t* ctx = (cctxi_t*) cctx;
-    int i;
-#if defined(__AVX2__)   /* <immintrin.h> */
-    /* AVX2 version */
-    __m256i* h = ctx->hashTable;
-    const __m256i limit8 = _mm256_set1_epi32(limit);
-    for (i=0; i<(HASH_TABLESIZE>>3); i++)
-    {
-        __m256i src =_mm256_loadu_si256((const __m256i*)(h+i));
-  const __m256i dec = _mm256_min_epu32(src, limit8);
-                src = _mm256_sub_epi32(src, dec);
-        _mm256_storeu_si256((__m256i*)(h+i), src);
-    }
-    U32* h = ctx->hashTable;
-    for (i=0; i<HASH_TABLESIZE; ++i)
-    {
-        U32 dec;
-        if (h[i] > limit) dec = limit; else dec = h[i];
-        h[i] -= dec;
-    }
-/* this should be auto-vectorized by compiler */
-static void ZSTD_limitCtx(void* cctx, const U32 limit)
-    cctxi_t* ctx = (cctxi_t*) cctx;
-    int i;
-    if (limit > g_maxLimit)
-    {
-        ZSTD_scaleDownCtx(cctx, limit);
-        ctx->base += limit;
-        ctx->current -= limit;
-        ctx->nextUpdate -= limit;
-        return;
-    }
-#if defined(__AVX2__)   /* <immintrin.h> */
-    /* AVX2 version */
-    {
-        __m256i* h = ctx->hashTable;
-        const __m256i limit8 = _mm256_set1_epi32(limit);
-        //printf("Address h : %0X\n", (U32)h);    // address test
-        for (i=0; i<(HASH_TABLESIZE>>3); i++)
-        {
-            __m256i src =_mm256_loadu_si256((const __m256i*)(h+i));   // Unfortunately, clang doesn't guarantee 32-bytes alignment
-                    src = _mm256_max_epu32(src, limit8);
-            _mm256_storeu_si256((__m256i*)(h+i), src);
-        }
-    }
-    {
-        U32* h = (U32*)(ctx->hashTable);
-        for (i=0; i<HASH_TABLESIZE; ++i)
-        {
-            if (h[i] < limit) h[i] = limit;
-        }
-    }
-size_t ZSTD_compressContinue(ZSTD_cctx_t cctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize)
-    cctxi_t* ctx = (cctxi_t*) cctx;
-    const BYTE* const istart = (const BYTE* const)src;
-    const BYTE* ip = istart;
-    BYTE* const ostart = (BYTE* const)dst;
-    BYTE* op = ostart;
-    const U32 updateRate = 2 * BLOCKSIZE;
-    /*  Init */
-    if (ctx->base==NULL)
-        ctx->base = (const BYTE*)src, ctx->current=0, ctx->nextUpdate = g_maxDistance;
-    if (src != ctx->base + ctx->current)   /* not contiguous */
-    {
-            ZSTD_resetCCtx(ctx);
-            ctx->base = (const BYTE*)src;
-            ctx->current = 0;
-    }
-    ctx->current += (U32)srcSize;
-    while (srcSize)
-    {
-        size_t cSize;
-        size_t blockSize = BLOCKSIZE;
-        if (blockSize > srcSize) blockSize = srcSize;
-        /* update hash table */
-        if (g_maxDistance <= BLOCKSIZE)   /* static test => all blocks are independent */
-        {
-            ZSTD_resetCCtx(ctx);
-            ctx->base = ip;
-            ctx->current=0;
-        }
-        else if (ip >= ctx->base + ctx->nextUpdate)
-        {
-            ctx->nextUpdate += updateRate;
-            ZSTD_limitCtx(ctx, ctx->nextUpdate - g_maxDistance);
-        }
-        /* compress */
-        if (maxDstSize < ZSTD_blockHeaderSize) return (size_t)-ZSTD_ERROR_maxDstSize_tooSmall;
-        cSize = ZSTD_compressBlock(ctx, op+ZSTD_blockHeaderSize, maxDstSize-ZSTD_blockHeaderSize, ip, blockSize);
-        if (cSize == 0)
-        {
-            cSize = ZSTD_noCompressBlock(op, maxDstSize, ip, blockSize);   /* block is not compressible */
-            if (ZSTD_isError(cSize)) return cSize;
-        }
-        else
-        {
-            if (ZSTD_isError(cSize)) return cSize;
-            op[0] = (BYTE)(cSize>>16);
-            op[1] = (BYTE)(cSize>>8);
-            op[2] = (BYTE)cSize;
-            op[0] += (BYTE)(bt_compressed << 6); /* is a compressed block */
-            cSize += 3;
-        }
-        op += cSize;
-        maxDstSize -= cSize;
-        ip += blockSize;
-        srcSize -= blockSize;
-    }
-    return op-ostart;
-size_t ZSTD_compressEnd(ZSTD_cctx_t ctx, void* dst, size_t maxDstSize)
-    BYTE* op = (BYTE*)dst;
-    /* Sanity check */
-    (void)ctx;
-    if (maxDstSize < ZSTD_blockHeaderSize) return (size_t)-ZSTD_ERROR_maxDstSize_tooSmall;
-    /* End of frame */
-    op[0] = (BYTE)(bt_end << 6);
-    op[1] = 0;
-    op[2] = 0;
-    return 3;
-static size_t ZSTD_compressCCtx(void* ctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize)
-    BYTE* const ostart = (BYTE* const)dst;
-    BYTE* op = ostart;
-    /* Header */
-    {
-        size_t headerSize = ZSTD_compressBegin(ctx, dst, maxDstSize);
-        if(ZSTD_isError(headerSize)) return headerSize;
-        op += headerSize;
-        maxDstSize -= headerSize;
-    }
-    /* Compression */
-    {
-        size_t cSize = ZSTD_compressContinue(ctx, op, maxDstSize, src, srcSize);
-        if (ZSTD_isError(cSize)) return cSize;
-        op += cSize;
-        maxDstSize -= cSize;
-    }
-    /* Close frame */
-    {
-        size_t endSize = ZSTD_compressEnd(ctx, op, maxDstSize);
-        if(ZSTD_isError(endSize)) return endSize;
-        op += endSize;
-    }
-    return (op - ostart);
-size_t ZSTD_compress(void* dst, size_t maxDstSize, const void* src, size_t srcSize)
-    void* ctx;
-    size_t r;
-    ctx = ZSTD_createCCtx();
-    r = ZSTD_compressCCtx(ctx, dst, maxDstSize, src, srcSize);
-    ZSTD_freeCCtx(ctx);
-    return r;
-*   Decompression code
-size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, blockProperties_t* bpPtr)
-    const BYTE* const in = (const BYTE* const)src;
-    BYTE headerFlags;
-    U32 cSize;
-    if (srcSize < 3) return (size_t)-ZSTD_ERROR_wrongSrcSize;
-    headerFlags = *in;
-    cSize = in[2] + (in[1]<<8) + ((in[0] & 7)<<16);
-    bpPtr->blockType = (blockType_t)(headerFlags >> 6);
-    bpPtr->origSize = (bpPtr->blockType == bt_rle) ? cSize : 0;
-    if (bpPtr->blockType == bt_end) return 0;
-    if (bpPtr->blockType == bt_rle) return 1;
-    return cSize;
-static size_t ZSTD_copyUncompressedBlock(void* dst, size_t maxDstSize, const void* src, size_t srcSize)
-    if (srcSize > maxDstSize) return (size_t)-ZSTD_ERROR_maxDstSize_tooSmall;
-    memcpy(dst, src, srcSize);
-    return srcSize;
-/* force inline : 'fast' really needs to be evaluated at compile time */
-FORCE_INLINE size_t ZSTD_decompressLiterals_usingDTable_generic(
-                       void* const dst, size_t maxDstSize,
-                 const void* src, size_t srcSize,
-                 const void* DTable, U32 fast)
-    BYTE* op = (BYTE*) dst;
-    BYTE* const olimit = op;
-    BYTE* const oend = op + maxDstSize;
-    FSE_DStream_t bitD;
-    FSE_DState_t state1, state2;
-    size_t errorCode;
-    /* Init */
-    errorCode = FSE_initDStream(&bitD, src, srcSize);
-    if (FSE_isError(errorCode)) return (size_t)-ZSTD_ERROR_GENERIC;
-    FSE_initDState(&state1, &bitD, DTable);
-    FSE_initDState(&state2, &bitD, DTable);
-    op = oend;
-    /* 2-4 symbols per loop */
-    while (!FSE_reloadDStream(&bitD) && (op>olimit+3))
-    {
-        *--op = fast ? FSE_decodeSymbolFast(&state1, &bitD) : FSE_decodeSymbol(&state1, &bitD);
-        if (LitFSELog*2+7 > sizeof(size_t)*8)    /* This test must be static */
-            FSE_reloadDStream(&bitD);
-        *--op = fast ? FSE_decodeSymbolFast(&state2, &bitD) : FSE_decodeSymbol(&state2, &bitD);
-        if (LitFSELog*4+7 < sizeof(size_t)*8)    /* This test must be static */
-        {
-            *--op = fast ? FSE_decodeSymbolFast(&state1, &bitD) : FSE_decodeSymbol(&state1, &bitD);
-            *--op = fast ? FSE_decodeSymbolFast(&state2, &bitD) : FSE_decodeSymbol(&state2, &bitD);
-        }
-    }
-    /* tail */
-    while (1)
-    {
-        if ( (FSE_reloadDStream(&bitD)>2) || (op==olimit) || (FSE_endOfDState(&state1) && FSE_endOfDStream(&bitD)) )
-            break;
-        *--op = fast ? FSE_decodeSymbolFast(&state1, &bitD) : FSE_decodeSymbol(&state1, &bitD);
-        if ( (FSE_reloadDStream(&bitD)>2) || (op==olimit) || (FSE_endOfDState(&state2) && FSE_endOfDStream(&bitD)) )
-            break;
-        *--op = fast ? FSE_decodeSymbolFast(&state2, &bitD) : FSE_decodeSymbol(&state2, &bitD);
-    }
-    /* end ? */
-    if (FSE_endOfDStream(&bitD) && FSE_endOfDState(&state1) && FSE_endOfDState(&state2) )
-        return oend-op;
-    if (op==olimit) return (size_t)-ZSTD_ERROR_maxDstSize_tooSmall;   /* dst buffer is full, but cSrc unfinished */
-    return (size_t)-ZSTD_ERROR_GENERIC;
-static size_t ZSTD_decompressLiterals_usingDTable(
-                       void* const dst, size_t maxDstSize,
-                 const void* src, size_t srcSize,
-                 const void* DTable, U32 fast)
-    if (fast) return ZSTD_decompressLiterals_usingDTable_generic(dst, maxDstSize, src, srcSize, DTable, 1);
-    return ZSTD_decompressLiterals_usingDTable_generic(dst, maxDstSize, src, srcSize, DTable, 0);
-static size_t ZSTD_decompressLiterals(void* ctx, void* dst, size_t maxDstSize,
-                                const void* src, size_t srcSize)
-    /* assumed : blockType == blockCompressed */
-    const BYTE* ip = (const BYTE*)src;
-    short norm[256];
-    void* DTable = ctx;
-    U32 maxSymbolValue = 255;
-    U32 tableLog;
-    U32 fastMode;
-    size_t errorCode;
-    if (srcSize < 2) return (size_t)-ZSTD_ERROR_wrongLBlockSize;   /* too small input size */
-    errorCode = FSE_readHeader (norm, &maxSymbolValue, &tableLog, ip, srcSize);
-    if (FSE_isError(errorCode)) return (size_t)-ZSTD_ERROR_GENERIC;
-    ip += errorCode;
-    srcSize -= errorCode;
-    errorCode = FSE_buildDTable (DTable, norm, maxSymbolValue, tableLog);
-    if (FSE_isError(errorCode)) return (size_t)-ZSTD_ERROR_GENERIC;
-    fastMode = (U32)errorCode;
-    return ZSTD_decompressLiterals_usingDTable (dst, maxDstSize, ip, srcSize, DTable, fastMode);
-size_t ZSTD_decodeLiteralsBlock(void* ctx,
-                                void* dst, size_t maxDstSize,
-                          const BYTE** litPtr,
-                          const void* src, size_t srcSize)
-    const BYTE* const istart = (const BYTE* const)src;
-    const BYTE* ip = istart;
-    BYTE* const ostart = (BYTE* const)dst;
-    BYTE* const oend = ostart + maxDstSize;
-    blockProperties_t litbp;
-    size_t litcSize = ZSTD_getcBlockSize(src, srcSize, &litbp);
-    if (ZSTD_isError(litcSize)) return litcSize;
-    if (litcSize > srcSize - ZSTD_blockHeaderSize) return (size_t)-ZSTD_ERROR_wrongLBlockSize;
-    ip += ZSTD_blockHeaderSize;
-    switch(litbp.blockType)
-    {
-    case bt_raw: *litPtr = ip; ip+= litcSize; break;
-    case bt_rle:
-        {
-            size_t rleSize = litbp.origSize;
-            memset(oend - rleSize, *ip, rleSize);
-            *litPtr = oend - rleSize;
-            ip++;
-            break;
-        }
-    case bt_compressed:
-        {
-            size_t cSize = ZSTD_decompressLiterals(ctx, dst, maxDstSize, ip, litcSize);
-            if (ZSTD_isError(cSize)) return cSize;
-            *litPtr = oend - cSize;
-            ip += litcSize;
-            break;
-        }
-    default:
-        return (size_t)-ZSTD_ERROR_GENERIC;
-    }
-    return ip-istart;
-size_t ZSTD_decodeSeqHeaders(size_t* lastLLPtr, const BYTE** dumpsPtr,
-                               void* DTableLL, void* DTableML, void* DTableOffb,
-                         const void* src, size_t srcSize)
-    const BYTE* const istart = (const BYTE* const)src;
-    const BYTE* ip = istart;
-    const BYTE* const iend = istart + srcSize;
-    U32 LLtype, Offtype, MLtype;
-    U32 LLlog, Offlog, MLlog;
-    size_t dumpsLength;
-    /* SeqHead */
-    ip += ZSTD_readProgressive(lastLLPtr, ip);
-    LLtype  = *ip >> 6;
-    Offtype = (*ip >> 4) & 3;
-    MLtype  = (*ip >> 2) & 3;
-    if (*ip & 2)
-    {
-        dumpsLength  = ip[2];
-        dumpsLength += ip[1] << 8;
-        ip += 3;
-    }
-    else
-    {
-        dumpsLength  = ip[1];
-        dumpsLength += (ip[0] & 1) << 8;
-        ip += 2;
-    }
-    *dumpsPtr = ip;
-    ip += dumpsLength;
-    /* sequences */
-    {
-        S16 norm[MaxML+1];    /* assumption : MaxML >= MaxLL and MaxOff */
-        size_t headerSize;
-        /* Build DTables */
-        switch(LLtype)
-        {
-        U32 max;
-        case bt_rle :
-            LLlog = 0;
-            FSE_buildDTable_rle(DTableLL, *ip++); break;
-        case bt_raw :
-            LLlog = LLbits;
-            FSE_buildDTable_raw(DTableLL, LLbits); break;
-        default :
-            max = MaxLL;
-            headerSize = FSE_readHeader(norm, &max, &LLlog, ip, iend-ip);
-            if (FSE_isError(headerSize)) return (size_t)-ZSTD_ERROR_GENERIC;
-            ip += headerSize;
-            FSE_buildDTable(DTableLL, norm, max, LLlog);
-        }
-        switch(Offtype)
-        {
-        U32 max;
-        case bt_rle :
-            Offlog = 0;
-            FSE_buildDTable_rle(DTableOffb, *ip++); break;
-        case bt_raw :
-            Offlog = Offbits;
-            FSE_buildDTable_raw(DTableOffb, Offbits); break;
-        default :
-            max = MaxOff;
-            headerSize = FSE_readHeader(norm, &max, &Offlog, ip, iend-ip);
-            if (FSE_isError(headerSize)) return (size_t)-ZSTD_ERROR_GENERIC;
-            ip += headerSize;
-            FSE_buildDTable(DTableOffb, norm, max, Offlog);
-        }
-        switch(MLtype)
-        {
-        U32 max;
-        case bt_rle :
-            MLlog = 0;
-            FSE_buildDTable_rle(DTableML, *ip++); break;
-        case bt_raw :
-            MLlog = MLbits;
-            FSE_buildDTable_raw(DTableML, MLbits); break;
-        default :
-            max = MaxML;
-            headerSize = FSE_readHeader(norm, &max, &MLlog, ip, iend-ip);
-            if (FSE_isError(headerSize)) return (size_t)-ZSTD_ERROR_GENERIC;
-            ip += headerSize;
-            FSE_buildDTable(DTableML, norm, max, MLlog);
-        }
-    }
-    return ip-istart;
-#define ZSTD_prefetch(p) { const BYTE pByte = *(volatile const BYTE*)p; }
-FORCE_INLINE size_t ZSTD_decompressBlock(void* ctx, void* dst, size_t maxDstSize,
-                             const void* src, size_t srcSize)
-    const BYTE* ip = (const BYTE*)src;
-    const BYTE* const iend = ip + srcSize;
-    BYTE* const ostart = (BYTE* const)dst;
-    BYTE* op = ostart;
-    BYTE* const oend = ostart + maxDstSize;
-    size_t errorCode;
-    size_t lastLLSize;
-    const BYTE* dumps;
-    const BYTE* litPtr;
-    const BYTE* litEnd;
-    const size_t dec32table[] = {4, 1, 2, 1, 4, 4, 4, 4};   /* added */
-    const size_t dec64table[] = {8, 8, 8, 7, 8, 9,10,11};   /* substracted */
-    void* DTableML = ctx;
-    void* DTableLL = ((U32*)ctx) + FSE_DTABLE_SIZE_U32(MLFSELog);
-    void* DTableOffb = ((U32*)DTableLL) + FSE_DTABLE_SIZE_U32(LLFSELog);
-    /* blockType == blockCompressed, srcSize is trusted */
-    /* literal sub-block */
-    errorCode = ZSTD_decodeLiteralsBlock(ctx, dst, maxDstSize, &litPtr, src, srcSize);
-    if (ZSTD_isError(errorCode)) return errorCode;
-    ip += errorCode;
-    /* Build Decoding Tables */
-    errorCode = ZSTD_decodeSeqHeaders(&lastLLSize, &dumps,
-                                      DTableLL, DTableML, DTableOffb,
-                                      ip, iend-ip);
-    if (ZSTD_isError(errorCode)) return errorCode;
-    /* end pos */
-    if ((litPtr>=ostart) && (litPtr<=oend))
-        litEnd = oend - lastLLSize;
-    else
-        litEnd = ip - lastLLSize;
-    ip += errorCode;
-    /* decompression */
-    {
-        FSE_DStream_t DStream;
-        FSE_DState_t stateLL, stateOffb, stateML;
-        size_t prevOffset = 0, offset = 0;
-        size_t qutt=0;
-        FSE_initDStream(&DStream, ip, iend-ip);
-        FSE_initDState(&stateLL, &DStream, DTableLL);
-        FSE_initDState(&stateOffb, &DStream, DTableOffb);
-        FSE_initDState(&stateML, &DStream, DTableML);
-        while (FSE_reloadDStream(&DStream)<2)
-        {
-            U32 nbBits, offsetCode;
-            const BYTE* match;
-            size_t litLength;
-            size_t matchLength;
-            size_t newOffset;
-            /* Literals */
-            litLength = FSE_decodeSymbol(&stateLL, &DStream);
-            if (litLength) prevOffset = offset;
-            if (litLength == MaxLL)
-            {
-                BYTE add = *dumps++;
-                if (add < 255) litLength += add;
-                else
-                {
-                    //litLength = (*(U32*)dumps) & 0xFFFFFF;
-                    litLength = ZSTD_readLE32(dumps) & 0xFFFFFF;
-                    dumps += 3;
-                }
-            }
-            if (((size_t)(litPtr - op) < 8) || ((size_t)(oend-(litPtr+litLength)) < 8))
-                memmove(op, litPtr, litLength);   /* overwrite risk */
-            else
-                ZSTD_wildcopy(op, litPtr, litLength);
-            op += litLength;
-            litPtr += litLength;
-            /* Offset */
-            offsetCode = FSE_decodeSymbol(&stateOffb, &DStream);
-            if (ZSTD_32bits()) FSE_reloadDStream(&DStream);
-            nbBits = offsetCode - 1;
-            if (offsetCode==0) nbBits = 0;   /* cmove */
-            newOffset = FSE_readBits(&DStream, nbBits);
-            if (ZSTD_32bits()) FSE_reloadDStream(&DStream);
-            newOffset += (size_t)1 << nbBits;
-            if (offsetCode==0) newOffset = prevOffset;
-            match = op - newOffset;
-            prevOffset = offset;
-            offset = newOffset;
-            /* MatchLength */
-            matchLength = FSE_decodeSymbol(&stateML, &DStream);
-            if (matchLength == MaxML)
-            {
-                BYTE add = *dumps++;
-                if (add < 255) matchLength += add;
-                else
-                {
-                    matchLength = ZSTD_readLE32(dumps) & 0xFFFFFF;
-                    dumps += 3;
-                }
-            }
-            matchLength += MINMATCH;
-            /* copy Match */
-            {
-                BYTE* const endMatch = op + matchLength;
-                U64 saved[2];
-                if ((size_t)(litPtr - endMatch) < 12)
-                {
-                    qutt = endMatch + 12 - litPtr;
-                    if ((litPtr + qutt) > oend) qutt = oend-litPtr;
-                    memcpy(saved, litPtr, qutt);
-                }
-                if (offset < 8)
-                {
-                    const size_t dec64 = dec64table[offset];
-                    op[0] = match[0];
-                    op[1] = match[1];
-                    op[2] = match[2];
-                    op[3] = match[3];
-                    match += dec32table[offset];
-                    ZSTD_copy4(op+4, match);
-                    match -= dec64;
-                } else { ZSTD_copy8(op, match); }
-                if (endMatch > oend-12)
-                {
-                    if (op < oend-16)
-                    {
-                        ZSTD_wildcopy(op+8, match+8, (oend-8) - (op+8));
-                        match += (oend-8) - op;
-                        op = oend-8;
-                    }
-                    while (op<endMatch) *op++ = *match++;
-                }
-                else
-                    ZSTD_wildcopy(op+8, match+8, matchLength-8);   /* works even if matchLength < 8 */
-                op = endMatch;
-                if ((size_t)(litPtr - endMatch) < 12)
-                    memcpy((void*)litPtr, saved, qutt);
-            }
-        }
-        /* check if reached exact end */
-        if (FSE_reloadDStream(&DStream) > 2) return (size_t)-ZSTD_ERROR_GENERIC;   /* requested too much : data is corrupted */
-        if (!FSE_endOfDState(&stateLL) && !FSE_endOfDState(&stateML) && !FSE_endOfDState(&stateOffb)) goto _another_round;   /* some ultra-compressible sequence remain ! */
-        if (litPtr != litEnd) goto _another_round;   /* literals not entirely spent */
-        /* last literal segment */
-        if (op != litPtr) memmove(op, litPtr, lastLLSize);
-        op += lastLLSize;
-    }
-    return op-ostart;
-static size_t ZSTD_decompressDCtx(void* ctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize)
-    const BYTE* ip = (const BYTE*)src;
-    const BYTE* iend = ip + srcSize;
-    BYTE* const ostart = (BYTE* const)dst;
-    BYTE* op = ostart;
-    BYTE* const oend = ostart + maxDstSize;
-    size_t remainingSize = srcSize;
-    U32 magicNumber;
-    size_t errorCode=0;
-    blockProperties_t blockProperties;
-    /* Header */
-    if (srcSize < ZSTD_frameHeaderSize) return (size_t)-ZSTD_ERROR_wrongSrcSize;
-    magicNumber = ZSTD_readBE32(src);
-    if (magicNumber != ZSTD_magicNumber) return (size_t)-ZSTD_ERROR_wrongMagicNumber;
-    ip += ZSTD_frameHeaderSize; remainingSize -= ZSTD_frameHeaderSize;
-    while (1)
-    {
-        size_t blockSize = ZSTD_getcBlockSize(ip, iend-ip, &blockProperties);
-        if (ZSTD_isError(blockSize))
-            return blockSize;
-        ip += ZSTD_blockHeaderSize;
-        remainingSize -= ZSTD_blockHeaderSize;
-        if (ip+blockSize > iend)
-            return (size_t)-ZSTD_ERROR_wrongSrcSize;
-        switch(blockProperties.blockType)
-        {
-        case bt_compressed:
-            errorCode = ZSTD_decompressBlock(ctx, op, oend-op, ip, blockSize);
-            break;
-        case bt_raw :
-            errorCode = ZSTD_copyUncompressedBlock(op, oend-op, ip, blockSize);
-            break;
-        case bt_rle :
-            return (size_t)-ZSTD_ERROR_GENERIC;   /* not yet handled */
-            break;
-        case bt_end :
-            /* end of frame */
-            if (remainingSize) return (size_t)-ZSTD_ERROR_wrongSrcSize;
-            break;
-        default:
-            return (size_t)-ZSTD_ERROR_GENERIC;
-        }
-        if (blockSize == 0) break;   /* bt_end */
-        if (ZSTD_isError(errorCode)) return errorCode;
-        op += errorCode;
-        ip += blockSize;
-        remainingSize -= blockSize;
-    }
-    return op-ostart;
-size_t ZSTD_decompress(void* dst, size_t maxDstSize, const void* src, size_t srcSize)
-    return ZSTD_decompressDCtx(ctx, dst, maxDstSize, src, srcSize);
-*  Streaming Decompression API
-typedef struct
-    size_t expected;
-    blockType_t bType;
-    U32 phase;
-} dctx_t;
-ZSTD_dctx_t ZSTD_createDCtx(void)
-    dctx_t* dctx = (dctx_t*)malloc(sizeof(dctx_t));
-    dctx->expected = ZSTD_frameHeaderSize;
-    dctx->phase = 0;
-    return (ZSTD_dctx_t)dctx;
-size_t ZSTD_freeDCtx(ZSTD_dctx_t dctx)
-    free(dctx);
-    return 0;
-size_t ZSTD_nextSrcSizeToDecompress(ZSTD_dctx_t dctx)
-    return ((dctx_t*)dctx)->expected;
-size_t ZSTD_decompressContinue(ZSTD_dctx_t dctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize)
-    dctx_t* ctx = (dctx_t*)dctx;
-    /* Sanity check */
-    if (srcSize != ctx->expected) return (size_t)-ZSTD_ERROR_wrongSrcSize;
-    /* Decompress : frame header */
-    if (ctx->phase == 0)
-    {
-        /* Check frame magic header */
-        U32 magicNumber = ZSTD_readBE32(src);
-        if (magicNumber != ZSTD_magicNumber) return (size_t)-ZSTD_ERROR_wrongMagicNumber;
-        ctx->phase = 1;
-        ctx->expected = ZSTD_blockHeaderSize;
-        return 0;
-    }
-    /* Decompress : block header */
-    if (ctx->phase == 1)
-    {
-        blockProperties_t bp;
-        size_t blockSize = ZSTD_getcBlockSize(src, ZSTD_blockHeaderSize, &bp);
-        if (ZSTD_isError(blockSize)) return blockSize;
-        if (bp.blockType == bt_end)
-        {
-            ctx->expected = 0;
-            ctx->phase = 0;
-        }
-        else
-        {
-            ctx->expected = blockSize;
-            ctx->bType = bp.blockType;
-            ctx->phase = 2;
-        }
-        return 0;
-    }
-    /* Decompress : block content */
-    {
-        size_t rSize;
-        switch(ctx->bType)
-        {
-        case bt_compressed:
-            rSize = ZSTD_decompressBlock(ctx, dst, maxDstSize, src, srcSize);
-            break;
-        case bt_raw :
-            rSize = ZSTD_copyUncompressedBlock(dst, maxDstSize, src, srcSize);
-            break;
-        case bt_rle :
-            return (size_t)-ZSTD_ERROR_GENERIC;   /* not yet handled */
-            break;
-        case bt_end :   /* should never happen (filtered at phase 1) */
-            rSize = 0;
-            break;
-        default:
-            return (size_t)-ZSTD_ERROR_GENERIC;
-        }
-        ctx->phase = 1;
-        ctx->expected = ZSTD_blockHeaderSize;
-        return rSize;
-    }
-/* <<<<< zstd.c EOF */
-typedef struct sszstdfilter sszstdfilter;
-struct sszstdfilter {
-	void *ctx;
-} sspacked;
-static int
-ss_zstdfilter_init(ssfilter *f, va_list args ssunused)
-	sszstdfilter *z = (sszstdfilter*)f->priv;
-	switch (f->op) {
-	case SS_FINPUT:
-		z->ctx = ZSTD_createCCtx();
-		if (ssunlikely(z->ctx == NULL))
-			return -1;
-		break;	
-	case SS_FOUTPUT:
-		z->ctx = NULL;
-		break;	
-	}
-	return 0;
-static int
-ss_zstdfilter_free(ssfilter *f)
-	sszstdfilter *z = (sszstdfilter*)f->priv;
-	switch (f->op) {
-	case SS_FINPUT:
-		ZSTD_freeCCtx(z->ctx);
-		break;	
-	case SS_FOUTPUT:
-		break;	
-	}
-	return 0;
-static int
-ss_zstdfilter_start(ssfilter *f, ssbuf *dest)
-	sszstdfilter *z = (sszstdfilter*)f->priv;
-	int rc;
-	size_t block;
-	size_t sz;
-	switch (f->op) {
-	case SS_FINPUT:;
-		block = ZSTD_frameHeaderSize;
-		rc = ss_bufensure(dest, f->a, block);
-		if (ssunlikely(rc == -1))
-			return -1;
-		sz = ZSTD_compressBegin(z->ctx, dest->p, block);
-		if (ssunlikely(ZSTD_isError(sz)))
-			return -1;
-		ss_bufadvance(dest, sz);
-		break;	
-	case SS_FOUTPUT:
-		/* do nothing */
-		break;
-	}
-	return 0;
-static int
-ss_zstdfilter_next(ssfilter *f, ssbuf *dest, char *buf, int size)
+static int
+ss_zstdfilter_next(ssfilter *f, ssbuf *dest, char *buf, int size)
 	sszstdfilter *z = (sszstdfilter*)f->priv;
 	int rc;
@@ -7346,7 +3468,7 @@ ss_zstdfilter_next(ssfilter *f, ssbuf *dest, char *buf, int size)
 		if (ssunlikely(ZSTD_isError(sz)))
 			return -1;
 		ss_bufadvance(dest, sz);
-		break;	
+		break;
 	case SS_FOUTPUT:
 		/* do a single-pass decompression.
diff --git a/third_party/zstd b/third_party/zstd
new file mode 160000
index 0000000000000000000000000000000000000000..d0407789f6b926222ac83183010918b090cf163d
--- /dev/null
+++ b/third_party/zstd
@@ -0,0 +1 @@
+Subproject commit d0407789f6b926222ac83183010918b090cf163d