From 0c3885a0070a78342c8ec64a8dd56eba7ff6f09a Mon Sep 17 00:00:00 2001 From: Konstantin Osipov <kostja@tarantool.org> Date: Sat, 2 Feb 2013 23:29:35 +0400 Subject: [PATCH] Replace murmur hash 2 with murmur hash 3. --- client/tarantool_checksum/CMakeLists.txt | 3 + client/tarantool_checksum/tc_generate.c | 4 +- client/tarantool_checksum/tc_verify.c | 1 - cmake/BuildMisc.cmake | 1 + include/assoc.h | 4 +- test/big/iterator.result | 6 +- test/big/sql.result | 10 +- test/box/socket.result | 8 +- test/unit/bit.c | 1 + third_party/PMurHash.c | 317 +++++++++++++++++++++++ third_party/PMurHash.h | 64 +++++ third_party/README | 6 + third_party/murmur_hash2.c | 64 ----- 13 files changed, 409 insertions(+), 80 deletions(-) create mode 100644 third_party/PMurHash.c create mode 100644 third_party/PMurHash.h delete mode 100644 third_party/murmur_hash2.c diff --git a/client/tarantool_checksum/CMakeLists.txt b/client/tarantool_checksum/CMakeLists.txt index e24280ad63..fadc74533c 100644 --- a/client/tarantool_checksum/CMakeLists.txt +++ b/client/tarantool_checksum/CMakeLists.txt @@ -16,6 +16,9 @@ list(APPEND util_checksum_sources ${CMAKE_SOURCE_DIR}/cfg/tarantool_box_cfg.c ${CMAKE_SOURCE_DIR}/cfg/prscfg.c) +list(APPEND util_checksum_sources + ${CMAKE_SOURCE_DIR}/third_party/PMurHash.c) + set_source_files_compile_flags( ${util_checksum_sources}) add_executable(${util_checksum} ${util_checksum_sources}) diff --git a/client/tarantool_checksum/tc_generate.c b/client/tarantool_checksum/tc_generate.c index 3ee329393b..2683a21792 100644 --- a/client/tarantool_checksum/tc_generate.c +++ b/client/tarantool_checksum/tc_generate.c @@ -43,7 +43,7 @@ #include <cfg/prscfg.h> #include <cfg/tarantool_box_cfg.h> -#include <third_party/murmur_hash2.c> +#include <third_party/PMurHash.h> #include <third_party/crc32.h> #include "tc_key.h" @@ -74,7 +74,7 @@ search_hash(const struct tc_key *k, struct tc_space *s) break; } case TC_SPACE_KEY_STRING: - h = MurmurHash2(TC_KEY_DATA(k, i), TC_KEY_SIZE(k, i), 
h); + h = PMurHash32(h, TC_KEY_DATA(k, i), TC_KEY_SIZE(k, i)); break; case TC_SPACE_KEY_UNKNOWN: assert(1); diff --git a/client/tarantool_checksum/tc_verify.c b/client/tarantool_checksum/tc_verify.c index 18321a55a6..aac9ba84a6 100644 --- a/client/tarantool_checksum/tc_verify.c +++ b/client/tarantool_checksum/tc_verify.c @@ -44,7 +44,6 @@ #include <cfg/prscfg.h> #include <cfg/tarantool_box_cfg.h> -#include <third_party/murmur_hash2.c> #include <third_party/crc32.h> #include "tc_key.h" diff --git a/cmake/BuildMisc.cmake b/cmake/BuildMisc.cmake index 8f90c771b6..e8975a5021 100644 --- a/cmake/BuildMisc.cmake +++ b/cmake/BuildMisc.cmake @@ -5,6 +5,7 @@ macro(libmisc_build) ${PROJECT_SOURCE_DIR}/third_party/crc32.c ${PROJECT_SOURCE_DIR}/third_party/proctitle.c ${PROJECT_SOURCE_DIR}/third_party/qsort_arg.c + ${PROJECT_SOURCE_DIR}/third_party/PMurHash.c ) if (NOT HAVE_MEMMEM) diff --git a/include/assoc.h b/include/assoc.h index 49db6f4b14..497e12ecfc 100644 --- a/include/assoc.h +++ b/include/assoc.h @@ -82,7 +82,7 @@ static inline int lstrcmp(const void *a, const void *b) return bl - al; return memcmp(a, b, al); } -#include <third_party/murmur_hash2.c> +#include <third_party/PMurHash.h> #define mh_name _lstrptr struct mh_lstrptr_node_t { const void *key; @@ -97,7 +97,7 @@ mh_strptr_hash(const mh_node_t *a, mh_hash_arg_t arg) { (void) arg; const void *_k = (a->key); const u32 l = load_varint32(&_k); - return (u32) MurmurHash2(_k, l, 13); + return PMurHash32(13, _k, l); } #define mh_hash(a, arg) mh_strptr_hash(a, arg) #define mh_eq_arg_t void * diff --git a/test/big/iterator.result b/test/big/iterator.result index 117481a34b..adb6172424 100644 --- a/test/big/iterator.result +++ b/test/big/iterator.result @@ -830,9 +830,9 @@ lua iterate(20, 4, 0, 1, box.index.GE, 'pid_001') --- sorted output $pid_001$ -$pid_002$ -$pid_005$ -$pid_017$ +$pid_007$ +$pid_011$ +$pid_019$ $pid_023$ ... 
lua iterate(20, 4, 0, 1, box.index.GE, 'pid_999') diff --git a/test/big/sql.result b/test/big/sql.result index 1fe0688f60..222809b43f 100644 --- a/test/big/sql.result +++ b/test/big/sql.result @@ -84,9 +84,9 @@ insert into t1 values ('key3', 'part1', 'part2_b') Insert OK, 1 row affected lua for k, v in box.space[1]:pairs() do print(v) end --- -830039403: {'part1', 'part2'} -863593835: {'part1', 'part2_b'} 846816619: {'part1', 'part2_a'} +863593835: {'part1', 'part2_b'} +830039403: {'part1', 'part2'} ... select * from t1 where k0='key1' Found 1 tuple: @@ -108,8 +108,10 @@ Found 3 tuples: [846816619, 'part1', 'part2_a'] [863593835, 'part1', 'part2_b'] call box.select_range(1, 0, 100, 'key2') -Found 1 tuple: +Found 3 tuples: +[830039403, 'part1', 'part2'] [846816619, 'part1', 'part2_a'] +[863593835, 'part1', 'part2_b'] call box.select_range(1, 1, 100, 'part1', 'part2_a') Found 2 tuples: [846816619, 'part1', 'part2_a'] @@ -336,9 +338,9 @@ insert into t4 values(3, 'Creature ') Insert OK, 1 row affected lua for k, v in box.space[4]:pairs() do print(v) end --- +2: {'Bilimbi'} 3: {'Creature '} 1: {'Aardvark '} -2: {'Bilimbi'} ... lua box.space[4].index[0].idx:min() --- diff --git a/test/box/socket.result b/test/box/socket.result index 9d0340784b..bf4be1d81b 100644 --- a/test/box/socket.result +++ b/test/box/socket.result @@ -54,13 +54,13 @@ lua s:connect('::1', '30303') --- - nil - error - - 111 - - Connection refused + - -1 + - Host name resolution failed ... lua s:error() --- - - 111 - - Connection refused + - -1 + - Host name resolution failed ... 
lua s:connect('127.0.0.1', '30303', 0.01) --- diff --git a/test/unit/bit.c b/test/unit/bit.c index 9b305369ef..a7cce03672 100644 --- a/test/unit/bit.c +++ b/test/unit/bit.c @@ -3,6 +3,7 @@ #include <stdio.h> #include <inttypes.h> #include <assert.h> +#include <stdlib.h> #include "unit.h" diff --git a/third_party/PMurHash.c b/third_party/PMurHash.c new file mode 100644 index 0000000000..017501264d --- /dev/null +++ b/third_party/PMurHash.c @@ -0,0 +1,317 @@ +/*----------------------------------------------------------------------------- + * MurmurHash3 was written by Austin Appleby, and is placed in the public + * domain. + * + * This implementation was written by Shane Day, and is also public domain. + * + * This is a portable ANSI C implementation of MurmurHash3_x86_32 (Murmur3A) + * with support for progressive processing. + */ + +/*----------------------------------------------------------------------------- + +If you want to understand the MurmurHash algorithm you would be much better +off reading the original source. Just point your browser at: +http://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp + + +What this version provides? + +1. Progressive data feeding. Useful when the entire payload to be hashed +does not fit in memory or when the data is streamed through the application. +Also useful when hashing a number of strings with a common prefix. A partial +hash of a prefix string can be generated and reused for each suffix string. + +2. Portability. Plain old C so that it should compile on any old compiler. +Both CPU endian and access-alignment neutral, but avoiding inefficient code +when possible depending on CPU capabilities. + +3. Drop in. I personally like nice self contained public domain code, making it +easy to pilfer without loads of refactoring to work properly in the existing +application code & makefile structure and mucking around with licence files. +Just copy PMurHash.h and PMurHash.c and you're ready to go. 
+ + +How does it work? + +We can only process entire 32 bit chunks of input, except for the very end +that may be shorter. So along with the partial hash we need to give back to +the caller a carry containing up to 3 bytes that we were unable to process. +This carry also needs to record the number of bytes the carry holds. I use +the low 2 bits as a count (0..3) and the carry bytes are shifted into the +high byte in stream order. + +To handle endianness I simply use a macro that reads a uint32_t and define +that macro to be a direct read on little endian machines, a read and swap +on big endian machines, or a byte-by-byte read if the endianness is unknown. + +-----------------------------------------------------------------------------*/ + + +#include "PMurHash.h" + +/* I used ugly type names in the header to avoid potential conflicts with + * application or system typedefs & defines. Since I'm not including any more + * headers below here I can rename these so that the code reads like C99 */ +#undef uint32_t +#define uint32_t MH_UINT32 +#undef uint8_t +#define uint8_t MH_UINT8 + +/* MSVC warnings we choose to ignore */ +#if defined(_MSC_VER) + #pragma warning(disable: 4127) /* conditional expression is constant */ +#endif + +/*----------------------------------------------------------------------------- + * Endianness, misalignment capabilities and util macros + * + * The following 3 macros are defined in this section. The other macros defined + * are only needed to help derive these 3.
+ * + * READ_UINT32(x) Read a little endian unsigned 32-bit int + * UNALIGNED_SAFE Defined if READ_UINT32 works on non-word boundaries + * ROTL32(x,r) Rotate x left by r bits + */ + +/* Convention is to define __BYTE_ORDER == to one of these values */ +#if !defined(__BIG_ENDIAN) + #define __BIG_ENDIAN 4321 +#endif +#if !defined(__LITTLE_ENDIAN) + #define __LITTLE_ENDIAN 1234 +#endif + +/* I386 */ +#if defined(_M_IX86) || defined(__i386__) || defined(__i386) || defined(i386) + #define __BYTE_ORDER __LITTLE_ENDIAN + #define UNALIGNED_SAFE +#endif + +/* gcc 'may' define __LITTLE_ENDIAN__ or __BIG_ENDIAN__ to 1 (Note the trailing __), + * or even _LITTLE_ENDIAN or _BIG_ENDIAN (Note the single _ prefix) */ +#if !defined(__BYTE_ORDER) + #if defined(__LITTLE_ENDIAN__) && __LITTLE_ENDIAN__==1 || defined(_LITTLE_ENDIAN) && _LITTLE_ENDIAN==1 + #define __BYTE_ORDER __LITTLE_ENDIAN + #elif defined(__BIG_ENDIAN__) && __BIG_ENDIAN__==1 || defined(_BIG_ENDIAN) && _BIG_ENDIAN==1 + #define __BYTE_ORDER __BIG_ENDIAN + #endif +#endif + +/* gcc (usually) defines xEL/EB macros for ARM and MIPS endianness */ +#if !defined(__BYTE_ORDER) + #if defined(__ARMEL__) || defined(__MIPSEL__) + #define __BYTE_ORDER __LITTLE_ENDIAN + #endif + #if defined(__ARMEB__) || defined(__MIPSEB__) + #define __BYTE_ORDER __BIG_ENDIAN + #endif +#endif + +/* Now find best way we can to READ_UINT32 */ +#if __BYTE_ORDER==__LITTLE_ENDIAN + /* CPU endian matches murmurhash algorithm, so read 32-bit word directly */ + #define READ_UINT32(ptr) (*((uint32_t*)(ptr))) +#elif __BYTE_ORDER==__BIG_ENDIAN + /* TODO: Add additional cases below where a compiler provided bswap32 is available */ + #if defined(__GNUC__) && (__GNUC__>4 || (__GNUC__==4 && __GNUC_MINOR__>=3)) + #define READ_UINT32(ptr) (__builtin_bswap32(*((uint32_t*)(ptr)))) + #else + /* Without a known fast bswap32 we're just as well off doing this */ + #define READ_UINT32(ptr) (ptr[0]|ptr[1]<<8|ptr[2]<<16|ptr[3]<<24) + #define UNALIGNED_SAFE + #endif +#else + /*
Unknown endianness so last resort is to read individual bytes */ + #define READ_UINT32(ptr) (ptr[0]|ptr[1]<<8|ptr[2]<<16|ptr[3]<<24) + + /* Since we're not doing word-reads we can skip the messing about with realignment */ + #define UNALIGNED_SAFE +#endif + +/* Find best way to ROTL32 */ +#if defined(_MSC_VER) + #include <stdlib.h> /* Microsoft put _rotl declaration in here */ + #define ROTL32(x,r) _rotl(x,r) +#else + /* gcc recognises this code and generates a rotate instruction for CPUs with one */ + #define ROTL32(x,r) (((uint32_t)x << r) | ((uint32_t)x >> (32 - r))) +#endif + + +/*----------------------------------------------------------------------------- + * Core murmurhash algorithm macros */ + +#define C1 (0xcc9e2d51) +#define C2 (0x1b873593) + +/* This is the main processing body of the algorithm. It operates + * on each full 32-bits of input. */ +#define DOBLOCK(h1, k1) do{ \ + k1 *= C1; \ + k1 = ROTL32(k1,15); \ + k1 *= C2; \ + \ + h1 ^= k1; \ + h1 = ROTL32(h1,13); \ + h1 = h1*5+0xe6546b64; \ + }while(0) + + +/* Append unaligned bytes to carry, forcing hash churn if we have 4 bytes */ +/* cnt=bytes to process, h1=name of h1 var, c=carry, n=bytes in c, ptr/len=payload */ +#define DOBYTES(cnt, h1, c, n, ptr, len) do{ \ + int _i = cnt; \ + while(_i--) { \ + c = c>>8 | *ptr++<<24; \ + n++; len--; \ + if(n==4) { \ + DOBLOCK(h1, c); \ + n = 0; \ + } \ + } }while(0) + +/*---------------------------------------------------------------------------*/ + +/* Main hashing function. Initialise carry to 0 and h1 to 0 or an initial seed + * if wanted. Both ph1 and pcarry are required arguments.
*/ +void PMurHash32_Process(uint32_t *ph1, uint32_t *pcarry, const void *key, int len) +{ + uint32_t h1 = *ph1; + uint32_t c = *pcarry; + + const uint8_t *ptr = (uint8_t*)key; + const uint8_t *end; + + /* Extract carry count from low 2 bits of c value */ + int n = c & 3; + +#if defined(UNALIGNED_SAFE) + /* This CPU handles unaligned word access */ + + /* Consume any carry bytes */ + int i = (4-n) & 3; + if(i && i <= len) { + DOBYTES(i, h1, c, n, ptr, len); + } + + /* Process 32-bit chunks */ + end = ptr + len/4*4; + for( ; ptr < end ; ptr+=4) { + uint32_t k1 = READ_UINT32(ptr); + DOBLOCK(h1, k1); + } + +#else /*UNALIGNED_SAFE*/ + /* This CPU does not handle unaligned word access */ + + /* Consume enough so that the next data byte is word aligned */ + int i = -(long)ptr & 3; + if(i && i <= len) { + DOBYTES(i, h1, c, n, ptr, len); + } + + /* We're now aligned. Process in aligned blocks. Specialise for each possible carry count */ + end = ptr + len/4*4; + switch(n) { /* how many bytes in c */ + case 0: /* c=[----] w=[3210] b=[3210]=w c'=[----] */ + for( ; ptr < end ; ptr+=4) { + uint32_t k1 = READ_UINT32(ptr); + DOBLOCK(h1, k1); + } + break; + case 1: /* c=[0---] w=[4321] b=[3210]=c>>24|w<<8 c'=[4---] */ + for( ; ptr < end ; ptr+=4) { + uint32_t k1 = c>>24; + c = READ_UINT32(ptr); + k1 |= c<<8; + DOBLOCK(h1, k1); + } + break; + case 2: /* c=[10--] w=[5432] b=[3210]=c>>16|w<<16 c'=[54--] */ + for( ; ptr < end ; ptr+=4) { + uint32_t k1 = c>>16; + c = READ_UINT32(ptr); + k1 |= c<<16; + DOBLOCK(h1, k1); + } + break; + case 3: /* c=[210-] w=[6543] b=[3210]=c>>8|w<<24 c'=[654-] */ + for( ; ptr < end ; ptr+=4) { + uint32_t k1 = c>>8; + c = READ_UINT32(ptr); + k1 |= c<<24; + DOBLOCK(h1, k1); + } + } +#endif /*UNALIGNED_SAFE*/ + + /* Advance over whole 32-bit chunks, possibly leaving 1..3 bytes */ + len -= len/4*4; + + /* Append any remaining bytes into carry */ + DOBYTES(len, h1, c, n, ptr, len); + + /* Copy out new running hash and carry */ + *ph1 = h1; + *pcarry = (c & 
~0xff) | n; +} + +/*---------------------------------------------------------------------------*/ + +/* Finalize a hash. To match the original Murmur3A the total_length must be provided */ +uint32_t PMurHash32_Result(uint32_t h, uint32_t carry, uint32_t total_length) +{ + uint32_t k1; + int n = carry & 3; + if(n) { + k1 = carry >> (4-n)*8; + k1 *= C1; k1 = ROTL32(k1,15); k1 *= C2; h ^= k1; + } + h ^= total_length; + + /* fmix */ + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + h ^= h >> 16; + + return h; +} + +/*---------------------------------------------------------------------------*/ + +/* Murmur3A compatible all-at-once */ +uint32_t PMurHash32(uint32_t seed, const void *key, int len) +{ + uint32_t h1=seed, carry=0; + PMurHash32_Process(&h1, &carry, key, len); + return PMurHash32_Result(h1, carry, len); +} + +/*---------------------------------------------------------------------------*/ + +/* Provide an API suitable for smhasher */ +void PMurHash32_test(const void *key, int len, uint32_t seed, void *out) +{ + uint32_t h1=seed, carry=0; + const uint8_t *ptr = (uint8_t*)key; + const uint8_t *end = ptr + len; + +#if 0 /* Exercise the progressive processing */ + while(ptr < end) { + //const uint8_t *mid = ptr + rand()%(end-ptr)+1; + const uint8_t *mid = ptr + (rand()&0xF); + mid = mid<end?mid:end; + PMurHash32_Process(&h1, &carry, ptr, mid-ptr); + ptr = mid; + } +#else + PMurHash32_Process(&h1, &carry, ptr, (int)(end-ptr)); +#endif + h1 = PMurHash32_Result(h1, carry, len); + *(uint32_t*)out = h1; +} + +/*---------------------------------------------------------------------------*/ diff --git a/third_party/PMurHash.h b/third_party/PMurHash.h new file mode 100644 index 0000000000..28ead00a7d --- /dev/null +++ b/third_party/PMurHash.h @@ -0,0 +1,64 @@ +/*----------------------------------------------------------------------------- + * MurmurHash3 was written by Austin Appleby, and is placed in the public + * domain.
+ * + * This implementation was written by Shane Day, and is also public domain. + * + * This is a portable ANSI C implementation of MurmurHash3_x86_32 (Murmur3A) + * with support for progressive processing. + */ + +/* ------------------------------------------------------------------------- */ +/* Determine what native type to use for uint32_t */ + +/* We can't use the name 'uint32_t' here because it will conflict with + * any version provided by the system headers or application. */ + +/* First look for special cases */ +#if defined(_MSC_VER) + #define MH_UINT32 unsigned long +#endif + +/* If the compiler says it's C99 then take its word for it */ +#if !defined(MH_UINT32) && ( \ + defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L ) + #include <stdint.h> + #define MH_UINT32 uint32_t +#endif + +/* Otherwise try testing against max value macros from limits.h */ +#if !defined(MH_UINT32) + #include <limits.h> + #if (USHRT_MAX == 0xffffffffUL) + #define MH_UINT32 unsigned short + #elif (UINT_MAX == 0xffffffffUL) + #define MH_UINT32 unsigned int + #elif (ULONG_MAX == 0xffffffffUL) + #define MH_UINT32 unsigned long + #endif +#endif + +#if !defined(MH_UINT32) + #error Unable to determine type name for unsigned 32-bit int +#endif + +/* I'm yet to work on a platform where 'unsigned char' is not 8 bits */ +#define MH_UINT8 unsigned char + + +/* ------------------------------------------------------------------------- */ +/* Prototypes */ + +#ifdef __cplusplus +extern "C" { +#endif + +void PMurHash32_Process(MH_UINT32 *ph1, MH_UINT32 *pcarry, const void *key, int len); +MH_UINT32 PMurHash32_Result(MH_UINT32 h1, MH_UINT32 carry, MH_UINT32 total_length); +MH_UINT32 PMurHash32(MH_UINT32 seed, const void *key, int len); + +void PMurHash32_test(const void *key, int len, MH_UINT32 seed, void *out); + +#ifdef __cplusplus +} +#endif diff --git a/third_party/README b/third_party/README index 796b612ae2..57b6c06af4 100644 --- a/third_party/README +++ b/third_party/README @@ -50,3
+50,9 @@ How to update rb.h ====================== Get the header from git://canonware.com/jemalloc.git + +How to update murmur hash +========================= + +wget http://smhasher.googlecode.com/svn/trunk/PMurHash.c -O PMurHash.c +wget http://smhasher.googlecode.com/svn/trunk/PMurHash.h -O PMurHash.h diff --git a/third_party/murmur_hash2.c b/third_party/murmur_hash2.c deleted file mode 100644 index 6c64bace58..0000000000 --- a/third_party/murmur_hash2.c +++ /dev/null @@ -1,64 +0,0 @@ -//----------------------------------------------------------------------------- -// MurmurHash2, by Austin Appleby - -// Note - This code makes a few assumptions about how your machine behaves - - -// 1. We can read a 4-byte value from any address without crashing -// 2. sizeof(int) == 4 - -// And it has a few limitations - - -// 1. It will not work incrementally. -// 2. It will not produce the same results on little-endian and big-endian -// machines. - -static inline unsigned int MurmurHash2 ( const void * key, int len, unsigned int seed ) -{ - // 'm' and 'r' are mixing constants generated offline. - // They're not really 'magic', they just happen to work well. - - const unsigned int m = 0x5bd1e995; - const int r = 24; - - // Initialize the hash to a 'random' value - - unsigned int h = seed ^ len; - - // Mix 4 bytes at a time into the hash - - const unsigned char * data = (const unsigned char *)key; - - while(len >= 4) - { - unsigned int k = *(unsigned int *)data; - - k *= m; - k ^= k >> r; - k *= m; - - h *= m; - h ^= k; - - data += 4; - len -= 4; - } - - // Handle the last few bytes of the input array - - switch(len) - { - case 3: h ^= data[2] << 16; - case 2: h ^= data[1] << 8; - case 1: h ^= data[0]; - h *= m; - }; - - // Do a few final mixes of the hash to ensure the last few - // bytes are well-incorporated. - - h ^= h >> 13; - h *= m; - h ^= h >> 15; - - return h; -} -- GitLab