From f998ea39e96d93113823d92727a1faf9860c8ea6 Mon Sep 17 00:00:00 2001
From: Oleg Babin <babinoleg@mail.ru>
Date: Fri, 4 Dec 2020 00:06:38 +0300
Subject: [PATCH] digest: introduce FFI bindings for xxHash32/64

This patch introduces new hash types for the digest module:
xxHash32 and xxHash64.

Closes #2003

@TarantoolBot document
Title: digest module supports xxHash32/64

```lua
-- Examples below demonstrate xxHash32.
-- xxHash64 has exactly the same interface

-- Calculate the 32-bit hash (default seed is 0).
digest.xxhash32(string[, seed])

-- Streaming
-- Start a new hash by initializing state with a seed.
-- If no value is provided, 0 is used as the default.
xxhash = digest.xxhash32.new([seed])
-- Reset the hash state. It's also possible to specify a seed
-- manually. If no value is provided, the value initially passed
-- to "new" is used. Here and below "seed" is expected to be an
-- unsigned number. The function returns nothing.
xxhash:clear([seed])
-- Feed the hash state by calling "update" as many times as
-- necessary. Function returns nothing.
xxhash:update('string')
-- Produce a hash value.
xxhash:result()
-- Create a new hash object holding a copy of the current state.
xxhash:copy()
```
---
 .../unreleased/add-xxhash-to-digest-module.md |   4 +
 src/exports.h                                 |  10 +
 src/lua/digest.lua                            | 113 ++++++++++
 test/app/digest.result                        | 195 +++++++++++++++++-
 test/app/digest.test.lua                      |  65 +++++-
 5 files changed, 378 insertions(+), 9 deletions(-)
 create mode 100644 changelogs/unreleased/add-xxhash-to-digest-module.md

diff --git a/changelogs/unreleased/add-xxhash-to-digest-module.md b/changelogs/unreleased/add-xxhash-to-digest-module.md
new file mode 100644
index 0000000000..f417ad84b3
--- /dev/null
+++ b/changelogs/unreleased/add-xxhash-to-digest-module.md
@@ -0,0 +1,4 @@
+## feature/lua/digest
+
+ * Introduce new hash types in digest module - `xxhash32` and `xxhash64`
+   (gh-2003).
diff --git a/src/exports.h b/src/exports.h
index 41357636af..a4f3833cc6 100644
--- a/src/exports.h
+++ b/src/exports.h
@@ -521,3 +521,13 @@ EXPORT(uri_format)
 EXPORT(uri_parse)
 EXPORT(uuid_nil)
 EXPORT(uuid_unpack)
+EXPORT(XXH32)
+EXPORT(XXH32_copyState)
+EXPORT(XXH32_digest)
+EXPORT(XXH32_reset)
+EXPORT(XXH32_update)
+EXPORT(XXH64)
+EXPORT(XXH64_copyState)
+EXPORT(XXH64_digest)
+EXPORT(XXH64_reset)
+EXPORT(XXH64_update)
diff --git a/src/lua/digest.lua b/src/lua/digest.lua
index 54a09c2b1a..12d0ee2ced 100644
--- a/src/lua/digest.lua
+++ b/src/lua/digest.lua
@@ -31,6 +31,50 @@ ffi.cdef[[
     void PMurHash32_Process(uint32_t *ph1, uint32_t *pcarry, const void *key, int len);
     uint32_t PMurHash32_Result(uint32_t h1, uint32_t carry, uint32_t total_length);
     uint32_t PMurHash32(uint32_t seed, const void *key, int len);
+
+    /* from third_party/zstd/lib/common/xxhash.c */
+    typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode;
+    struct XXH32_state_s {
+        unsigned total_len_32;
+        unsigned large_len;
+        unsigned v1;
+        unsigned v2;
+        unsigned v3;
+        unsigned v4;
+        unsigned mem32[4];   /* buffer defined as U32 for alignment */
+        unsigned memsize;
+        unsigned reserved;   /* never read nor write, will be removed in a future version */
+    };
+
+    struct XXH64_state_s {
+        unsigned long long total_len;
+        unsigned long long v1;
+        unsigned long long v2;
+        unsigned long long v3;
+        unsigned long long v4;
+        unsigned long long mem64[4];   /* buffer defined as U64 for alignment */
+        unsigned memsize;
+        unsigned reserved[2];          /* never read nor write, will be removed in a future version */
+    };
+
+    typedef unsigned int       XXH32_hash_t;
+    typedef unsigned long long XXH64_hash_t;
+    XXH32_hash_t XXH32 (const void* input, size_t length, unsigned int seed);
+    XXH64_hash_t XXH64 (const void* input, size_t length, unsigned long long seed);
+
+    typedef struct XXH32_state_s XXH32_state_t;
+    typedef struct XXH64_state_s XXH64_state_t;
+
+    XXH_errorcode XXH32_reset  (XXH32_state_t* statePtr, unsigned int seed);
+    XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length);
+    XXH32_hash_t  XXH32_digest (const XXH32_state_t* statePtr);
+
+    XXH_errorcode XXH64_reset  (XXH64_state_t* statePtr, unsigned long long seed);
+    XXH_errorcode XXH64_update (XXH64_state_t* statePtr, const void* input, size_t length);
+    XXH64_hash_t  XXH64_digest (const XXH64_state_t* statePtr);
+
+    void XXH32_copyState(XXH32_state_t* restrict dst_state, const XXH32_state_t* restrict src_state);
+    void XXH64_copyState(XXH64_state_t* restrict dst_state, const XXH64_state_t* restrict src_state);
 ]]
 
 local builtin = ffi.C
@@ -280,4 +324,73 @@ m['aes256cbc'] = {
     end
 }
 
+for _, var in ipairs({'32', '64'}) do
+    local xxHash
+
+    local xxh_template = 'XXH%s_%s'
+    local update_fn_name = string.format(xxh_template, var, 'update')
+    local digest_fn_name = string.format(xxh_template, var, 'digest')
+    local reset_fn_name = string.format(xxh_template, var, 'reset')
+    local copy_fn_name = string.format(xxh_template, var, 'copyState')
+
+    local function update(self, str)
+        if type(str) ~= 'string' then
+            local message = string.format("Usage xxhash%s:update(string)", var)
+            error(message, 2)
+        end
+        builtin[update_fn_name](self.value, str, #str)
+    end
+
+    local function result(self)
+        return builtin[digest_fn_name](self.value)
+    end
+
+    local function clear(self, seed)
+        if seed == nil then
+            seed = self.default_seed -- seed stored by new()
+        end
+        builtin[reset_fn_name](self.value, seed)
+    end
+
+    local function copy(self)
+        local copy = xxHash.new(self.default_seed) -- inherit seed for clear()
+        builtin[copy_fn_name](copy.value, self.value)
+        return copy
+    end
+
+    local state_type_name = string.format(xxh_template, var, 'state_t')
+    local XXH_state_t = ffi.typeof(state_type_name)
+
+    xxHash = {
+        new = function(seed)
+            local self = {
+                update = update,
+                result = result,
+                clear = clear,
+                copy = copy,
+                value = ffi.new(XXH_state_t),
+                default_seed = seed or 0,
+            }
+            self:clear(self.default_seed)
+            return self
+        end,
+    }
+
+    local call_fn_name = 'XXH' .. var
+    setmetatable(xxHash, {
+        __call = function(_, str, seed)
+            if type(str) ~= 'string' then
+                local message = string.format("Usage digest.xxhash%s(string[, unsigned number])", var)
+                error(message, 2)
+            end
+            if seed == nil then
+                seed = 0
+            end
+            return builtin[call_fn_name](str, #str, seed)
+        end,
+    })
+
+    m['xxhash' .. var] = xxHash
+end
+
 return m
diff --git a/test/app/digest.result b/test/app/digest.result
index d946c6a3bf..40e49ace5b 100644
--- a/test/app/digest.result
+++ b/test/app/digest.result
@@ -572,13 +572,6 @@ err:match("number")
 ---
 - number
 ...
-digest = nil
----
-...
-test_run:cmd("clear filter")
----
-- true
-...
 -- gh-3396: fiber-safe pbkdf2
 res = {}
 ---
@@ -636,3 +629,191 @@ res
   - bafac115a0022b2894f2983b5b5102455bdd3ba7cfbeb09f219a9fde8f3ee6a9
   - bafac115a0022b2894f2983b5b5102455bdd3ba7cfbeb09f219a9fde8f3ee6a9
 ...
+--
+-- gh-2003 xxHash.
+--
+xxhash32 = digest.xxhash32.new()
+---
+...
+xxhash32:result()
+---
+- 46947589
+...
+xxhash64 = digest.xxhash64.new()
+---
+...
+xxhash64:result()
+---
+- 17241709254077376921ULL
+...
+-- New takes seed optionally.
+digest.xxhash32.new(1):result()
+---
+- 187479954
+...
+digest.xxhash64.new(1):result()
+---
+- 15397730242686860875ULL
+...
+-- String is expected as input value.
+digest.xxhash32(1)
+---
+- error: Usage digest.xxhash32(string[, unsigned number])
+...
+digest.xxhash64(1)
+---
+- error: Usage digest.xxhash64(string[, unsigned number])
+...
+digest.xxhash32.new():update(1)
+---
+- error: Usage xxhash32:update(string)
+...
+digest.xxhash64.new():update(1)
+---
+- error: Usage xxhash64:update(string)
+...
+-- Seed is an optional second argument (default = 0).
+digest.xxhash32('12345')
+---
+- 3003995828
+...
+digest.xxhash32('12345', 0)
+---
+- 3003995828
+...
+digest.xxhash32('12345', 1)
+---
+- 2544060598
+...
+xxhash32:result()
+---
+- 46947589
+...
+xxhash32:clear(1)
+---
+...
+xxhash32:result()
+---
+- 187479954
+...
+xxhash32:update('123')
+---
+...
+xxhash32:result()
+---
+- 2569538424
+...
+xxhash32:update('45')
+---
+...
+xxhash32:result()
+---
+- 2544060598
+...
+xxhash32:clear()
+---
+...
+xxhash32:result()
+---
+- 46947589
+...
+xxhash32_copy = xxhash32:copy()
+---
+...
+xxhash32_copy:result()
+---
+- 46947589
+...
+xxhash32_copy ~= xxhash32
+---
+- true
+...
+xxhash32_copy:clear(1ULL)
+---
+...
+xxhash32_copy:result()
+---
+- 187479954
+...
+xxhash32 = nil
+---
+...
+xxhash32_copy = nil
+---
+...
+-- Seed is an optional second argument (default = 0).
+digest.xxhash64('12345')
+---
+- 14335752410685132726ULL
+...
+digest.xxhash64('12345', 0)
+---
+- 14335752410685132726ULL
+...
+digest.xxhash64('12345', 1)
+---
+- 10037897083593476069ULL
+...
+xxhash64:result()
+---
+- 17241709254077376921ULL
+...
+xxhash64:clear(1)
+---
+...
+xxhash64:result()
+---
+- 15397730242686860875ULL
+...
+xxhash64:update('123')
+---
+...
+xxhash64:result()
+---
+- 5440451180712653975ULL
+...
+xxhash64:update('45')
+---
+...
+xxhash64:result()
+---
+- 10037897083593476069ULL
+...
+xxhash64:clear()
+---
+...
+xxhash64:result()
+---
+- 17241709254077376921ULL
+...
+xxhash64_copy = xxhash64:copy()
+---
+...
+xxhash64_copy:result()
+---
+- 17241709254077376921ULL
+...
+xxhash64_copy ~= xxhash64
+---
+- true
+...
+xxhash64_copy:clear(1ULL)
+---
+...
+xxhash64_copy:result()
+---
+- 15397730242686860875ULL
+...
+xxhash64 = nil
+---
+...
+xxhash64_copy = nil
+---
+...
+test_run:cmd("clear filter")
+---
+- true
+...
+digest = nil
+---
+...
diff --git a/test/app/digest.test.lua b/test/app/digest.test.lua
index 7ecda91bcb..d2cc9d39cd 100644
--- a/test/app/digest.test.lua
+++ b/test/app/digest.test.lua
@@ -183,8 +183,6 @@ err:match("Usage")
 s, err = pcall(digest.pbkdf2_hex, "password", "salt", "lol", "lol")
 s
 err:match("number")
-digest = nil
-test_run:cmd("clear filter")
 
 -- gh-3396: fiber-safe pbkdf2
 res = {}
@@ -203,3 +201,66 @@ _ = fiber.create(test_pbkdf2)
 _ = sentry:get()
 _ = sentry:get()
 res
+
+--
+-- gh-2003 xxHash.
+--
+xxhash32 = digest.xxhash32.new()
+xxhash32:result()
+xxhash64 = digest.xxhash64.new()
+xxhash64:result()
+
+-- New takes seed optionally.
+digest.xxhash32.new(1):result()
+digest.xxhash64.new(1):result()
+
+-- String is expected as input value.
+digest.xxhash32(1)
+digest.xxhash64(1)
+digest.xxhash32.new():update(1)
+digest.xxhash64.new():update(1)
+
+-- Seed is an optional second argument (default = 0).
+digest.xxhash32('12345')
+digest.xxhash32('12345', 0)
+digest.xxhash32('12345', 1)
+xxhash32:result()
+xxhash32:clear(1)
+xxhash32:result()
+xxhash32:update('123')
+xxhash32:result()
+xxhash32:update('45')
+xxhash32:result()
+xxhash32:clear()
+xxhash32:result()
+xxhash32_copy = xxhash32:copy()
+xxhash32_copy:result()
+xxhash32_copy ~= xxhash32
+xxhash32_copy:clear(1ULL)
+xxhash32_copy:result()
+xxhash32 = nil
+xxhash32_copy = nil
+
+-- Seed is an optional second argument (default = 0).
+digest.xxhash64('12345')
+digest.xxhash64('12345', 0)
+digest.xxhash64('12345', 1)
+xxhash64:result()
+xxhash64:clear(1)
+xxhash64:result()
+xxhash64:update('123')
+xxhash64:result()
+xxhash64:update('45')
+xxhash64:result()
+xxhash64:clear()
+xxhash64:result()
+xxhash64_copy = xxhash64:copy()
+xxhash64_copy:result()
+xxhash64_copy ~= xxhash64
+xxhash64_copy:clear(1ULL)
+xxhash64_copy:result()
+xxhash64 = nil
+xxhash64_copy = nil
+
+test_run:cmd("clear filter")
+digest = nil
-- 
GitLab