From a2c19aa812754451f6c828ce074bf7443847496e Mon Sep 17 00:00:00 2001
From: Vladislav Shpilevoy <v.shpilevoy@tarantool.org>
Date: Sun, 19 May 2019 01:00:38 +0300
Subject: [PATCH] buffer: replace all ffi.new(type[1]) with cached union

Lua, which suffers from lack of ability to pass values by
pointers into FFI functions, nor has an address operator '&' to
take an address of integer or char or anything. Because of that
a user need to either use ffi.new(type[1]) or use static buffer,
but for such small allocations they are both too expensive and
aggravate GC problem.

Now buffer module provides preallocated basic types to use in FFI
functions. The commit is motivated by one another place where
ffi.new('int[1]') appeared, in SWIM module, to obtain payload
size as an out parameter of a C function.
---
 src/lua/buffer.lua       | 42 +++++++++++++++++++++++++++++++++
 src/lua/crypto.lua       | 13 +++++------
 src/lua/msgpackffi.lua   | 50 ++++++++++++++++------------------------
 src/lua/socket.lua       | 25 ++++++++++++--------
 src/lua/string.lua       |  4 ++--
 test/app/buffer.result   | 27 ++++++++++++++++++++++
 test/app/buffer.test.lua | 11 +++++++++
 7 files changed, 123 insertions(+), 49 deletions(-)

diff --git a/src/lua/buffer.lua b/src/lua/buffer.lua
index 503f753c26..ef0118c7c4 100644
--- a/src/lua/buffer.lua
+++ b/src/lua/buffer.lua
@@ -36,6 +36,34 @@ ibuf_reserve_slow(struct ibuf *ibuf, size_t size);
 
 void *
 lua_static_aligned_alloc(size_t size, size_t alignment);
+
+/**
+ * Register is a buffer to use with FFI functions, which usually
+ * operate with pointers to scalar values like int, char, size_t,
+ * void *. To avoid doing 'ffi.new(<type>[1])' on each such FFI
+ * function invocation, a module can use one of attributes of the
+ * register union.
+ *
+ * Naming policy of the attributes is easy to remember:
+ * 'a' for array type + type name first letters + 'p' for pointer.
+ *
+ * For example:
+ * - int[1] - <a>rray of <i>nt - ai;
+ * - const unsigned char *[1] -
+ *       <a>rray of <c>onst <u>nsigned <c>har <p> pointer - acucp.
+ */
+union c_register {
+    size_t as[1];
+    void *ap[1];
+    int ai[1];
+    char ac[1];
+    const unsigned char *acucp[1];
+    unsigned long aul[1];
+    uint16_t u16;
+    uint32_t u32;
+    uint64_t u64;
+    int64_t i64;
+};
 ]]
 
 local builtin = ffi.C
@@ -205,9 +233,23 @@ local function static_alloc(type, size)
     return ffi.new(type..'[?]', size)
 end
 
+--
+-- Sometimes it is wanted to use several temporary registers at
+-- once. For example, when a C function takes 2 C pointers. Then
+-- one register is not enough - its attributes share memory. Note,
+-- registers are not allocated with separate ffi.new calls
+-- deliberately. With a single allocation they fit into 1 cache
+-- line, and reduce the heap fragmentation.
+--
+local reg_array = ffi.new('union c_register[?]', 2)
+
 return {
     ibuf = ibuf_new;
     IBUF_SHARED = ffi.C.tarantool_lua_ibuf;
     READAHEAD = READAHEAD;
     static_alloc = static_alloc,
+    -- Keep reference.
+    reg_array = reg_array,
+    reg1 = reg_array[0],
+    reg2 = reg_array[1]
 }
diff --git a/src/lua/crypto.lua b/src/lua/crypto.lua
index cb37a73fbe..d23ba8e3a7 100644
--- a/src/lua/crypto.lua
+++ b/src/lua/crypto.lua
@@ -2,6 +2,7 @@
 
 local ffi = require('ffi')
 local buffer = require('buffer')
+local reg = buffer.reg1
 
 ffi.cdef[[
     /* from openssl/err.h */
@@ -102,7 +103,6 @@ local function digest_new(digest)
         digest = digest,
         buf = buffer.ibuf(64),
         initialized = false,
-        outl = ffi.new('int[1]')
     }, digest_mt)
     self:init()
     return self
@@ -132,10 +132,10 @@ local function digest_final(self)
         return error('Digest not initialized')
     end
     self.initialized = false
-    if ffi.C.EVP_DigestFinal_ex(self.ctx, self.buf.wpos, self.outl) ~= 1 then
+    if ffi.C.EVP_DigestFinal_ex(self.ctx, self.buf.wpos, reg.ai) ~= 1 then
         return error('Can\'t finalize digest: ' .. openssl_err_str())
     end
-    return ffi.string(self.buf.wpos, self.outl[0])
+    return ffi.string(self.buf.wpos, reg.ai[0])
 end
 
 local function digest_free(self)
@@ -174,9 +174,7 @@ local function hmac_new(digest, key)
     local self = setmetatable({
         ctx = ctx,
         digest = digest,
-        buf = buffer.ibuf(64),
         initialized = false,
-        outl = ffi.new('int[1]')
     }, hmac_mt)
     self:init(key)
     return self
@@ -206,10 +204,11 @@ local function hmac_final(self)
         return error('HMAC not initialized')
     end
     self.initialized = false
-    if ffi.C.HMAC_Final(self.ctx, self.buf.wpos, self.outl) ~= 1 then
+    local buf = buffer.static_alloc('char', 64)
+    if ffi.C.HMAC_Final(self.ctx, buf, reg.ai) ~= 1 then
         return error('Can\'t finalize HMAC: ' .. openssl_err_str())
     end
-    return ffi.string(self.buf.wpos, self.outl[0])
+    return ffi.string(buf, reg.ai[0])
 end
 
 local function hmac_free(self)
diff --git a/src/lua/msgpackffi.lua b/src/lua/msgpackffi.lua
index 609334cffe..bfeedbc4b8 100644
--- a/src/lua/msgpackffi.lua
+++ b/src/lua/msgpackffi.lua
@@ -21,19 +21,10 @@ float
 mp_decode_float(const char **data);
 double
 mp_decode_double(const char **data);
-union tmpint {
-    uint16_t u16;
-    uint32_t u32;
-    uint64_t u64;
-};
 ]])
 
 local strict_alignment = (jit.arch == 'arm')
-
-local tmpint
-if strict_alignment then
-   tmpint = ffi.new('union tmpint[1]')
-end
+local reg = buffer.reg1
 
 local function bswap_u16(num)
     return bit.rshift(bit.bswap(tonumber(num)), 16)
@@ -70,10 +61,10 @@ end
 local encode_u16
 if strict_alignment then
     encode_u16 = function(buf, code, num)
-        tmpint[0].u16 = bswap_u16(num)
+        reg.u16 = bswap_u16(num)
         local p = buf:alloc(3)
         p[0] = code
-        ffi.copy(p + 1, tmpint, 2)
+        ffi.copy(p + 1, reg, 2)
     end
 else
     encode_u16 = function(buf, code, num)
@@ -86,11 +77,10 @@ end
 local encode_u32
 if strict_alignment then
     encode_u32 = function(buf, code, num)
-        tmpint[0].u32 =
-            ffi.cast('uint32_t', bit.bswap(tonumber(num)))
+        reg.u32 = ffi.cast('uint32_t', bit.bswap(tonumber(num)))
         local p = buf:alloc(5)
         p[0] = code
-        ffi.copy(p + 1, tmpint, 4)
+        ffi.copy(p + 1, reg, 4)
     end
 else
     encode_u32 = function(buf, code, num)
@@ -104,10 +94,10 @@ end
 local encode_u64
 if strict_alignment then
     encode_u64 = function(buf, code, num)
-        tmpint[0].u64 = bit.bswap(ffi.cast('uint64_t', num))
+        reg.u64 = bit.bswap(ffi.cast('uint64_t', num))
         local p = buf:alloc(9)
         p[0] = code
-        ffi.copy(p + 1, tmpint, 8)
+        ffi.copy(p + 1, reg, 8)
     end
 else
     encode_u64 = function(buf, code, num)
@@ -324,9 +314,9 @@ end
 local decode_u16
 if strict_alignment then
     decode_u16 = function(data)
-        ffi.copy(tmpint, data[0], 2)
+        ffi.copy(reg, data[0], 2)
         data[0] = data[0] + 2
-        return tonumber(bswap_u16(tmpint[0].u16))
+        return tonumber(bswap_u16(reg.u16))
     end
 else
     decode_u16 = function(data)
@@ -339,10 +329,10 @@ end
 local decode_u32
 if strict_alignment then
     decode_u32 = function(data)
-        ffi.copy(tmpint, data[0], 4)
+        ffi.copy(reg, data[0], 4)
         data[0] = data[0] + 4
         return tonumber(
-            ffi.cast('uint32_t', bit.bswap(tonumber(tmpint[0].u32))))
+            ffi.cast('uint32_t', bit.bswap(tonumber(reg.u32))))
     end
 else
     decode_u32 = function(data)
@@ -356,9 +346,9 @@ end
 local decode_u64
 if strict_alignment then
     decode_u64 = function(data)
-        ffi.copy(tmpint, data[0], 8);
+        ffi.copy(reg, data[0], 8);
         data[0] = data[0] + 8
-        local num = bit.bswap(tmpint[0].u64)
+        local num = bit.bswap(reg.u64)
         if num <= DBL_INT_MAX then
             return tonumber(num) -- return as 'number'
         end
@@ -385,8 +375,8 @@ end
 local decode_i16
 if strict_alignment then
     decode_i16 = function(data)
-        ffi.copy(tmpint, data[0], 2)
-        local num = bswap_u16(tmpint[0].u16)
+        ffi.copy(reg, data[0], 2)
+        local num = bswap_u16(reg.u16)
         data[0] = data[0] + 2
         -- note: this double cast is actually necessary
         return tonumber(ffi.cast('int16_t', ffi.cast('uint16_t', num)))
@@ -403,8 +393,8 @@ end
 local decode_i32
 if strict_alignment then
     decode_i32 = function(data)
-        ffi.copy(tmpint, data[0], 4)
-        local num = bit.bswap(tonumber(tmpint[0].u32))
+        ffi.copy(reg, data[0], 4)
+        local num = bit.bswap(tonumber(reg.u32))
         data[0] = data[0] + 4
         return num
     end
@@ -419,9 +409,9 @@ end
 local decode_i64
 if strict_alignment then
     decode_i64 = function(data)
-        ffi.copy(tmpint, data[0], 8)
+        ffi.copy(reg, data[0], 8)
         data[0] = data[0] + 8
-        local num = bit.bswap(ffi.cast('int64_t', tmpint[0].u64))
+        local num = bit.bswap(reg.i64)
         if num >= -DBL_INT_MAX and num <= DBL_INT_MAX then
             return tonumber(num) -- return as 'number'
         end
@@ -552,7 +542,7 @@ end
 -- element. It is significally faster on LuaJIT to use double pointer than
 -- return result, newpos.
 --
-local bufp = ffi.new('const unsigned char *[1]');
+local bufp = reg.acucp;
 
 local function check_offset(offset, len)
     if offset == nil then
diff --git a/src/lua/socket.lua b/src/lua/socket.lua
index 2bb5712ebf..2dba0a8d2a 100644
--- a/src/lua/socket.lua
+++ b/src/lua/socket.lua
@@ -10,6 +10,8 @@ local fiber = require('fiber')
 local fio = require('fio')
 local log = require('log')
 local buffer = require('buffer')
+local reg1 = buffer.reg1
+local reg2 = buffer.reg2
 local static_alloc = buffer.static_alloc
 
 local format = string.format
@@ -473,9 +475,9 @@ local function socket_setsockopt(self, level, name, value)
     end
 
     if info.type == 1 then
-        local value = ffi.new("int[1]", value)
+        reg1.ai[0] = value
         local res = ffi.C.setsockopt(fd,
-            level, info.iname, value, ffi.sizeof('int'))
+            level, info.iname, reg1.ai, ffi.sizeof('int'))
 
         if res < 0 then
             self._errno = boxerrno()
@@ -517,8 +519,10 @@ local function socket_getsockopt(self, level, name)
     self._errno = nil
 
     if info.type == 1 then
-        local value = ffi.new("int[1]", 0)
-        local len = ffi.new("size_t[1]", ffi.sizeof('int'))
+        local value = reg1.ai
+        value[0] = 0
+        local len = reg2.as
+        len[0] = ffi.sizeof('int')
         local res = ffi.C.getsockopt(fd, level, info.iname, value, len)
 
         if res < 0 then
@@ -533,8 +537,9 @@ local function socket_getsockopt(self, level, name)
     end
 
     if info.type == 2 then
-        local value = ffi.new("char[256]", { 0 })
-        local len = ffi.new("size_t[1]", 256)
+        local value = static_alloc('char', 256)
+        local len = reg1.as
+        len[0] = 256
         local res = ffi.C.getsockopt(fd, level, info.iname, value, len)
         if res < 0 then
             self._errno = boxerrno()
@@ -556,8 +561,9 @@ local function socket_linger(self, active, timeout)
     local info = internal.SO_OPT[level].SO_LINGER
     self._errno = nil
     if active == nil then
-        local value = ffi.new("linger_t[1]")
-        local len = ffi.new("size_t[1]", 2 * ffi.sizeof('int'))
+        local value = static_alloc('linger_t')
+        local len = reg1.as
+        len[0] = ffi.sizeof('linger_t')
         local res = ffi.C.getsockopt(fd, level, info.iname, value, len)
         if res < 0 then
             self._errno = boxerrno()
@@ -799,8 +805,7 @@ local function get_recv_size(self, size)
             -- them using message peek.
             local iflags = get_iflags(internal.SEND_FLAGS, {'MSG_PEEK'})
             assert(iflags ~= nil)
-            local buf = ffi.new('char[?]', 1)
-            size = tonumber(ffi.C.recv(fd, buf, 1, iflags))
+            size = tonumber(ffi.C.recv(fd, reg1.ac, 1, iflags))
             -- Prevent race condition: proceed with the case when
             -- a datagram of length > 0 has been arrived after the
             -- getsockopt call above.
diff --git a/src/lua/string.lua b/src/lua/string.lua
index bb4adfc784..6e12c59ae7 100644
--- a/src/lua/string.lua
+++ b/src/lua/string.lua
@@ -15,8 +15,8 @@ ffi.cdef[[
 ]]
 
 local c_char_ptr     = ffi.typeof('const char *')
-local strip_newstart = ffi.new("unsigned long[1]")
-local strip_newlen   = ffi.new("unsigned long[1]")
+local strip_newstart = buffer.reg1.aul
+local strip_newlen   = buffer.reg2.aul
 
 local memcmp  = ffi.C.memcmp
 local memmem  = ffi.C.memmem
diff --git a/test/app/buffer.result b/test/app/buffer.result
index e0aad9bc7a..beccc5a877 100644
--- a/test/app/buffer.result
+++ b/test/app/buffer.result
@@ -7,6 +7,33 @@ buffer = require('buffer')
 ffi = require('ffi')
 ---
 ...
+-- Registers.
+reg1 = buffer.reg1
+---
+...
+reg1.u16 = 100
+---
+...
+u16 = ffi.new('uint16_t[1]')
+---
+...
+ffi.copy(u16, reg1, 2)
+---
+...
+u16[0]
+---
+- 100
+...
+u16[0] = 200
+---
+...
+ffi.copy(reg1, u16, 2)
+---
+...
+reg1.u16
+---
+- 200
+...
 -- Alignment.
 _ = buffer.static_alloc('char') -- This makes buffer pos unaligned.
 ---
diff --git a/test/app/buffer.test.lua b/test/app/buffer.test.lua
index ba7299f33f..a1c3806807 100644
--- a/test/app/buffer.test.lua
+++ b/test/app/buffer.test.lua
@@ -2,6 +2,17 @@ test_run = require('test_run').new()
 buffer = require('buffer')
 ffi = require('ffi')
 
+-- Registers.
+reg1 = buffer.reg1
+reg1.u16 = 100
+u16 = ffi.new('uint16_t[1]')
+ffi.copy(u16, reg1, 2)
+u16[0]
+
+u16[0] = 200
+ffi.copy(reg1, u16, 2)
+reg1.u16
+
 -- Alignment.
 _ = buffer.static_alloc('char') -- This makes buffer pos unaligned.
 p = buffer.static_alloc('int')
-- 
GitLab