From aff6235cf087412f7bfd290898bace4bb4180ecd Mon Sep 17 00:00:00 2001
From: GeorgyKirichenko <kirichenkoga@gmail.com>
Date: Wed, 28 Jun 2017 01:36:40 +0300
Subject: [PATCH] Add iconv support

Iconv is a library to convert a sequence of characters in one
character encoding to a sequence of characters in another character
encoding. The example below converts a UTF-8 string into a UTF-16
big-endian string (arguments are: target encoding, source encoding):

    convertor = require('iconv').new('UTF-16BE', 'UTF-8')
    converted_string = convertor(source_string)

Closes #2587
---
 src/CMakeLists.txt          |  1 +
 src/lua/iconv.lua           | 87 +++++++++++++++++++++++++++++++++++++
 src/lua/init.c              |  2 +
 test/app-tap/iconv.test.lua | 49 +++++++++++++++++++++
 4 files changed, 139 insertions(+)
 create mode 100644 src/lua/iconv.lua
 create mode 100755 test/app-tap/iconv.test.lua

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index f99a747e09..7ea7394a3d 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -45,6 +45,7 @@ lua_source(lua_sources lua/trigger.lua)
 lua_source(lua_sources lua/table.lua)
 lua_source(lua_sources ../third_party/luafun/fun.lua)
 lua_source(lua_sources lua/httpc.lua)
+lua_source(lua_sources lua/iconv.lua)
 # LuaJIT jit.* library
 lua_source(lua_sources "${CMAKE_BINARY_DIR}/third_party/luajit/src/jit/bc.lua")
 lua_source(lua_sources "${CMAKE_BINARY_DIR}/third_party/luajit/src/jit/bcsave.lua")
diff --git a/src/lua/iconv.lua b/src/lua/iconv.lua
new file mode 100644
index 0000000000..9edd0aa12d
--- /dev/null
+++ b/src/lua/iconv.lua
@@ -0,0 +1,116 @@
+local ffi    = require('ffi')
+local errno  = require('errno')
+local buffer = require('buffer')
+
+ffi.cdef[[
+typedef struct iconv *iconv_t;
+iconv_t iconv_open(const char *tocode, const char *fromcode);
+void    iconv_close(iconv_t cd);
+size_t  iconv(iconv_t cd, const char **inbuf, size_t *inbytesleft,
+              char **outbuf, size_t *outbytesleft);
+]]
+
+local iconv_t         = ffi.typeof('struct iconv')
+local char_ptr_arr_t  = ffi.typeof('char *[1]')
+local cchar_ptr_arr_t = ffi.typeof('const char *[1]')
+local cchar_ptr_t     = ffi.typeof('const char *')
+local size_t_arr_t    = ffi.typeof('size_t [1]')
+
+local E2BIG    = errno['E2BIG']
+local EINVAL   = errno['EINVAL']
+local EILSEQ   = errno['EILSEQ']
+-- Minimum number of output bytes reserved for each iconv() call.
+local BUF_SIZE = 64
+
+-- Value compared against the result of iconv_open() to detect failure.
+local conv_rv_error = ffi.cast('void *', -1)
+
+--- Convert a string using a conversion descriptor.
+-- Installed as the __call metamethod, so it runs as converter(data).
+-- @param iconv conversion descriptor created by iconv_new()
+-- @param data  string to convert
+-- @return converted string
+-- Raises an error on wrong argument types and on any iconv() failure
+-- other than E2BIG.
+local function iconv_convert(iconv, data)
+    if not ffi.istype(iconv_t, iconv) or type(data) ~= 'string' then
+        error("Usage: iconv:convert(data: string)")
+    end
+    local data_len   = data:len()
+    local data_ptr   = cchar_ptr_arr_t(cchar_ptr_t(data))
+    local data_left  = size_t_arr_t(data_len)
+
+    -- reserve max(data_len, BUF_SIZE) bytes in the shared buffer on
+    -- every iteration
+    local output_len = data_len >= BUF_SIZE and data_len or BUF_SIZE
+    local buf      = buffer.IBUF_SHARED
+    local buf_ptr  = char_ptr_arr_t()
+    local buf_left = size_t_arr_t()
+    buf:reset()
+
+    while data_left[0] > 0 do
+        buf_ptr[0]  = buf:reserve(output_len)
+        buf_left[0] = buf:unused()
+        local res = ffi.C.iconv(iconv, data_ptr, data_left,
+                                buf_ptr, buf_left)
+        if res == -1 then
+            -- remember errno now: the reset call below may change it
+            local err = errno()
+            -- E2BIG only means the reserved chunk is full; keep looping
+            if err ~= E2BIG then
+                ffi.C.iconv(iconv, nil, nil, nil, nil)
+                -- NOTE(review): POSIX describes EILSEQ as an invalid
+                -- sequence and EINVAL as an incomplete one, so the two
+                -- texts below look swapped; they are left unchanged
+                -- because callers may match on the message.
+                if err == EINVAL then
+                    error('Invalid multibyte sequence')
+                end
+                if err == EILSEQ then
+                    error('Incomplete multibyte sequence')
+                end
+                error('Unknown conversion error: ' .. errno.strerror(err))
+            end
+        end
+        -- commit the bytes iconv() wrote into the reserved area
+        buf:alloc(buf:unused() - buf_left[0])
+    end
+
+    -- iconv function sets cd's conversion state to the initial state
+    ffi.C.iconv(iconv, nil, nil, nil, nil)
+    local result = ffi.string(buf.rpos, buf:size())
+    buf:reset()
+    return result
+end
+
+local iconv_mt = {
+    __call = iconv_convert,
+    -- no __gc here: LuaJIT applies a metatype __gc only to struct
+    -- instances it creates itself, not to pointers returned from C;
+    -- iconv_new() attaches the finalizer with ffi.gc() instead
+    __tostring = function(iconv) return string.format("iconv: %p", iconv) end
+}
+
+ffi.metatype(iconv_t, iconv_mt)
+
+--- Create a conversion descriptor.
+-- @param to   name of the target encoding
+-- @param from name of the source encoding
+-- @return callable descriptor: descriptor(str) returns str converted
+--         from `from` to `to`
+-- Raises an error on non-string arguments or when iconv_open() fails.
+local function iconv_new(to, from)
+    if type(to) ~= 'string' or type(from) ~= 'string' then
+        error('Usage: iconv.new("CP1251", "KOI8-R")')
+    end
+    local iconv = ffi.C.iconv_open(to, from)
+    if iconv == conv_rv_error then
+        error('iconv: '..errno.strerror())
+    end
+    -- close the descriptor when the cdata object is garbage collected
+    return ffi.gc(iconv, ffi.C.iconv_close)
+end
+
+return {
+    new = iconv_new,
+}
diff --git a/src/lua/init.c b/src/lua/init.c
index 5681f03029..4f6842a232 100644
--- a/src/lua/init.c
+++ b/src/lua/init.c
@@ -93,6 +93,7 @@ extern char strict_lua[],
 	tap_lua[],
 	fio_lua[],
 	argparse_lua[],
+	iconv_lua[],
 	/* jit.* library */
 	vmdef_lua[],
 	bc_lua[],
@@ -141,6 +142,7 @@ static const char *lua_modules[] = {
 	"internal.trigger", trigger_lua,
 	"pwd", pwd_lua,
 	"http.client", httpc_lua,
+	"iconv", iconv_lua,
 	/* jit.* library */
 	"jit.vmdef", vmdef_lua,
 	"jit.bc", bc_lua,
diff --git a/test/app-tap/iconv.test.lua b/test/app-tap/iconv.test.lua
new file mode 100755
index 0000000000..6f6a04b140
--- /dev/null
+++ b/test/app-tap/iconv.test.lua
@@ -0,0 +1,54 @@
+#!/usr/bin/env tarantool
+
+-- TAP checks for the iconv module: round trips and error reporting.
+
+local tap   = require('tap')
+local iconv = require('iconv')
+
+local test = tap.test("iconv")
+test:plan(11)
+
+local simple_str  = 'ascii string'
+local cyrillic_str = 'русский текст'
+
+-- iconv.new(to, from): c_ascii_8 turns UTF-8 input into ASCII
+local c_ascii_8 = iconv.new('ASCII', 'UTF-8')
+local c_8_ascii = iconv.new('UTF-8', 'ASCII')
+
+test:is(c_ascii_8(simple_str), simple_str, 'check ascii->utf8 on simple string')
+test:is(c_8_ascii(simple_str), simple_str, 'check utf8->ascii on simple string')
+
+local c16be_8 = iconv.new('UTF-16BE', 'UTF-8')
+local c8_16be = iconv.new('UTF-8', 'UTF-16BE')
+test:is(c16be_8(c8_16be(simple_str)),  simple_str,
+        'UTF conversion with ascii string')
+test:is(c8_16be(c16be_8(cyrillic_str)), cyrillic_str,
+        'UTF conversion with non-ascii symbols')
+
+local c16_16be = iconv.new('UTF-16', 'UTF-16BE')
+local c1251_16 = iconv.new('WINDOWS-1251', 'UTF-16')
+local c8_1251  = iconv.new('UTF-8', 'WINDOWS-1251')
+
+test:is(c8_16be(c16be_8(cyrillic_str)), cyrillic_str,
+        'UTF conversion with non-ascii symbols')
+
+-- test complex converting path
+test:is(c8_1251(c1251_16(c16_16be(c16be_8(cyrillic_str)))), cyrillic_str,
+        'complex multi-format conversion')
+
+-- test huge string
+local huge_str = string.rep(cyrillic_str, 50)
+
+test:is(c16be_8(c8_16be(huge_str)), huge_str, "huge string")
+
+-- an unknown encoding name must make iconv.new() raise
+local stat, err = pcall(iconv.new, 'NOT EXISTS', 'UTF-8')
+test:is(stat, false, 'error was thrown on bad encoding')
+test:ok(err:match('Invalid') ~= nil, 'correct error')
+
+-- cyrillic text has no ASCII representation, so conversion must raise
+stat, err = pcall(c_ascii_8, cyrillic_str)
+test:is(stat, false, 'error was thrown on sequence')
+test:ok(err:match('Incomplete multibyte sequence') ~= nil, 'correct error')
+
+os.exit(test:check() == true and 0 or 1)
-- 
GitLab