From aff6235cf087412f7bfd290898bace4bb4180ecd Mon Sep 17 00:00:00 2001
From: GeorgyKirichenko <kirichenkoga@gmail.com>
Date: Wed, 28 Jun 2017 01:36:40 +0300
Subject: [PATCH] Add iconv support

Iconv is a library to convert a sequence of characters in one
character encoding to a sequence of characters in another
character encoding. The first argument of new() is the target
encoding and the second one is the source encoding, so the example
below converts a UTF-8 string into a UTF-16 big-endian string:

    convertor = require('iconv').new('UTF-16BE', 'UTF-8')
    converted_string = convertor(source_string)

Closes #2587
---
 src/CMakeLists.txt          |   1 +
 src/lua/iconv.lua           | 106 +++++++++++++++++++++++++++++++++++++
 src/lua/init.c              |   2 +
 test/app-tap/iconv.test.lua |  52 ++++++++++++++++++
 4 files changed, 161 insertions(+)
 create mode 100644 src/lua/iconv.lua
 create mode 100755 test/app-tap/iconv.test.lua

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index f99a747e09..7ea7394a3d 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -45,6 +45,7 @@ lua_source(lua_sources lua/trigger.lua)
 lua_source(lua_sources lua/table.lua)
 lua_source(lua_sources ../third_party/luafun/fun.lua)
 lua_source(lua_sources lua/httpc.lua)
+lua_source(lua_sources lua/iconv.lua)
 # LuaJIT jit.* library
 lua_source(lua_sources "${CMAKE_BINARY_DIR}/third_party/luajit/src/jit/bc.lua")
 lua_source(lua_sources "${CMAKE_BINARY_DIR}/third_party/luajit/src/jit/bcsave.lua")
diff --git a/src/lua/iconv.lua b/src/lua/iconv.lua
new file mode 100644
index 0000000000..9edd0aa12d
--- /dev/null
+++ b/src/lua/iconv.lua
@@ -0,0 +1,106 @@
+local ffi    = require('ffi')
+local errno  = require('errno')
+local buffer = require('buffer')
+
+ffi.cdef[[
+typedef struct iconv *iconv_t;
+iconv_t iconv_open(const char *tocode, const char *fromcode);
+int     iconv_close(iconv_t cd);
+size_t  iconv(iconv_t cd, const char **inbuf, size_t *inbytesleft,
+              char **outbuf, size_t *outbytesleft);
+]]
+
+local iconv_t         = ffi.typeof('struct iconv')
+local char_ptr_arr_t  = ffi.typeof('char *[1]')
+local cchar_ptr_arr_t = ffi.typeof('const char *[1]')
+local cchar_ptr_t     = ffi.typeof('const char *')
+local size_t_arr_t    = ffi.typeof('size_t [1]')
+
+local E2BIG    = errno['E2BIG']
+local EINVAL   = errno['EINVAL']
+local EILSEQ   = errno['EILSEQ']
+local BUF_SIZE = 64
+
+-- iconv_open() signals failure with (iconv_t)-1, iconv() with (size_t)-1
+local open_rv_error = ffi.cast('void *', -1)
+local conv_rv_error = ffi.cast('size_t', -1)
+
+-- Convert the string `data` using the conversion descriptor `iconv`.
+-- Reached through the __call metamethod: converter(data).
+-- Returns the converted string; raises an error when the input holds an
+-- invalid (EILSEQ) or incomplete (EINVAL) multibyte sequence, or when
+-- iconv() fails for any other reason except lack of output space.
+local function iconv_convert(iconv, data)
+    if not ffi.istype(iconv_t, iconv) or type(data) ~= 'string' then
+        error("Usage: iconv:convert(data: string)")
+    end
+    local data_len   = data:len()
+    local data_ptr   = cchar_ptr_arr_t(cchar_ptr_t(data))
+    local data_left  = size_t_arr_t(data_len)
+
+    -- reserve at least BUF_SIZE bytes of output space on every iteration
+    local output_len = data_len >= BUF_SIZE and data_len or BUF_SIZE
+    local buf      = buffer.IBUF_SHARED
+    local buf_ptr  = char_ptr_arr_t()
+    local buf_left = size_t_arr_t()
+    buf:reset()
+
+    while data_left[0] > 0 do
+        buf_ptr[0]  = buf:reserve(output_len)
+        buf_left[0] = buf:unused()
+        local res = ffi.C.iconv(iconv, data_ptr, data_left,
+                                buf_ptr, buf_left)
+        if res == conv_rv_error then
+            -- read errno before another call gets a chance to change it
+            local err = errno()
+            if err ~= E2BIG then
+                -- leave the descriptor in its initial state for reuse
+                ffi.C.iconv(iconv, nil, nil, nil, nil)
+                if err == EILSEQ then
+                    error('Invalid multibyte sequence')
+                end
+                if err == EINVAL then
+                    error('Incomplete multibyte sequence')
+                end
+                error('Unknown conversion error: ' .. errno.strerror(err))
+            end
+        end
+        -- E2BIG only means the output space ran out: keep what was
+        -- written and let the next iteration reserve more
+        buf:alloc(buf:unused() - buf_left[0])
+    end
+
+    -- reset the conversion state so that the descriptor can be reused
+    ffi.C.iconv(iconv, nil, nil, nil, nil)
+    local result = ffi.string(buf.rpos, buf:size())
+    buf:reset()
+    return result
+end
+
+local iconv_mt = {
+    __call = iconv_convert,
+    __tostring = function(iconv) return string.format("iconv: %p", iconv) end
+}
+
+ffi.metatype(iconv_t, iconv_mt)
+
+-- Create a converter that turns strings encoded as `from` into `to`.
+-- Both arguments are encoding names understood by iconv_open().
+-- Raises an error on non-string arguments or when iconv_open() fails.
+local function iconv_new(to, from)
+    if type(to) ~= 'string' or type(from) ~= 'string' then
+        error('Usage: iconv.new("CP1251", "KOI8-R")')
+    end
+    local iconv = ffi.C.iconv_open(to, from)
+    if iconv == open_rv_error then
+        error('iconv: '..errno.strerror())
+    end
+    -- a metatype __gc handler is only applied to struct instances created
+    -- by the FFI itself, not to a pointer returned from a C call, so the
+    -- finalizer has to be attached to the descriptor explicitly
+    return ffi.gc(iconv, ffi.C.iconv_close)
+end
+
+return {
+    new = iconv_new,
+}
diff --git a/src/lua/init.c b/src/lua/init.c
index 5681f03029..4f6842a232 100644
--- a/src/lua/init.c
+++ b/src/lua/init.c
@@ -93,6 +93,7 @@ extern char strict_lua[],
 	tap_lua[],
 	fio_lua[],
 	argparse_lua[],
+	iconv_lua[],
 	/* jit.* library */
 	vmdef_lua[],
 	bc_lua[],
@@ -141,6 +142,7 @@ static const char *lua_modules[] = {
 	"internal.trigger", trigger_lua,
 	"pwd", pwd_lua,
 	"http.client", httpc_lua,
+	"iconv", iconv_lua,
 	/* jit.* library */
 	"jit.vmdef", vmdef_lua,
 	"jit.bc", bc_lua,
diff --git a/test/app-tap/iconv.test.lua b/test/app-tap/iconv.test.lua
new file mode 100755
index 0000000000..6f6a04b140
--- /dev/null
+++ b/test/app-tap/iconv.test.lua
@@ -0,0 +1,52 @@
+#!/usr/bin/env tarantool
+
+local tap = require('tap')
+local iconv = require('iconv')
+
+local test = tap.test("iconv")
+test:plan(12)
+
+local simple_str = 'ascii string'
+local cyrillic_str = 'русский текст'
+
+local c_ascii_8 = iconv.new('ASCII', 'UTF-8')
+local c_8_ascii = iconv.new('UTF-8', 'ASCII')
+
+test:is(c_ascii_8(simple_str), simple_str, 'check ascii->utf8 on simple string')
+test:is(c_8_ascii(simple_str), simple_str, 'check utf8->ascii on simple string')
+
+local c16be_8 = iconv.new('UTF-16BE', 'UTF-8')
+local c8_16be = iconv.new('UTF-8', 'UTF-16BE')
+test:is(c16be_8(c8_16be(simple_str)), simple_str,
+        'UTF conversion with ascii string')
+test:is(c8_16be(c16be_8(cyrillic_str)), cyrillic_str,
+        'UTF conversion with non-ascii symbols')
+
+local c16_16be = iconv.new('UTF-16', 'UTF-16BE')
+local c1251_16 = iconv.new('WINDOWS-1251', 'UTF-16')
+local c8_1251 = iconv.new('UTF-8', 'WINDOWS-1251')
+
+-- test complex converting path
+test:is(c8_1251(c1251_16(c16_16be(c16be_8(cyrillic_str)))), cyrillic_str,
+        'complex multi-format conversion')
+
+-- test huge string
+local huge_str = string.rep(cyrillic_str, 50)
+
+test:is(c16be_8(c8_16be(huge_str)), huge_str, "huge string")
+
+local stat, err = pcall(iconv.new, 'NOT EXISTS', 'UTF-8')
+test:is(stat, false, 'error was thrown on bad encoding')
+test:ok(err:match('Invalid') ~= nil, 'correct error')
+
+-- cyrillic characters cannot be represented in ASCII
+stat, err = pcall(c_ascii_8, cyrillic_str)
+test:is(stat, false, 'error was thrown on invalid sequence')
+test:ok(err:match('Invalid multibyte sequence') ~= nil, 'correct error')
+
+-- input that is cut in the middle of a multibyte character
+stat, err = pcall(c16be_8, cyrillic_str:sub(1, -2))
+test:is(stat, false, 'error was thrown on incomplete sequence')
+test:ok(err:match('Incomplete multibyte sequence') ~= nil, 'correct error')
+
+os.exit(test:check() == true and 0 or 1)
-- 
GitLab