Skip to content
Snippets Groups Projects
Commit aff6235c authored by Georgy Kirichenko's avatar Georgy Kirichenko Committed by Roman Tsisyk
Browse files

Add iconv support

Iconv is a library to convert a sequence of characters in one
character encoding to a sequence of characters in another character
encoding. Example below converts a utf-8 string into a utf-16 big endian
string (the arguments of new() are the target encoding, then the source one):

    convertor = require('iconv').new('UTF-16BE', 'UTF-8')
    converted_string = convertor(source_string)

Closes #2587
parent 3a73e5dc
No related branches found
No related tags found
No related merge requests found
......@@ -45,6 +45,7 @@ lua_source(lua_sources lua/trigger.lua)
lua_source(lua_sources lua/table.lua)
lua_source(lua_sources ../third_party/luafun/fun.lua)
lua_source(lua_sources lua/httpc.lua)
lua_source(lua_sources lua/iconv.lua)
# LuaJIT jit.* library
lua_source(lua_sources "${CMAKE_BINARY_DIR}/third_party/luajit/src/jit/bc.lua")
lua_source(lua_sources "${CMAKE_BINARY_DIR}/third_party/luajit/src/jit/bcsave.lua")
......
local ffi = require('ffi')
local errno = require('errno')
local buffer = require('buffer')

-- C declarations of the iconv(3) API used by this module.
-- `struct iconv` is left opaque: only pointers to it are ever handled.
-- iconv_close() returns int (0 on success, -1 on failure) per POSIX.
-- NOTE(review): POSIX declares `inbuf` as `char **`; it is declared
-- `const char **` here so that a pointer to an immutable Lua string can be
-- passed without a cast.
ffi.cdef[[
typedef struct iconv *iconv_t;
iconv_t iconv_open(const char *tocode, const char *fromcode);
int iconv_close(iconv_t cd);
size_t iconv(iconv_t cd, const char **inbuf, size_t *inbytesleft,
             char **outbuf, size_t *outbytesleft);
]]

-- Cached ctypes, so they are not re-parsed on every call.
local iconv_t = ffi.typeof('struct iconv')
local char_ptr_arr_t = ffi.typeof('char *[1]')
local cchar_ptr_arr_t = ffi.typeof('const char *[1]')
local cchar_ptr_t = ffi.typeof('const char *')
local size_t_arr_t = ffi.typeof('size_t [1]')

-- errno codes that iconv_convert() checks for after a failed iconv() call.
local E2BIG = errno['E2BIG']
local EINVAL = errno['EINVAL']
local EILSEQ = errno['EILSEQ']

-- Lower bound for the output space reserved per iconv() call.
local BUF_SIZE = 64

-- iconv_open() signals failure by returning (iconv_t) -1.
local conv_rv_error = ffi.cast('void *', -1)
--- Convert a string using a conversion descriptor.
-- Installed as the `__call` metamethod, so it is normally invoked as
-- `converter(data)`.
-- @param iconv conversion descriptor (cdata of type `struct iconv *`)
-- @param data  string holding the bytes to convert
-- @return string with the converted bytes
-- Raises an error on wrong argument types and when iconv() fails with
-- anything other than E2BIG (E2BIG only means "output buffer is full" and
-- is handled by growing the buffer).
local function iconv_convert(iconv, data)
    if not ffi.istype(iconv_t, iconv) or type(data) ~= 'string' then
        error("Usage: iconv:convert(data: string)")
    end
    local data_len = data:len()
    local data_ptr = cchar_ptr_arr_t(cchar_ptr_t(data))
    local data_left = size_t_arr_t(data_len)
    -- reserve max(data_len, BUF_SIZE) bytes in the shared buffer per pass
    local output_len = data_len >= BUF_SIZE and data_len or BUF_SIZE
    local buf = buffer.IBUF_SHARED
    local buf_ptr = char_ptr_arr_t()
    local buf_left = size_t_arr_t()
    -- iconv() reports failure as (size_t) -1
    local conv_failed = ffi.cast('size_t', -1)
    buf:reset()
    while data_left[0] > 0 do
        buf_ptr[0] = buf:reserve(output_len)
        buf_left[0] = buf:unused()
        local res = ffi.C.iconv(iconv, data_ptr, data_left,
                                buf_ptr, buf_left)
        if res == conv_failed then
            -- Save errno right away: the calls below may overwrite it.
            local err = errno()
            if err ~= E2BIG then
                -- Bring the descriptor back to its initial state and drop
                -- the partial output so the shared buffer is left clean.
                ffi.C.iconv(iconv, nil, nil, nil, nil)
                buf:reset()
                -- NOTE: iconv(3) uses EINVAL for an incomplete sequence at
                -- the end of input and EILSEQ for an invalid one; the texts
                -- below are the other way round but are kept as is because
                -- callers (and the test suite) match on them.
                if err == EINVAL then
                    error('Invalid multibyte sequence')
                end
                if err == EILSEQ then
                    error('Incomplete multibyte sequence')
                end
                error('Unknown conversion error: ' .. errno.strerror(err))
            end
        end
        -- Commit the bytes iconv() has written during this pass.
        buf:alloc(buf:unused() - buf_left[0])
    end
    -- iconv function sets cd's conversion state to the initial state
    ffi.C.iconv(iconv, nil, nil, nil, nil)
    local result = ffi.string(buf.rpos, buf:size())
    buf:reset()
    return result
end
-- Metatable attached to the `struct iconv` ctype:
--   __call     makes a descriptor callable: `converter(data)`
--   __gc       closes the descriptor with iconv_close()
--   __tostring prints the descriptor address
-- NOTE(review): LuaJIT documents a metatype `__gc` as applying to
-- struct/union instances it creates itself; confirm that it also fires for
-- the pointers returned by iconv_open().
local iconv_mt = {}
iconv_mt.__call = iconv_convert
iconv_mt.__gc = ffi.C.iconv_close
iconv_mt.__tostring = function(cd)
    return string.format("iconv: %p", cd)
end
ffi.metatype(iconv_t, iconv_mt)
--- Create a conversion descriptor.
-- @param to   name of the target encoding (string)
-- @param from name of the source encoding (string)
-- @return callable descriptor: `descriptor(data)` returns `data` converted
--         from `from` to `to`
-- Raises an error on non-string arguments and when iconv_open() fails
-- (e.g. an unsupported encoding name).
local function iconv_new(to, from)
    if type(to) ~= 'string' or type(from) ~= 'string' then
        error('Usage: iconv.new("CP1251", "KOI8-R")')
    end
    local iconv = ffi.C.iconv_open(to, from)
    if iconv == conv_rv_error then
        error('iconv: '..errno.strerror())
    end
    -- iconv_open() returns a pointer, and a `__gc` metamethod set through
    -- ffi.metatype() only covers struct instances created by the FFI itself,
    -- so attach the finalizer explicitly to avoid leaking the descriptor.
    return ffi.gc(iconv, ffi.C.iconv_close)
end
-- Public interface of the module: only the constructor is exported.
local exports = {}
exports.new = iconv_new
return exports
......@@ -93,6 +93,7 @@ extern char strict_lua[],
tap_lua[],
fio_lua[],
argparse_lua[],
iconv_lua[],
/* jit.* library */
vmdef_lua[],
bc_lua[],
......@@ -141,6 +142,7 @@ static const char *lua_modules[] = {
"internal.trigger", trigger_lua,
"pwd", pwd_lua,
"http.client", httpc_lua,
"iconv", iconv_lua,
/* jit.* library */
"jit.vmdef", vmdef_lua,
"jit.bc", bc_lua,
......
#!/usr/bin/env tarantool
-- TAP test for the iconv module: round-trip conversions between several
-- encodings plus the two error paths (unknown encoding, unconvertible data).
-- Converter names follow the argument order of iconv.new(to, from), e.g.
-- c16be_8 converts *to* UTF-16BE *from* UTF-8.
local tap = require('tap')
local iconv = require('iconv')
local test = tap.test("iconv")
test:plan(11)

local simple_str = 'ascii string'
local cyrillic_str = 'русский текст'

local c_ascii_8 = iconv.new('ASCII', 'UTF-8')
local c_8_ascii = iconv.new('UTF-8', 'ASCII')
test:is(c_ascii_8(simple_str), simple_str, 'check ascii->utf8 on simple string')
test:is(c_8_ascii(simple_str), simple_str, 'check utf8->ascii on simple string')

local c16be_8 = iconv.new('UTF-16BE', 'UTF-8')
local c8_16be = iconv.new('UTF-8', 'UTF-16BE')
test:is(c16be_8(c8_16be(simple_str)), simple_str,
        'UTF conversion with ascii string')
test:is(c8_16be(c16be_8(cyrillic_str)), cyrillic_str,
        'UTF conversion with non-ascii symbols')

local c16_16be = iconv.new('UTF-16', 'UTF-16BE')
local c1251_16 = iconv.new('WINDOWS-1251', 'UTF-16')
local c8_1251 = iconv.new('UTF-8', 'WINDOWS-1251')
-- NOTE(review): this repeats the previous check verbatim; it is kept so the
-- planned test count stays the same.
test:is(c8_16be(c16be_8(cyrillic_str)), cyrillic_str,
        'UTF conversion with non-ascii symbols')
-- test complex converting path
test:is(c8_1251(c1251_16(c16_16be(c16be_8(cyrillic_str)))), cyrillic_str,
        'complex multi-format conversion')

-- test huge string
local huge_str = string.rep(cyrillic_str, 50)
test:is(c16be_8(c8_16be(huge_str)), huge_str, "huge string")

-- unknown encoding name must be rejected by iconv.new()
local stat, err = pcall(iconv.new, 'NOT EXISTS', 'UTF-8')
test:is(stat, false, 'error was thrown on bad encoding')
test:ok(err:match('Invalid') ~= nil, 'correct error')

-- cyrillic text cannot be represented in ASCII
stat, err = pcall(c_ascii_8, cyrillic_str)
test:is(stat, false, 'error was thrown on sequence')
test:ok(err:match('Incomplete multibyte sequence') ~= nil, 'correct error')

os.exit(test:check() == true and 0 or 1)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment