From 5f596b25b061c01e3ca11bbfd59912594bb8a617 Mon Sep 17 00:00:00 2001
From: Vladimir Davydov <vdavydov@tarantool.org>
Date: Wed, 5 Apr 2023 18:25:27 +0300
Subject: [PATCH] lua: add internal xml parser

We need an XML parser to parse the string returned by malloc_info.
This commit implements a simple parser in Lua C that transforms an XML
string into a Lua table. The parser is available both from C
(luaT_xml_decode) and Lua (internal.xml.decode).

Needed for #7311

NO_DOC=internal
NO_CHANGELOG=internal
---
 src/CMakeLists.txt            |   1 +
 src/lua/init.c                |   2 +
 src/lua/xml.c                 | 363 ++++++++++++++++++++++++++++++++++
 src/lua/xml.h                 |  62 ++++++
 test/app-luatest/xml_test.lua | 144 ++++++++++++++
 5 files changed, 572 insertions(+)
 create mode 100644 src/lua/xml.c
 create mode 100644 src/lua/xml.h
 create mode 100644 test/app-luatest/xml_test.lua

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index f4229d7af7..8f00d69170 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -217,6 +217,7 @@ set (server_sources
      lua/backtrace.c
      lua/builtin_modcache.c
      lua/tweaks.c
+     lua/xml.c
      ${lua_sources}
      ${PROJECT_SOURCE_DIR}/third_party/lua-yaml/lyaml.cc
      ${PROJECT_SOURCE_DIR}/third_party/lua-yaml/b64.c
diff --git a/src/lua/init.c b/src/lua/init.c
index 5869ee8f2b..aba0160d14 100644
--- a/src/lua/init.c
+++ b/src/lua/init.c
@@ -70,6 +70,7 @@
 #include "lua/uri.h"
 #include "lua/builtin_modcache.h"
 #include "lua/tweaks.h"
+#include "lua/xml.h"
 #include "digest.h"
 #include "errinj.h"
 
@@ -889,6 +890,7 @@ tarantool_lua_init(const char *tarantool_bin, const char *script, int argc,
 	tarantool_lua_uri_init(L);
 	tarantool_lua_utf8_init(L);
 	tarantool_lua_utils_init(L);
+	tarantool_lua_xml_init(L);
 	tarantool_lua_fiber_init(L);
 	tarantool_lua_fiber_cond_init(L);
 	tarantool_lua_fiber_channel_init(L);
diff --git a/src/lua/xml.c b/src/lua/xml.c
new file mode 100644
index 0000000000..bc5a516b58
--- /dev/null
+++ b/src/lua/xml.c
@@ -0,0 +1,363 @@
+/*
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright 2010-2023, Tarantool AUTHORS, please see AUTHORS file.
+ */
+#include "lua/xml.h"
+
+#include <assert.h>
+#include <ctype.h>
+#include <lua.h>
+#include <lauxlib.h>
+#include <stddef.h>
+#include <string.h>
+
+#include "lua/utils.h"
+
+/**
+ * Wrappers around the <ctype.h> classifiers that accept a plain char.
+ * Passing a negative value other than EOF to isspace() or isalnum() is
+ * undefined behavior, which any byte >= 0x80 (e.g. UTF-8 input) would
+ * trigger on platforms where char is signed.
+ */
+static inline int
+xml_isspace(char c)
+{
+	return isspace((unsigned char)c);
+}
+
+static inline int
+xml_isalnum(char c)
+{
+	return isalnum((unsigned char)c);
+}
+
+/**
+ * Decodes the XML document stored in the string at the top of the Lua stack
+ * and replaces the string with the resulting Lua table. The output format
+ * and the limitations of the parser are described in lua/xml.h.
+ *
+ * The parser is a single-pass state machine over the input bytes. While an
+ * element is open, its tag name and attribute table are kept on the Lua
+ * stack, so the stack grows by two slots per nesting level.
+ *
+ * Raises a Lua error if the top of the stack is not a string or the input is
+ * malformed (parse errors report the line and column). Returns 1 on success.
+ */
+int
+luaT_xml_decode(struct lua_State *L)
+{
+	/* The input string must be at the top of the stack. */
+	int top = lua_gettop(L);
+	if (top < 1 || lua_type(L, top) != LUA_TSTRING)
+		return luaL_error(L, "expected string");
+	const char *s = lua_tostring(L, top);
+	assert(s != NULL);
+	/* Push the document table. */
+	lua_newtable(L);
+	enum {
+		ELEM,
+		TAG,
+		START_TAG,
+		END_TAG,
+		SPACE_AFTER_END_TAG,
+		ATTR,
+		ATTR_NAME,
+		ATTR_VALUE_SEP,
+		ATTR_VALUE_BEGIN,
+		ATTR_VALUE,
+		ATTR_VALUE_END,
+		ELEM_END,
+		DOC_END,
+	} state = ELEM;
+	const char *tag_name = NULL;
+	const char *attr_name = NULL;
+	const char *attr_value = NULL;
+	/*
+	 * Position of the current character in the input string recorded
+	 * as line and column numbers. Used for error reporting.
+	 */
+	int line = 1;
+	int column = 1;
+#define ERROR(msg)							\
+	luaL_error(L, "XML decode error at line %d, column %d: %s",	\
+		   line, column, (msg))
+	while (*s != '\0') {
+		/*
+		 * State machine:
+		 *  - 'break' skips the current character by leaving
+		 *    the switch-case.
+		 *  - 'continue' rechecks the state without skipping
+		 *    the current character.
+		 */
+		switch (state) {
+		/*
+		 * Expected an element. Currently, only tags are supported
+		 * (<section> or </section>) while values enclosed in tags are
+		 * considered invalid. Skip optional spaces and '<', then
+		 * transition to TAG.
+		 *
+		 * TODO: Handle values, such as <section>value</section>.
+		 */
+		case ELEM:
+			if (xml_isspace(*s))
+				break; /* skip spaces */
+			if (*s != '<')
+				return ERROR("invalid token");
+			state = TAG;
+			break; /* skip '<' */
+		/*
+		 * If the current character is '/', skip it and transition to
+		 * END_TAG, otherwise transition to START_TAG.
+		 */
+		case TAG:
+			if (*s != '/') {
+				tag_name = s;
+				state = START_TAG;
+				continue;
+			}
+			if (lua_gettop(L) == top + 1) {
+				/*
+				 * The stack only contains the resulting
+				 * document table, which means there's no
+				 * start tag matching this end tag.
+				 */
+				return ERROR("invalid token");
+			}
+			tag_name = s + 1;
+			state = END_TAG;
+			break; /* skip '/' */
+		/*
+		 * Skip the start tag name, create a table for the tag
+		 * attributes and child elements, then transition to ATTR.
+		 */
+		case START_TAG: {
+			assert(tag_name != NULL);
+			/* TODO: Check names according to XML standard. */
+			if (xml_isalnum(*s))
+				break; /* skip the tag name */
+			if ((!xml_isspace(*s) && *s != '/' && *s != '>') ||
+			    s == tag_name)
+				return ERROR("invalid token");
+			/* Stack: parent table, ... */
+			assert(lua_gettop(L) >= top + 1);
+			assert(lua_istable(L, -1));
+			/*
+			 * An open element keeps its tag name and attribute
+			 * table on the Lua stack, and two more slots are used
+			 * temporarily while it is processed. The stack must be
+			 * grown explicitly, otherwise a deeply nested document
+			 * would overflow it.
+			 */
+			if (!lua_checkstack(L, 4))
+				return ERROR("too many nested elements");
+			lua_pushlstring(L, tag_name, s - tag_name);
+			/*
+			 * Check that there are no attributes with the same name
+			 * in the parent table. A table is accepted because
+			 * there may be more than one element with the same
+			 * name (elements are stored in an array).
+			 */
+			lua_pushvalue(L, -1); /* tag name */
+			lua_rawget(L, -3);
+			int type = lua_type(L, -1);
+			if (type != LUA_TNIL && type != LUA_TTABLE)
+				return ERROR("duplicate name");
+			/*
+			 * Create an array entry in the parent table under this
+			 * name if it doesn't exist.
+			 */
+			if (type == LUA_TNIL) {
+				lua_pop(L, 1);
+				lua_newtable(L);
+				lua_pushvalue(L, -2); /* tag name */
+				lua_pushvalue(L, -2); /* new array */
+				lua_rawset(L, -5);
+			}
+			assert(lua_istable(L, -1));
+			int len = lua_objlen(L, -1);
+			/*
+			 * Create a new attribute table for this element and
+			 * append it to the array. Then pop the array leaving
+			 * the attribute table and the new tag name at the top
+			 * of the stack.
+			 */
+			lua_newtable(L);
+			lua_pushvalue(L, -1); /* attribute table */
+			lua_rawseti(L, -3, len + 1);
+			lua_replace(L, -2);
+			state = ATTR;
+			continue;
+		}
+		/*
+		 * Skip the end tag name, check it against the corresponding
+		 * start tag name, and transition to SPACE_AFTER_END_TAG.
+		 */
+		case END_TAG: {
+			assert(tag_name != NULL);
+			if (xml_isalnum(*s))
+				break; /* skip the tag name */
+			if ((!xml_isspace(*s) && *s != '>') ||
+			    s == tag_name)
+				return ERROR("invalid token");
+			/* Stack: attr table, tag name, parent table, ... */
+			assert(lua_gettop(L) >= top + 3);
+			/* Check that the start and end tag names match. */
+			size_t len;
+			const char *expected = lua_tolstring(L, -2, &len);
+			assert(expected != NULL);
+			if ((size_t)(s - tag_name) != len ||
+			    strncmp(tag_name, expected, len) != 0)
+				return ERROR("mismatched tag");
+			state = SPACE_AFTER_END_TAG;
+			continue;
+		}
+		/*
+		 * Skip optional spaces after the end tag name and transition
+		 * to ELEM_END.
+		 */
+		case SPACE_AFTER_END_TAG:
+			if (xml_isspace(*s))
+				break; /* skip spaces */
+			state = ELEM_END;
+			continue;
+		/*
+		 * Skip optional spaces then transition to ELEM, ELEM_END, or
+		 * ATTR_NAME, depending on the current character.
+		 */
+		case ATTR:
+			if (xml_isspace(*s))
+				break; /* skip spaces */
+			if (*s == '/') {
+				state = ELEM_END;
+				break; /* skip '/' */
+			} else if (*s == '>') {
+				state = ELEM;
+				break; /* skip '>' */
+			}
+			attr_name = s;
+			state = ATTR_NAME;
+			continue;
+		/*
+		 * Skip the attribute name and transition to ATTR_VALUE_SEP.
+		 */
+		case ATTR_NAME:
+			assert(attr_name != NULL);
+			/* TODO: Check names according to XML standard. */
+			if (xml_isalnum(*s))
+				break; /* skip the attribute name */
+			if ((!xml_isspace(*s) && *s != '=') ||
+			    s == attr_name)
+				return ERROR("invalid token");
+			/* Stack: attr table, tag name, parent table, ... */
+			assert(lua_gettop(L) >= top + 3);
+			lua_pushlstring(L, attr_name, s - attr_name);
+			/* Check that there are no duplicate attributes. */
+			lua_pushvalue(L, -1); /* attribute name */
+			lua_rawget(L, -3);
+			if (!lua_isnil(L, -1))
+				return ERROR("duplicate name");
+			lua_pop(L, 1);
+			state = ATTR_VALUE_SEP;
+			continue;
+		/*
+		 * Skip optional spaces and '=' separating the attribute value
+		 * from the name, then transition to ATTR_VALUE_BEGIN.
+		 */
+		case ATTR_VALUE_SEP:
+			if (xml_isspace(*s))
+				break; /* skip spaces */
+			if (*s != '=')
+				return ERROR("invalid token");
+			state = ATTR_VALUE_BEGIN;
+			break; /* skip '=' */
+		/*
+		 * Skip optional spaces and '"' preceding the attribute value,
+		 * then transition to ATTR_VALUE.
+		 */
+		case ATTR_VALUE_BEGIN:
+			if (xml_isspace(*s))
+				break; /* skip spaces */
+			if (*s != '"')
+				return ERROR("invalid token");
+			attr_value = s + 1;
+			state = ATTR_VALUE;
+			break; /* skip '"' */
+		/*
+		 * Skip until and including '"' following the attribute value,
+		 * insert the new attribute to the attribute table, then
+		 * transition to ATTR_VALUE_END.
+		 */
+		case ATTR_VALUE:
+			assert(attr_value != NULL);
+			/* TODO: Handle escaped quotation marks. */
+			if (*s != '"')
+				break; /* skip until '"' */
+			/*
+			 * Stack: attr name, attr table, tag name,
+			 * parent table, ...
+			 */
+			assert(lua_gettop(L) >= top + 4);
+			/* Insert the new attribute into the table. */
+			lua_pushlstring(L, attr_value, s - attr_value);
+			lua_rawset(L, -3);
+			state = ATTR_VALUE_END;
+			break; /* skip '"' */
+		/*
+		 * Check that the attribute value is followed by a valid token,
+		 * then transition to ATTR.
+		 */
+		case ATTR_VALUE_END:
+			if (!xml_isspace(*s) && *s != '/' && *s != '>')
+				return ERROR("invalid token");
+			state = ATTR;
+			continue;
+		/*
+		 * Skip '>' terminating the current element, pop the attribute
+		 * table and the tag name created for the element from the
+		 * stack, then transition to ELEM or DOC_END, depending on
+		 * whether we expect more elements.
+		 */
+		case ELEM_END:
+			if (*s != '>')
+				return ERROR("invalid token");
+			/* Stack: attr table, tag name, parent table, ... */
+			assert(lua_gettop(L) >= top + 3);
+			lua_pop(L, 2);
+			state = lua_gettop(L) == top + 1 ? DOC_END : ELEM;
+			break; /* skip '>' */
+		/*
+		 * End of document. No input except for spaces is expected in
+		 * this state.
+		 */
+		case DOC_END:
+			if (xml_isspace(*s))
+				break; /* skip spaces */
+			return ERROR("junk after document");
+		}
+		if (*s == '\n') {
+			line++;
+			column = 1;
+		} else {
+			column++;
+		}
+		s++;
+	}
+	if (state != DOC_END)
+		return ERROR("truncated document");
+#undef ERROR
+	/* Replace the input string with the document table. */
+	lua_replace(L, -2);
+	return 1;
+}
+
+void
+tarantool_lua_xml_init(struct lua_State *L)
+{
+	const struct luaL_Reg module_funcs[] = {
+		{"decode", luaT_xml_decode},
+		{NULL, NULL},
+	};
+	luaT_newmodule(L, "internal.xml", module_funcs);
+	lua_pop(L, 1);
+}
diff --git a/src/lua/xml.h b/src/lua/xml.h
new file mode 100644
index 0000000000..5e7ba706af
--- /dev/null
+++ b/src/lua/xml.h
@@ -0,0 +1,62 @@
+/*
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright 2010-2023, Tarantool AUTHORS, please see AUTHORS file.
+ */
+#pragma once
+
+#if defined(__cplusplus)
+extern "C" {
+#endif /* defined(__cplusplus) */
+
+struct lua_State;
+
+/**
+ * Takes a string that is supposed to contain a valid XML document, decodes it,
+ * and replaces the string with a Lua table representation of the XML document.
+ * Raises a Lua error on failure. On success returns 1.
+ *
+ * Each XML element (including an input document) is represented by a Lua table.
+ * An attribute is stored in the table as a string keyed by the attribute name
+ * while a sub-element is stored in an array keyed by the sub-element tag.
+ *
+ * For example, the following document
+ *
+ *   <section version="1">
+ *     <element value="foo"/>
+ *     <element value="bar"/>
+ *   </section>
+ *
+ * will be transformed to
+ *
+ *   {
+ *     section = {
+ *       [1] = {
+ *         version = '1',
+ *         element = {
+ *           [1] = {value = 'foo'},
+ *           [2] = {value = 'bar'},
+ *         }
+ *       }
+ *     }
+ *   }
+ *
+ * Whitespace between tokens is ignored; attribute values are kept verbatim.
+ *
+ * Limitations:
+ *  - Element values, such as <section>value</section>, aren't supported.
+ *  - Escape sequences in attribute values aren't supported.
+ *  - Tag and attribute names aren't checked according to the XML standard.
+ *    The parser allows only digits and letters while in XML a name may also
+ *    contain dots, dashes, and underscores, and must start with a letter or
+ *    an underscore.
+ */
+int
+luaT_xml_decode(struct lua_State *L);
+
+void
+tarantool_lua_xml_init(struct lua_State *L);
+
+#if defined(__cplusplus)
+} /* extern "C" */
+#endif /* defined(__cplusplus) */
diff --git a/test/app-luatest/xml_test.lua b/test/app-luatest/xml_test.lua
new file mode 100644
index 0000000000..062356ffb2
--- /dev/null
+++ b/test/app-luatest/xml_test.lua
@@ -0,0 +1,144 @@
+local xml = require('internal.xml')
+
+local t = require('luatest')
+local g = t.group()
+
+g.test_invalid_arg = function() -- a missing or non-string argument is an error
+    local errmsg -- expected error message, read by test() below
+    local function test(...)
+        t.assert_error_msg_equals(errmsg, xml.decode, ...)
+    end
+
+    errmsg = 'expected string'
+    test()
+    test(123)
+    test(false)
+end
+
+g.test_invalid_input = function() -- malformed XML: check message and position
+    local errmsg -- expected error reason, read by test() below
+    local function test(input, line, column)
+        t.assert_error_msg_equals(
+            string.format('XML decode error at line %d, column %d: %s',
+                          line, column, errmsg),
+            xml.decode, input)
+    end
+
+    errmsg = 'truncated document'
+    test('', 1, 1)
+    test('<', 1, 2)
+    test('<foo', 1, 5)
+    test('<foo/', 1, 6)
+    test('<foo>', 1, 6)
+    test('<foo><', 1, 7)
+    test('<foo></', 1, 8)
+    test('<foo bar', 1, 9)
+    test('<foo bar ', 1, 10)
+    test('<foo bar=', 1, 10)
+    test('<foo bar="', 1, 11)
+    test('<foo bar=""', 1, 12)
+    test('<foo bar=""/', 1, 13)
+    test('<foo bar=""/', 1, 13)
+    test('<foo><bar/>', 1, 12)
+    test('<foo><bar></bar>', 1, 17)
+    test('<foo>\n<bar>\n</bar>', 3, 7)
+
+    errmsg = 'junk after document'
+    test('<foo></foo><bar/>', 1, 12)
+    test('<foo></foo>\n<bar></bar>', 2, 1)
+
+    errmsg = 'invalid token'
+    test('foo', 1, 1)
+    test('"foo"', 1, 1)
+    test('>foo', 1, 1)
+    test('/foo', 1, 1)
+    test('=foo', 1, 1)
+    test('<>', 1, 2)
+    test('<=>', 1, 2)
+    test('</foo>', 1, 2)
+    test('<"foo"/>', 1, 2)
+    test('< foo/>', 1, 2)
+    test('<foo=/>', 1, 5)
+    test('<foo"bar"/>', 1, 5)
+    test('<foo bar/>', 1, 9)
+    test('<foo bar />', 1, 10)
+    test('<foo bar=/>', 1, 10)
+    test('<foo bar=1/>', 1, 10)
+    test('<foo bar="1""2"/>', 1, 13)
+    test('<foo bar="1"<"2"/>', 1, 13)
+    test('<foo bar="1"baz="2"/>', 1, 13)
+    test('<foo bar="1" baz="2"/ >', 1, 22)
+    test('<foo bar="1" baz="2"/=>', 1, 22)
+    test('<foo bar="1" baz="2"/<>', 1, 22)
+    test('<foo bar="1" baz="2"/foo>', 1, 22)
+    test('<foo>bar</foo>', 1, 6)
+    test('<foo></foo="1">', 1, 11)
+    test('<foo></foo bar="1">', 1, 12)
+    test('<foo\nbar="1"\nbaz=2/>', 3, 5)
+
+    errmsg = 'mismatched tag'
+    test('<foo></bar>', 1, 11)
+    test('<foo>\n<bar/>\n</bar>', 3, 6)
+    test('<foo><bar>\n</foo></bar>', 2, 6)
+
+    errmsg = 'duplicate name'
+    test('<foo bar="1" bar="2"/>', 1, 17)
+    test('<foo bar="1">\n<bar/>\n</foo>', 2, 5)
+    test('<foo bar="1">\n<foo/>\n<bar/>\n</foo>', 3, 5)
+end
+
+g.test_decode = function() -- valid XML: each input decodes to `expected`
+    local expected -- expected decode result, read by test() below
+    local function test(input)
+        t.assert_equals(xml.decode(input), expected)
+    end
+
+    expected = {foo = {{}}}
+    test('<foo/>')
+    test(' <foo /> ')
+    test('\n<foo\n/> ')
+    test('<foo></foo>')
+    test('<foo > </foo >')
+    test('<foo\n>\n</foo\n>')
+
+    expected = {foo = {{bar = "123"}}}
+    test('<foo bar="123"/>')
+    test('<foo bar = "123"/>')
+    test('<foo bar\n=\n"123"/>')
+    test('<foo bar="123"></foo>')
+
+    expected = {foo = {{bar = "123", baz = "xyz"}}}
+    test('<foo bar="123" baz="xyz"/>')
+    test('<foo bar="123" baz="xyz"></foo>')
+
+    expected = {foo = {{bar = {{}}}}}
+    test('<foo><bar/></foo>')
+    test('<foo> <bar/> </foo>')
+    test('<foo>\n<bar/>\n</foo>')
+    test('<foo><bar></bar></foo>')
+    test('<foo> <bar> </bar> </foo>')
+    test('<foo>\n<bar>\n</bar>\n</foo>')
+
+    expected = {foo = {{bar = {{}, {}}}}}
+    test('<foo><bar/><bar/></foo>')
+    test('<foo><bar/><bar></bar></foo>')
+    test('<foo><bar></bar><bar></bar></foo>')
+
+    expected = {foo = {{bar = {{buz = "1"}, {buz = "2"}}}}}
+    test('<foo><bar buz="1"/><bar buz="2"/></foo>')
+    test('<foo><bar buz="1"/><bar buz="2"></bar></foo>')
+    test('<foo><bar buz="1"></bar><bar buz="2"></bar></foo>')
+
+    expected = {foo = {{bar = {{}}, baz = {{}}}}}
+    test('<foo><bar/><baz/></foo>')
+    test('<foo><bar/><baz></baz></foo>')
+    test('<foo><bar></bar><baz></baz></foo>')
+
+    expected = {foo = {{bar = {{baz = {{}}}}}}}
+    test('<foo><bar><baz/></bar></foo>')
+    test('<foo><bar><baz></baz></bar></foo>')
+
+    expected = {foo = {{bar = "xyz", baz = {{}}}}}
+    test('<foo bar="xyz"><baz/></foo>')
+    test('<foo bar="xyz"><baz></baz></foo>')
+end
-- 
GitLab