diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 78094f686fc9f018c3ddcd264fa6d894bd098950..117879382ef8b0c52aacfec60eb71ee2e7cd3745 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -29,6 +29,7 @@ lua_source(lua_sources lua/help.lua) lua_source(lua_sources lua/help_en_US.lua) lua_source(lua_sources lua/tap.lua) lua_source(lua_sources lua/fio.lua) +lua_source(lua_sources lua/csv.lua) lua_source(lua_sources lua/strict.lua) lua_source(lua_sources ../third_party/luafun/fun.lua) # LuaJIT jit.* library @@ -121,7 +122,7 @@ set_source_files_compile_flags(${server_sources}) add_library(server STATIC ${server_sources}) target_link_libraries(server core) -set (common_libraries server core misc bitset msgpuck) +set (common_libraries server core misc bitset msgpuck csv) list(APPEND common_libraries ${LIBEIO_LIBRARIES} diff --git a/src/ffisyms.cc b/src/ffisyms.cc index 980d279380b05beb40384c44fd1a1d4688fa128e..f385ddbc7d4b78cee5f68f6c262488f0a0cfe0bd 100644 --- a/src/ffisyms.cc +++ b/src/ffisyms.cc @@ -20,6 +20,7 @@ #include "iobuf.h" #include <lib/salad/guava.h> #include "latch.h" +#include <lib/csv/csv.h> /* * A special hack to cc/ld to keep symbols in an optimized binary. 
@@ -93,5 +94,12 @@ void *ffi_symbols[] = {
 	(void *) box_latch_delete,
 	(void *) box_latch_lock,
 	(void *) box_latch_trylock,
-	(void *) box_latch_unlock
+	(void *) box_latch_unlock,
+	(void *) csv_create,
+	(void *) csv_destroy,
+	(void *) csv_setopt,
+	(void *) csv_iterator_create,
+	(void *) csv_next,
+	(void *) csv_feed,
+	(void *) csv_escape_field,
 };
diff --git a/src/lib/CMakeLists.txt b/src/lib/CMakeLists.txt
index c59a1cc460787153c817240a92e548482e9ea3df..352602bbc7c3dbb720bb1a395e52e9be715a60ef 100644
--- a/src/lib/CMakeLists.txt
+++ b/src/lib/CMakeLists.txt
@@ -2,4 +2,5 @@ add_subdirectory(bit)
 add_subdirectory(bitset)
 add_subdirectory(small)
 add_subdirectory(salad)
+add_subdirectory(csv)
 add_library(msgpuck STATIC msgpuck/msgpuck.c)
diff --git a/src/lib/csv/CMakeLists.txt b/src/lib/csv/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..3580e4da2f47bcb6f9943b9f76bd8f768574be4c
--- /dev/null
+++ b/src/lib/csv/CMakeLists.txt
@@ -0,0 +1,6 @@
+set(lib_sources
+    csv.c
+)
+
+set_source_files_compile_flags(${lib_sources})
+add_library(csv STATIC ${lib_sources})
diff --git a/src/lib/csv/csv.c b/src/lib/csv/csv.c
new file mode 100644
index 0000000000000000000000000000000000000000..d4bf2ea1696be4a0a1ccc607f4f6d9cdc4e355af
--- /dev/null
+++ b/src/lib/csv/csv.c
@@ -0,0 +1,460 @@
+/*
+ * Copyright (C) 2010-2015 Tarantool AUTHORS: please see AUTHORS file.
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above
+ *    copyright notice, this list of conditions and the
+ *    following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above
+ *    copyright notice, this list of conditions and the following
+ *    disclaimer in the documentation and/or other materials
+ *    provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY <COPYRIGHT HOLDER> ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
+ * <COPYRIGHT HOLDER> OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
+ * THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "csv.h"
+
+#include <ctype.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <stdarg.h>
+#include <stdbool.h>
+
+/** Default emit_row callback: does nothing. */
+void csv_emit_row_empty(void *ctx)
+{
+	(void) ctx;
+}
+
+/** Default emit_field callback: drops the field. */
+void csv_emit_field_empty(void *ctx, const char *field, const char *end)
+{
+	(void) ctx;
+	(void) field;
+	(void) end;
+}
+
+/**
+ * Initialize a parser with the defaults: ',' delimiter, '"' quote
+ * character, libc realloc() and no-op emit callbacks.
+ */
+void
+csv_create(struct csv *csv)
+{
+	memset(csv, 0, sizeof(struct csv));
+	csv->delimiter = ',';
+	csv->quote_char = '\"';
+	csv->realloc = realloc;
+	csv->emit_field = csv_emit_field_empty;
+	csv->emit_row = csv_emit_row_empty;
+}
+
+/**
+ * Release the field buffer and reset the buffer bookkeeping, so that
+ * a repeated call (or a later parse) never sees a stale pointer.
+ */
+void
+csv_destroy(struct csv *csv)
+{
+	if (csv->buf) {
+		csv->realloc(csv->buf, 0);
+		csv->buf = NULL;
+	}
+	csv->bufp = NULL;
+	csv->buf_len = 0;
+}
+
+/**
+ * Check that the input seen so far is well-formed: being inside an
+ * unterminated quoted field marks the parser as CSV_ER_INVALID.
+ * @return non-zero if no error is recorded
+ */
+int
+csv_isvalid(struct csv *csv)
+{
+	if (csv->error_status == CSV_ER_OK && csv->state == CSV_IN_QUOTES)
+		csv->error_status = CSV_ER_INVALID;
+	return !csv->error_status;
+}
+
+/** @return the recorded error status (enum csv_error_status) */
+int
+csv_get_error_status(struct csv *csv)
+{
+	return csv->error_status;
+}
+
+/**
+ * Set a parser option. The variadic argument must match the option:
+ * an int for DELIMITER/QUOTE, a realloc-like function for REALLOC,
+ * a callback for EMIT_FIELD/EMIT_ROW and a void * for EMIT_CTX.
+ * Unknown options are ignored.
+ */
+void
+csv_setopt(struct csv *csv, int opt, ...)
+{
+	va_list args;
+	va_start(args, opt);
+	switch(opt) {
+	case CSV_OPT_DELIMITER:
+		csv->delimiter = va_arg(args, int);
+		break;
+	case CSV_OPT_QUOTE:
+		csv->quote_char = va_arg(args, int);
+		break;
+	case CSV_OPT_REALLOC:
+		csv->realloc = va_arg(args, void* (*)(void*, size_t));
+		break;
+	case CSV_OPT_EMIT_FIELD:
+		csv->emit_field = va_arg(args, csv_emit_field_t);
+		break;
+	case CSV_OPT_EMIT_ROW:
+		csv->emit_row = va_arg(args, csv_emit_row_t);
+		break;
+	case CSV_OPT_EMIT_CTX:
+		csv->emit_ctx = va_arg(args, void*);
+		break;
+	default:
+		break;
+	}
+	va_end(args);
+}
+
+/**
+ * The state machine shared by the emitting API (csv_parse_chunk) and
+ * the iterating API (csv_next).
+ * With firstonly == true it stops at the end of the first complete
+ * field and leaves that field in csv->buf .. csv->bufp.
+ * @return the unprocessed tail, or NULL if there is nothing left to
+ * parse or the buffer could not be allocated
+ **/
+const char *
+csv_parse_impl(struct csv *csv, const char *s, const char *end, bool firstonly)
+{
+	if (end - s == 0)
+		return NULL;
+	assert(end - s > 0);
+	assert(csv->emit_field);
+	assert(csv->emit_row);
+	for (const char *p = s; p != end; p++) {
+		bool is_line_end = (*p == '\n' || *p == '\r');
+		/* grow the buffer so that one more byte always fits */
+		if (csv->buf == NULL ||
+		    (csv->bufp && csv->buf_len < csv->bufp - csv->buf + 1)) {
+			/* bufp is meaningless while there is no buffer */
+			size_t used = csv->buf ? (size_t)(csv->bufp - csv->buf) : 0;
+			size_t new_size = csv->buf_len * 2;
+			if (csv->buf_len == 0 || csv->buf == NULL)
+				new_size = 256;
+			char *new_buf = (char *)csv->realloc(csv->buf, new_size);
+			if (new_buf == NULL) {
+				csv->error_status = CSV_ER_MEMORY_ERROR;
+				return NULL;
+			}
+			csv->buf_len = new_size;
+			csv->bufp = new_buf + used;
+			csv->buf = new_buf;
+		}
+		/* \r\n (or \n\r) linebreak, not in quotes */
+		if (is_line_end && csv->state != CSV_IN_QUOTES &&
+		    *p != csv->prev_symbol &&
+		    (csv->prev_symbol == '\n' || csv->prev_symbol == '\r')) {
+			csv->prev_symbol = '\0';
+			continue;
+		}
+		csv->prev_symbol = *p;
+		/* 2 switches to avoid code duplication */
+		switch (csv->state) {
+		case CSV_LEADING_SPACES:
+			csv->bufp = csv->buf;
+			if (*p == ' ') /* skip spaces */
+				continue;
+			csv->state = CSV_OUT_OF_QUOTES;
+			/* symbol not handled, continue to the next switch */
+			break;
+		case CSV_QUOTE_OPENING:
+			if (*p == csv->quote_char) {
+				/* double-quote "" */
+				*csv->bufp++ = csv->quote_char;
+				csv->state = CSV_OUT_OF_QUOTES;
+				continue;
+			}
+			csv->state = CSV_IN_QUOTES;
+			/* symbol not handled, continue to the next switch */
+			break;
+		case CSV_QUOTE_CLOSING:
+			if (*p == csv->quote_char) {
+				/* double-quote "" */
+				*csv->bufp++ = csv->quote_char;
+				csv->state = CSV_IN_QUOTES;
+				continue;
+			}
+			csv->state = CSV_OUT_OF_QUOTES;
+			/* symbol not handled, continue to the next switch */
+			break;
+		}
+
+		switch (csv->state) {
+		case CSV_OUT_OF_QUOTES:
+			if (is_line_end || *p == csv->delimiter) {
+				/* end of field */
+				csv->state = CSV_LEADING_SPACES;
+				csv->bufp -= csv->ending_spaces;
+				if (firstonly) {
+					/*
+					 * The early return skips the
+					 * reset below; without this a
+					 * following empty field would
+					 * be trimmed past csv->buf.
+					 */
+					csv->ending_spaces = 0;
+					csv->state = CSV_NEWFIELD;
+					return p;
+				} else {
+					csv->emit_field(csv->emit_ctx,
+							csv->buf, csv->bufp);
+				}
+
+				csv->bufp = csv->buf;
+
+			} else if (*p == csv->quote_char) {
+				csv->state = CSV_QUOTE_OPENING;
+			} else {
+				*csv->bufp++ = *p;
+			}
+
+			if (*p == ' ') {
+				csv->ending_spaces++;
+			} else {
+				csv->ending_spaces = 0;
+			}
+			if (is_line_end) {
+				/*
+				 * bufp == buf means an empty field,
+				 * but bufp == 0 means no field at the moment,
+				 * it may be an end of the line or file
+				 */
+				csv->bufp = 0;
+				csv->emit_row(csv->emit_ctx);
+			}
+			break;
+		case CSV_IN_QUOTES:
+			if (*p == csv->quote_char) {
+				csv->state = CSV_QUOTE_CLOSING;
+			} else {
+				*csv->bufp++ = *p;
+			}
+			break;
+		case CSV_NEWFIELD:
+			csv->state = CSV_LEADING_SPACES;
+			/*
+			 * The previous field has already been
+			 * returned: start the next one empty, so
+			 * that a trailing delimiter at the end of
+			 * input yields an empty field.
+			 */
+			csv->bufp = csv->buf;
+			if (is_line_end) {
+				csv->bufp = 0;
+				if (p + 1 == end)
+					return NULL;
+				else
+					return p + 1;
+
+			}
+			break;
+		}
+	}
+	return end;
+}
+
+
+/** Feed a chunk to the emitting parser, see csv_parse_impl(). */
+void
+csv_parse_chunk(struct csv *csv, const char *s, const char *end) {
+	csv_parse_impl(csv, s, end, false);
+}
+
+/**
+ * Emit the last field and row when the input did not end with a line
+ * break, then release the buffer. Nothing is emitted or released if
+ * the input is invalid (see csv_isvalid()).
+ */
+void
+csv_finish_parsing(struct csv *csv)
+{
+	if (csv_isvalid(csv)){
+		if (csv->bufp) {
+			csv->bufp -= csv->ending_spaces;
+			csv->emit_field(csv->emit_ctx,
+					csv->buf, csv->bufp);
+			csv->emit_row(csv->emit_ctx);
+		}
+		if (csv->buf)
+			csv->realloc(csv->buf, 0);
+		csv->bufp = NULL;
+		csv->buf = NULL;
+		csv->buf_len = 0;
+	}
+}
+
+
+/** Bind a zeroed iterator to an initialized parser. */
+void
+csv_iterator_create(struct csv_iterator *it, struct csv *csv)
+{
+	memset(it, 0, sizeof(struct csv_iterator));
+	it->csv = csv;
+}
+
+/**
+ * Advance the iterator by one element.
+ * @return CSV_IT_OK if it->field/it->field_len hold a field,
+ * CSV_IT_EOL at the end of a row, CSV_IT_NEEDMORE if csv_feed() has
+ * to be called, CSV_IT_EOF once everything is consumed and
+ * CSV_IT_ERROR on invalid input or allocation failure
+ **/
+int
+csv_next(struct csv_iterator *it)
+{
+	it->field = NULL;
+	it->field_len = 0;
+	if (it->buf_begin == NULL) /* buffer isn't set */
+		return CSV_IT_NEEDMORE;
+	/**
+	 * length of buffer is zero
+	 * it means end of file, but if there is no \n
+	 * function must emit last field, EOL and EOF.
+	 **/
+	if (it->buf_begin == it->buf_end) {
+		/** bufp == buf means empty field,
+		 * but bufp == 0 means no field at the moment, it may be
+		 * end of line or end of file
+		 **/
+		if (it->csv->bufp == NULL) { /* nothing to emit, end of file */
+			return CSV_IT_EOF;
+		}
+		if (!it->csv->error_status && !csv_isvalid(it->csv)) {
+			it->csv->realloc(it->csv->buf, 0);
+			it->csv->buf = NULL;
+			it->csv->bufp = NULL;
+			it->csv->buf_len = 0;
+			return CSV_IT_ERROR;
+		}
+
+		if (it->csv->state != CSV_END_OF_LAST_LINE) { /* last field */
+			it->csv->state = CSV_END_OF_LAST_LINE;
+			it->csv->bufp -= it->csv->ending_spaces;
+			it->field = it->csv->buf;
+			it->field_len = it->csv->bufp - it->csv->buf;
+			it->csv->bufp = it->csv->buf;
+			return CSV_IT_OK;
+		}
+		if (it->csv->state == CSV_END_OF_LAST_LINE) { /* last line */
+			it->csv->realloc(it->csv->buf, 0);
+			it->csv->buf = NULL;
+			it->csv->bufp = NULL;
+			it->csv->buf_len = 0;
+			return CSV_IT_EOL;
+		}
+
+	}
+	const char *tail = csv_parse_impl(it->csv, it->buf_begin,
+					  it->buf_end, true);
+
+	if (csv_get_error_status(it->csv) == CSV_ER_MEMORY_ERROR)
+		return CSV_IT_ERROR;
+
+	it->buf_begin = tail;
+	/* bufp == NULL means end of line */
+	if (it->csv->bufp == NULL)
+		return CSV_IT_EOL;
+
+	if (tail == it->buf_end) /* buffer is empty */
+		return CSV_IT_NEEDMORE;
+
+	/* return field via iterator structure */
+	it->field = it->csv->buf;
+	it->field_len = it->csv->bufp - it->csv->buf;
+	return CSV_IT_OK;
+}
+
+/**
+ * Hand the next input chunk to the iterator. The bytes are not
+ * copied: the memory must stay valid while csv_next() works on it.
+ * An empty chunk means the end of input.
+ */
+void
+csv_feed(struct csv_iterator *it, const char *buf, size_t buf_len)
+{
+	it->buf_begin = buf;
+	it->buf_end = buf + buf_len;
+}
+
+/**
+ * Write an escaped, NUL-terminated copy of the field to dst: every
+ * quote character is doubled, and the result is wrapped in quotes if
+ * the field contains the delimiter, '\n' or '\r'.
+ * Exactly field_len bytes are read, so the field need not be
+ * NUL-terminated and may contain zero bytes.
+ * @return length of the escaped field (without the final NUL), or
+ * (size_t) -1 if it does not fit into buf_size bytes of dst
+ */
+size_t
+csv_escape_field(struct csv *csv, const char *field,
+		 size_t field_len, char *dst, size_t buf_size)
+{
+	const char *field_end = field + field_len;
+	char *dst_end = dst + buf_size;
+	char *p = dst;
+	int inquotes = 0;
+	/* surround quotes, only if there is delimiter \n or \r */
+	if (memchr(field, csv->delimiter, field_len) ||
+	    memchr(field, '\n', field_len) ||
+	    memchr(field, '\r', field_len)) {
+		inquotes = 1;
+		if (p == dst_end)
+			return (size_t) -1;
+		*p++ = csv->quote_char;
+	}
+	for (; field != field_end; field++) {
+		/* double-quote "" */
+		if (*field == csv->quote_char) {
+			if (p == dst_end)
+				return (size_t) -1;
+			*p++ = csv->quote_char;
+		}
+		if (p == dst_end)
+			return (size_t) -1;
+		*p++ = *field;
+	}
+	/* adds ending quote */
+	if (inquotes) {
+		if (p == dst_end)
+			return (size_t) -1;
+		*p++ = csv->quote_char;
+	}
+	/* room for the terminating NUL */
+	if (p == dst_end)
+		return (size_t) -1;
+	*p = 0;
+	return p - dst;
+}
diff --git a/src/lib/csv/csv.h b/src/lib/csv/csv.h
new file mode 100644
index 0000000000000000000000000000000000000000..ab7ecd569b4ea242da8968d05a019423e908d216
--- /dev/null
+++ b/src/lib/csv/csv.h
@@ -0,0 +1,183 @@
+#ifndef TARANTOOL_CSV_H_INCLUDED
+#define TARANTOOL_CSV_H_INCLUDED
+/*
+ * Copyright (C) 2010-2015 Tarantool AUTHORS: please see AUTHORS file.
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above
+ *    copyright notice, this list of conditions and the
+ *    following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above
+ *    copyright notice, this list of conditions and the following
+ *    disclaimer in the documentation and/or other materials
+ *    provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY <COPYRIGHT HOLDER> ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL
+ * <COPYRIGHT HOLDER> OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
+ * THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+#include <stdio.h>
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+typedef void (*csv_emit_row_t)(void *ctx); /* called at the end of every row */
+typedef void (*csv_emit_field_t)(void *ctx, const char *field, const char *end); /* field is [field, end), not NUL-terminated */
+/* NOTE: the layout of struct csv is duplicated in the ffi.cdef of src/lua/csv.lua */
+struct csv
+{
+	void *emit_ctx; /* opaque pointer passed to both emit callbacks */
+	csv_emit_row_t emit_row;
+	csv_emit_field_t emit_field;
+	char delimiter; /* field separator, ',' after csv_create() */
+	char quote_char; /* quote character, '"' after csv_create() */
+
+	char prev_symbol; /* previous input byte, lets \r\n count as one line break */
+	int error_status; /* enum csv_error_status */
+	int ending_spaces; /* trailing spaces of the current field, trimmed at its end */
+
+	void *(*realloc)(void*, size_t); /* allocator; called with size 0 to free */
+
+	int state; /* enum csv_parser_state */
+	char *buf; /* buffer that accumulates the current field */
+	char *bufp; /* write position in buf; NULL means no field in progress */
+	size_t buf_len; /* allocated size of buf */
+};
+/** Options for csv_setopt(); the expected argument type is noted per option. */
+enum csv_parser_option {
+	CSV_OPT_DELIMITER, /* int */
+	CSV_OPT_QUOTE, /* int */
+	CSV_OPT_REALLOC, /* void *(*)(void *, size_t) */
+	CSV_OPT_EMIT_FIELD, /* csv_emit_field_t */
+	CSV_OPT_EMIT_ROW, /* csv_emit_row_t */
+	CSV_OPT_EMIT_CTX /* void * */
+};
+/** Return values of csv_next(). */
+enum csv_iteraion_state {
+	CSV_IT_OK, /* a field is available in the iterator */
+	CSV_IT_EOL, /* end of a row */
+	CSV_IT_NEEDMORE, /* call csv_feed() with more input */
+	CSV_IT_EOF, /* everything is consumed */
+	CSV_IT_ERROR /* invalid input or allocation failure */
+};
+/** Parser states stored in the state member of struct csv. */
+enum csv_parser_state {
+	CSV_LEADING_SPACES, /* skipping spaces in front of a field */
+	CSV_OUT_OF_QUOTES, /* unquoted part of a field */
+	CSV_IN_QUOTES, /* inside a quoted part of a field */
+	CSV_QUOTE_OPENING, /* just read a quote outside of quotes */
+	CSV_QUOTE_CLOSING, /* just read a quote inside of quotes */
+	CSV_LINE_BREAKING, /* NOTE(review): not referenced by csv.c in this patch */
+	CSV_NEWFIELD, /* iterator mode: a field has just been returned */
+	CSV_END_OF_LAST_LINE /* iterator mode: last field at end of input returned */
+};
+/** Values of the error_status member of struct csv. */
+enum csv_error_status {
+	CSV_ER_OK,
+	CSV_ER_INVALID, /* input ended inside a quoted field */
+	CSV_ER_MEMORY_ERROR /* the realloc callback returned NULL */
+};
+/** Initialize the parser with the default options. */
+void
+csv_create(struct csv *csv);
+/** Free the internal field buffer. */
+void
+csv_destroy(struct csv *csv);
+
+/**
+ * Set a parser option, see enum csv_parser_option for the argument types.
+ */
+void
+csv_setopt(struct csv *csv, int opt, ...);
+
+/**
+ * Parse input and call emit_field/emit_row.
+ * An unfinished trailing field is kept in the internal buffer,
+ * the next call continues it with the new input.
+ */
+void
+csv_parse_chunk(struct csv *csv, const char *s, const char *end);
+
+/**
+ * Emits the field and row still held in the buffer (input without a final line break).
+ */
+void
+csv_finish_parsing(struct csv *csv);
+
+/**
+ * @return CSV_ER_OK (0) if no error is recorded, see enum csv_error_status
+ */
+int
+csv_get_error_status(struct csv *csv);
+
+/**
+ * @brief The csv_iterator struct allows iterating through csv field by field
+ */
+struct csv_iterator {
+	struct csv *csv;
+
+	const char *buf_begin; // unparsed part of the input chunk given to csv_feed()
+	const char *buf_end;
+
+	const char *field; // last returned field, points into the parser buffer
+	size_t field_len;
+};
+/** Bind an iterator to an initialized parser. */
+void
+csv_iterator_create(struct csv_iterator *it, struct csv *csv);
+/**
+ * Receives the next element from csv,
+ * an element is a field or an end of line
+ * @return iteration state (enum csv_iteraion_state)
+ */
+int
+csv_next(struct csv_iterator *);
+
+/**
+ * @brief csv_feed delivers a buffer to the iterator (the bytes are not copied),
+ * an empty buffer means end of input
+ */
+void
+csv_feed(struct csv_iterator *it, const char *buf, size_t buf_len);
+
+/**
+ * @brief csv_escape_field prepares a field for writing to a file.
+ * Doubles every quote character and, if the field contains the delimiter or a line break, adds surrounding quotes.
+ * At worst the escaped field has about 2 times more symbols than the input field.
+ * @return length of escaped field or -1 if not enough space in buffer.
+ */
+size_t
+csv_escape_field(struct csv *csv, const char *field, size_t field_len, char *dst, size_t dst_size);
+
+
+static inline const char *
+csv_iterator_get_field(struct csv_iterator *it)
+{
+	return it->field;
+}
+
+static inline size_t
+csv_iterator_get_field_len(struct csv_iterator *it)
+{
+	return it->field_len;
+}
+#if defined(__cplusplus)
+}
+#endif /* extern "C" */
+#endif
+
diff --git a/src/lua/csv.lua b/src/lua/csv.lua
new file mode 100644
index 0000000000000000000000000000000000000000..bb1d8d28d4b8c04de018c8679fd8a8b9b8d3d1ad
--- /dev/null
+++ b/src/lua/csv.lua
@@ -0,0 +1,219 @@
+-- csv.lua (internal file)
+
+local ffi = require('ffi')
+local log = require('log')
+
+ffi.cdef[[
+    typedef void (*csv_emit_row_t)(void *ctx);
+    typedef void (*csv_emit_field_t)(void *ctx, const char *field, const char *end);
+
+    struct csv
+    {
+        void *emit_ctx;
+        csv_emit_row_t emit_row;
+        csv_emit_field_t emit_field;
+        char delimiter;
+        char quote_char;
+
+        char prev_symbol;
+        int error_status;
+        int ending_spaces;
+
+        void *(*realloc)(void*, size_t);
+
+        int state;
+        char *buf;
+        char *bufp;
+        size_t buf_len;
+    };
+
+    void csv_create(struct csv *csv);
+    void csv_destroy(struct csv *csv);
+    void csv_setopt(struct csv *csv, int opt, ...);
+
+    struct csv_iterator {
+        struct csv *csv;
+        const char *buf_begin;
+        const char *buf_end;
+        const char *field;
+        size_t field_len;
+    };
+    void csv_iterator_create(struct csv_iterator *it, struct csv *csv);
+    int csv_next(struct csv_iterator *);
+    void csv_feed(struct csv_iterator *, const char *, size_t);
+    size_t csv_escape_field(struct csv *csv, const char *field, size_t field_len, char *dst, size_t buf_size);
+    enum {
+        CSV_IT_OK,
+        CSV_IT_EOL,
+        CSV_IT_NEEDMORE,
+        CSV_IT_EOF,
+        CSV_IT_ERROR
+    };
+]]
+
+-- Iterator step for the generic `for`: reads elements from the C
+-- iterator until one complete row is collected.
+-- csvstate is {readable, chunk_size, csv, it, chunk}. The last slot
+-- only keeps the most recently fed Lua string alive, because the C
+-- iterator stores a raw pointer into it.
+-- Returns the row number and the table of its fields, or nothing at
+-- the end of input or on a parse error.
+local iter = function(csvstate, i)
+    local readable = csvstate[1]
+    local csv_chunk_size = csvstate[2]
+    local it = csvstate[4]
+    local tup = {}
+    local st = ffi.C.csv_next(it)
+    while st ~= ffi.C.CSV_IT_EOF do
+        if st == ffi.C.CSV_IT_NEEDMORE then
+            -- an empty chunk tells the parser that the input is over
+            local buf = ""
+            if readable then
+                buf = readable:read(csv_chunk_size) or ""
+            end
+            csvstate[5] = buf
+            ffi.C.csv_feed(it, buf, string.len(buf))
+        elseif st == ffi.C.CSV_IT_EOL then
+            i = i + 1
+            if i > 0 then
+                return i, tup
+            end
+        elseif st == ffi.C.CSV_IT_OK then
+            if i >= 0 then
+                tup[#tup + 1] = ffi.string(it.field, it.field_len)
+            end
+        elseif st == ffi.C.CSV_IT_ERROR then
+            log.warn("CSV file has errors")
+            break
+        end
+        st = ffi.C.csv_next(it)
+    end
+end
+
+local module = {}
+
+--@brief parse csv string by string
+--@param readable must be string or object with method read(num) returns string
+--@param opts.chunk_size (default 4096). Parser will read by chunk_size symbols
+--@param opts.delimiter (default ',').
+--@param opts.quote_char (default '"').
+--@param opts.skip_head_lines (default 0). Skip header.
+--@return iter function, iterator state
+module.iterate = function(readable, opts)
+    opts = opts or {}
+    if type(readable) ~= "string" and
+       (readable == nil or type(readable.read) ~= "function") then
+        error("Usage: load(string or object with method read(num)" ..
+              "returns string)")
+    end
+    -- defaults live in locals so that the caller's opts is not modified
+    local chunk_size = opts.chunk_size or 4096
+    local delimiter = opts.delimiter or ','
+    local quote_char = opts.quote_char or '"'
+    local skip_head_lines = opts.skip_head_lines or 0
+    local str
+    if type(readable) == "string" then
+        str = readable
+        readable = nil
+    else
+        str = readable:read(chunk_size)
+    end
+
+    if not str then --read not works
+        error("Usage: load(string or object with method read(num)" ..
+              "returns string)")
+    end
+    local it = ffi.new('struct csv_iterator')
+    local csv = ffi.new('struct csv')
+    ffi.C.csv_create(csv)
+    ffi.gc(csv, ffi.C.csv_destroy)
+
+    csv.delimiter = string.byte(delimiter)
+    csv.quote_char = string.byte(quote_char)
+    ffi.C.csv_iterator_create(it, csv)
+    ffi.C.csv_feed(it, str, string.len(str))
+
+    -- str is kept in the state only as a GC anchor: the C iterator
+    -- holds a raw pointer into it until the next chunk is fed
+    return iter, {readable, chunk_size, csv, it, str}, -skip_head_lines
+end
+
+--@brief parse csv and make table
+--@return table
+module.load = function(readable, opts)
+    opts = opts or {}
+    local result = {}
+    for i, tup in module.iterate(readable, opts) do
+        result[i] = tup
+    end
+    return result
+end
+
+--@brief dumps tuple or table as csv
+--@param t is tuple or table
+--@param writable must be object with method write(string) like file or socket
+--@param opts.delimiter (default ',').
+--@param opts.quote_char (default '"').
+--@return there is no writable it returns csv as string
+module.dump = function(t, opts, writable)
+    opts = opts or {}
+    writable = writable or nil
+    local delimiter = opts.delimiter or ','
+    local quote_char = opts.quote_char or '"'
+
+    if (type(writable) ~= "nil" and type(writable.write) ~= "function")
+        or type(t) ~= "table" then
+        error("Usage: dump(table[, opts, writable])")
+    end
+    local csv = ffi.new('struct csv')
+    ffi.C.csv_create(csv)
+    ffi.gc(csv, ffi.C.csv_destroy)
+    csv.delimiter = string.byte(delimiter)
+    csv.quote_char = string.byte(quote_char)
+
+    local bufsz = 256
+    local buf = csv.realloc(ffi.cast(ffi.typeof('void *'), 0), bufsz)
+    if type(t[1]) ~= 'table' then
+        t = {t}
+    end
+    local result_table
+    if type(writable) == 'nil' then
+        result_table = {}
+    end
+    for k, line in pairs(t) do
+        local first = true
+        local output_tuple = {}
+        for k2, field in pairs(line) do
+            local strf = tostring(field)
+            -- worst case: every byte doubled, two wrapping quotes and NUL
+            local buf_new_size = strf:len() * 2 + 3
+            if buf_new_size > bufsz then
+                bufsz = buf_new_size
+                buf = csv.realloc(buf, bufsz)
+            end
+            local len = ffi.C.csv_escape_field(csv, strf,
+                string.len(strf), buf, bufsz)
+            if first then
+                first = false
+            else
+                output_tuple[#output_tuple + 1] = delimiter
+            end
+            output_tuple[#output_tuple + 1] = ffi.string(buf, len)
+        end
+
+        output_tuple[#output_tuple + 1] = '\n'
+        if result_table then
+            result_table[#result_table + 1] = table.concat(output_tuple)
+        else
+            writable:write(table.concat(output_tuple))
+        end
+        output_tuple = {}
+    end
+    ffi.C.csv_destroy(csv)
+    csv.realloc(buf, 0)
+    if result_table then
+        return table.concat(result_table)
+    end
+end
+
+return module
diff --git a/src/lua/init.cc b/src/lua/init.cc
index 240a79005e19bc6b9a7f84a5b54fdf21f3e6ba35..11d7ac945cd8ceaf839ae18fdb69368ead9f6d8f 100644
--- a/src/lua/init.cc
+++ b/src/lua/init.cc
@@ -97,7 +97,9 @@ extern char strict_lua[],
 	dis_x86_lua[],
 	dis_x64_lua[],
 	dump_lua[],
+	csv_lua[],
 	v_lua[];
+
 #if LUAJIT_VERSION_NUM >= 20100 /* LuaJIT 2.1+ */
 extern char p_lua[], zone_lua[];
 #endif /* LuaJIT 2.1+ */
@@ -115,6 +117,7 @@ static const char *lua_modules[] = {
 	"log", log_lua,
 	"uri", uri_lua,
 	"fio", fio_lua,
+	"csv", csv_lua,
 	"socket", bsdsocket_lua,
 	"net.box", net_box_lua,
 	"console", console_lua,
diff --git a/test/app/csv.result b/test/app/csv.result
new file mode 100644
index 0000000000000000000000000000000000000000..bb6183e414db0df7f163e169e7a265f50f9bb438
--- /dev/null
+++ b/test/app/csv.result
@@ -0,0 +1,10 @@
+TAP version 13
+1..8
+ok - obj test1
+ok - obj test2
+ok - obj test3
+ok - fio test1
+ok - fio test2
+ok - fio test3
+ok - test roundtrip
+ok - test load(dump(t))
diff --git a/test/app/csv.test.lua b/test/app/csv.test.lua
new file mode 100755
index 0000000000000000000000000000000000000000..a5e9fbf481c501688c76f3d0fdc003ef738a83f9
--- /dev/null
+++ b/test/app/csv.test.lua
@@ -0,0 +1,113 @@
+#!/usr/bin/env tarantool
+
+local function table2str(t)
+    local res = ""
+    for k, line in pairs(t) do
+        local s = ""
+        for k2, field in pairs(line) do
+            s = s .. '|' .. field .. '|\t'
+        end
+        res = res .. s .. '\n'
+    end
+    return res
+end
+
+local function myread(self, bytes)
+    self.i = self.i + bytes
+    return self.v:sub(self.i - bytes + 1, self.i)
+end
+local csv = require('csv')
+local fio = require('fio')
+local tap = require('tap')
+local test1_ans = '|a|\t|b|\t\n|1|\t|ha\n"ha"\nha|\t\n|3|\t|4|\t\n'
+local test2_ans = '||\t||\t||\t\n||\t||\t\n||\t\n'
+local test3_ans = '||\t||\t\n|kp"v|\t\n'
+local test4_ans = '|123|\t|5|\t|92|\t|0|\t|0|\t\n|1|\t|12 34|\t|56|\t' ..
+                  '|quote , |\t|66|\t\n|ok|\t\n'
+local test5_ans = "|1|\t\n|23|\t|456|\t|abcac|\t|'multiword field 4'|\t\n" ..
+                  "|none|\t|none|\t|0|\t\n||\t||\t||\t\n|aba|\t|adda|\t|f" ..
+                  "3|\t|0|\t\n|local res = internal.pwrite(self.fh|\t|dat" ..
+                  "a|\t|len|\t|offset)|\t\n|iflag = bit.bor(iflag|\t|fio." ..
+                  "c.flag[ flag ])|\t\n||\t||\t||\t\n"
+local test6_ans = "|23|\t|456|\t|abcac|\t|'multiword field 4'|\t\n|none|" ..
+                  "\t|none|\t|0|\t\n||\t||\t||\t\n|aba|\t|adda|\t|f3|\t|" ..
+                  "0|\t\n|local res = internal.pwrite(self.fh|\t|data|\t" ..
+                  "|len|\t|offset)|\t\n|iflag = bit.bor(iflag|\t|fio.c.f" ..
+                  "lag[ flag ])|\t\n||\t||\t||\t\n"
+
+test = tap.test("csv")
+test:plan(8)
+
+readable = {}
+readable.read = myread
+readable.v = "a,b\n1,\"ha\n\"\"ha\"\"\nha\"\n3,4\n"
+readable.i = 0
+test:is(table2str(csv.load(readable)), test1_ans, "obj test1")
+
+readable.v = ", ,\n , \n\n"
+readable.i = 0
+test:is(table2str(csv.load(readable, {chunk_size = 1} )), test2_ans, "obj test2")
+
+readable.v = ", \r\nkp\"\"v"
+readable.i = 0
+test:is(table2str(csv.load(readable, {chunk_size = 3})), test3_ans, "obj test3")
+
+tmpdir = fio.tempdir()
+file1 = fio.pathjoin(tmpdir, 'file.1')
+file2 = fio.pathjoin(tmpdir, 'file.2')
+file3 = fio.pathjoin(tmpdir, 'file.3')
+
+local f = fio.open(file1, { 'O_WRONLY', 'O_TRUNC', 'O_CREAT' }, 0777)
+f:write("123 , 5 , 92 , 0, 0\n" ..
+        "1, 12 34, 56, \"quote , \", 66\nok")
+f:close()
+f = fio.open(file1, {'O_RDONLY'})
+test:is(table2str(csv.load(f, {chunk_size = 10})), test4_ans, "fio test1")
+f:close()
+
+
+f = fio.open(file2, { 'O_WRONLY', 'O_TRUNC', 'O_CREAT' }, 0777)
+f:write("1\n23,456,abcac,\'multiword field 4\'\n" ..
+        "none,none,0\n" ..
+        ",,\n" ..
+        "aba,adda,f3,0\n" ..
+        "local res = internal.pwrite(self.fh, data, len, offset)\n" ..
+        "iflag = bit.bor(iflag, fio.c.flag[ flag ])\n" ..
+        ",,"
+)
+f:close()
+f = fio.open(file2, {'O_RDONLY'})
+--symbol by symbol reading
+test:is(table2str(csv.load(f, {chunk_size = 1})), test5_ans, "fio test2")
+f:close()
+
+f = fio.open(file2, {'O_RDONLY'})
+opts = {chunk_size = 7, skip_head_lines = 1}
+--7 symbols per chunk
+test:is(table2str(csv.load(f, opts)), test6_ans, "fio test3")
+f:close()
+
+t = {
+    {'quote" d', ',and, comma', 'both " of " t,h,e,m'},
+    {'"""', ',","'},
+    {'mul\nti\nli\r\nne\n\n', 'field'},
+    {""},
+    {'"'},
+    {"\n"}
+}
+
+f = require("fio").open(file3, { "O_WRONLY", "O_TRUNC" , "O_CREAT"}, 0x1FF)
+csv.dump(t, {}, f)
+f:close()
+f = fio.open(file3, {'O_RDONLY'})
+t2 = csv.load(f, {chunk_size = 5})
+f:close()
+
+test:is(table2str(t), table2str(t2), "test roundtrip")
+
+test:is(table2str(t), table2str(csv.load(csv.dump(t))), "test load(dump(t))")
+
+fio.unlink(file1)
+fio.unlink(file2)
+fio.unlink(file3)
+fio.rmdir(tmpdir)
diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt
index b09b15f25365dcd1b77ce5c60cd9431db511de99..f76c65b22a0048e625455f1e683c2f1b1f98e9df 100644
--- a/test/unit/CMakeLists.txt
+++ b/test/unit/CMakeLists.txt
@@ -114,3 +114,6 @@ add_executable(reflection_c.test reflection_c.c unit.c
     ${CMAKE_SOURCE_DIR}/src/reflection.c)
 add_executable(reflection_cxx.test reflection_cxx.cc unit.c
     ${CMAKE_SOURCE_DIR}/src/reflection.c)
+add_executable(csv.test csv.c
+    ${CMAKE_SOURCE_DIR}/src/lib/csv/csv.c
+)
diff --git a/test/unit/csv.c b/test/unit/csv.c
new file mode 100644
index
0000000000000000000000000000000000000000..0749f00dbbb5df797748ada27c41a8ed5fb93940 --- /dev/null +++ b/test/unit/csv.c @@ -0,0 +1,399 @@ +/* + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * 1. Redistributions of source code must retain the above + * copyright notice, this list of conditions and the + * following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY <COPYRIGHT HOLDER> ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * <COPYRIGHT HOLDER> OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF + * THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ +#include "csv/csv.h" +#include "unit.h" +#include <stdio.h> +#include <string.h> + +int isendl = 1; +void +print_endl(void *ctx) +{ + fflush(stdout); + puts(""); + isendl = 1; +} +void +print_field(void *ctx, const char *s, const char *end) +{ + if(!isendl) + putchar('\t'); + isendl = 0; + putchar('|'); + for(const char *p = s; p != end && *p; p++) { + if((*p == '\r' || *p == '\n') && (p + 1 == end || (*(p + 1) != '\r' && *(p + 1) != '\n'))) + putchar('\n'); + else + putchar(*p); + } + putchar('|'); + fflush(stdout); +} +void +buf_endl(void *ctx) +{ + *(*((char**)ctx))++ = '\n'; +} +void +buf_field(void *ctx, const char *s, const char *end) +{ + *(*((char**)ctx))++ = '|'; + for(const char *p = s; p != end && *p; p++) { + if((*p == '\r' || *p == '\n') && (p + 1 == end || (*(p + 1) != '\r' && *(p + 1) != '\n'))) + *(*((char**)ctx))++ = '\n'; + else + *(*((char**)ctx))++ = *p; + } + *(*((char**)ctx))++ = '|'; + *(*((char**)ctx))++ = '\t'; +} + +void small_string_test(const char* const s) +{ + struct csv csv; + csv_create(&csv); + csv.emit_field = print_field; + csv.emit_row = print_endl; + csv_parse_chunk(&csv, s, s + strlen(s)); + csv_finish_parsing(&csv); + csv_destroy(&csv); +} + +void +common_test(const char *data) +{ + header(); + small_string_test(data); + footer(); +} + + +void test1() { + header(); + small_string_test("1\n \n1,2,3\n123\n"); + footer(); +} +void test2() { + header(); + small_string_test( + "123,456,abcac,\'multiword field 4\'\n" + "none,none,0\n" + ",,\n" + ",," + ); + footer(); +} + +void test3() { + header(); + small_string_test("1,,2"); + footer(); +} + +void test4() { + header(); + small_string_test("123 , 5 , 92 , 0, 0\n" + "1, 12 34, 56, \"quote , \", 66\nok"); + footer(); +} +void test5() { + header(); + const char * const s = "abc\tlonglonglonglonglonglonglonglonglonglonglonglonglonglonglonglonglonglong\t0\n" + "123\t456\t\n" "0\t\t\n"; + struct csv csv; + csv_create(&csv); + csv.emit_field = print_field; + csv.emit_row = 
print_endl; + csv_setopt(&csv, CSV_OPT_DELIMITER, '\t'); + csv_parse_chunk(&csv, s, s + strlen(s)); + csv_finish_parsing(&csv); + printf("valid: %s\n", csv.error_status == CSV_ER_INVALID ? "NO" : "yes"); + csv_destroy(&csv); + footer(); +} + +void test6() { + header(); + const char * const s1 = "\n \nabc\nc\"\",\"d\",de\n\nk"; + const char * const s2 = "\ne\n\n \n\" \"\n\"quote isn't closed, sorry\n \noh"; + struct csv csv; + csv_create(&csv); + csv.emit_field = print_field; + csv.emit_row = print_endl; + csv_parse_chunk(&csv, s1, s1 + strlen(s1)); + csv_parse_chunk(&csv, s2, s2 + 2); + csv_parse_chunk(&csv, s2 + 2, s2 + strlen(s2)); + csv_finish_parsing(&csv); + printf("valid: %s\n", csv_get_error_status(&csv) == CSV_ER_INVALID ? "NO" : "yes"); + csv_destroy(&csv); + footer(); +} + +struct counter { + size_t line_cnt, fieldsizes_cnt; +}; + +void +line_counter(void *ctx) +{ + ((struct counter*)ctx)->line_cnt++; +} + +void +fieldsizes_counter(void *ctx, const char *s, const char *end) +{ + ((struct counter*)ctx)->fieldsizes_cnt += end - s; +} + +void big_chunk_separated_test() { + header(); + struct csv csv; + csv_create(&csv); + csv_setopt(&csv, CSV_OPT_EMIT_FIELD, fieldsizes_counter); + csv_setopt(&csv, CSV_OPT_EMIT_ROW, line_counter); + + size_t lines = 10000; + size_t linelen = 300; + size_t chunk_size = 1024; + + char *buf = malloc(lines * (linelen+4)); + size_t bufn = 0; + + struct counter cnt; + cnt.line_cnt = 0; + cnt.fieldsizes_cnt = 0; + csv_setopt(&csv, CSV_OPT_EMIT_CTX, &cnt); + + const char *s = "abc, def, def, cba"; + for(size_t i = 0; i < lines; i++) { + int k = linelen / strlen(s); + for(int i = 0; i < k; i++) { + memcpy(buf + bufn, s, strlen(s)); + bufn += strlen(s); + } + buf[bufn++] = '\n'; + } + + const char *bufp = buf; + while(bufp < buf + bufn - chunk_size) { + csv_parse_chunk(&csv, bufp, bufp + chunk_size); + bufp += chunk_size; + } + csv_parse_chunk(&csv, bufp, buf + bufn); + csv_finish_parsing(&csv); + + //without fieldsizes counts without 
commas and spaces + printf("line_cnt=%d, fieldsizes_cnt=%d, %d\n", (int)cnt.line_cnt, (int)cnt.fieldsizes_cnt, + (int) (lines * (strlen(s) - 6) * (linelen / strlen(s)))); + fail_unless(lines == cnt.line_cnt); + fail_unless(lines * (strlen(s) - 6) * (linelen / strlen(s)) == cnt.fieldsizes_cnt); + csv_destroy(&csv); + free(buf); + footer(); +} + +void random_generated_test() { + header(); + const char *rand_test = + "\n\r\" ba\r a\ra, \n\"\n\"a\nb\" \raa\rb,\n" + "\r, \n\",\r\n\"\n,a, ,\"a\n\n\r \"\r ba\r,b" + " a,\n,\"\"a\n\r \"b\" \n,\",a\r,a ,\r\rc" + "\" a,b\r\n,\"b\r\"aa \nb \n\r\r\n\n,\rb\nc" + ",\n\n aa\n \"\n ab\rab,\r\" b\n\", ,,\r\r" + "bab\rb\na\n\"a\ra,\"\",\n\"a\n\n \"\r \ra\n" + "a\r\raa a\" ,baab ,a \rbb ,\r \r,\rb,, b" + "\n\r\"\nb\n\nb \n,ab \raa\r\"\nb a\"ba,b, c" + "\"a\"a \"\r\n\"b \n,b\"\",\nba\n\" \n\na \r" + "\nb\rb\"bbba,\" \n\n\n,a,b,a,b,\n\n\n\nb\"\r"; + + struct csv csv; + csv_create(&csv); + csv_setopt(&csv, CSV_OPT_EMIT_FIELD, fieldsizes_counter); + csv_setopt(&csv, CSV_OPT_EMIT_ROW, line_counter); + + struct counter cnt; + cnt.line_cnt = 0; + cnt.fieldsizes_cnt = 0; + csv_setopt(&csv, CSV_OPT_EMIT_CTX, &cnt); + + csv_parse_chunk(&csv, rand_test, rand_test + strlen(rand_test)); + csv_finish_parsing(&csv); + printf("line_cnt=%d, fieldsizes_cnt=%d\n", (int)cnt.line_cnt, (int)cnt.fieldsizes_cnt); + printf("valid: %s\n", csv_get_error_status(&csv) == CSV_ER_INVALID ? 
"NO" : "yes"); + csv_destroy(&csv); + + footer(); +} + +void iter_test1() { + header(); + struct csv_iterator it; + struct csv csv; + csv_create(&csv); + csv_iterator_create(&it, &csv); + int st = 0; + const char *buf = ",d ,e\r\n12,42,3\no\n"; + while((st = csv_next(&it)) != CSV_IT_EOF) { + switch(st) { + case CSV_IT_NEEDMORE: + csv_feed(&it, buf, strlen(buf)); + buf += strlen(buf); + break; + case CSV_IT_EOL: + print_endl(0); + break; + case CSV_IT_OK: + print_field(0, it.field, it.field + it.field_len); + break; + case CSV_IT_ERROR: + printf("\nerror"); + break; + } + } + csv_destroy(&csv); + footer(); +} + +void iter_test2() { + header(); + struct csv_iterator it; + struct csv csv; + csv_create(&csv); + csv_iterator_create(&it, &csv); + int st = 0; + const char ar[] = {'1', '\n', 0, '2', '3', 0, 0}; + const char *buf = ar; + while((st = csv_next(&it)) != CSV_IT_EOF) { + switch(st) { + case CSV_IT_NEEDMORE: + csv_feed(&it, buf, strlen(buf)); + buf += 3; + break; + case CSV_IT_EOL: + print_endl(0); + break; + case CSV_IT_OK: + print_field(0, it.field, it.field + it.field_len); + break; + case CSV_IT_ERROR: + printf("\nerror"); + break; + } + } + csv_destroy(&csv); + footer(); +} + +void csv_out() { + header(); + + const char fields[4][24] = { "abc", "with,comma", "\"in quotes\"", "1 \" quote"}; + char buf[54]; + int i; + struct csv csv; + csv_create(&csv); + for(i = 0; i < 4; i++) { + int len = csv_escape_field(&csv, fields[i], strlen(fields[i]), buf, sizeof(buf)); + printf("%s<len=%d>%c", buf, len, i == 3 ? 
'\n' : ','); + } + + footer(); +} + +int main() { + test1(); + test2(); + test3(); + test4(); + test5(); + test6(); // blank lines, invalid csv + big_chunk_separated_test(); + random_generated_test(); + /* comma in quotes */ + common_test( + "first,last,address,city,zip\n" + "John,Doe,120 any st.,\"Anytown, WW\",08123\n" + ); + + /* empty fields */ + common_test( + "a,b,c\n" + "1,\"\",\"\"\n" + "2,3,4\n" + ); + + /* escaped quotes */ + common_test( + "a,b\n" + "1,\"ha \"\"ha\"\" ha\"\n" + "3,4\n" + ); + + /* json in csv */ + common_test( + "key,val\n" + "1,\"{\"\"type\"\": \"\"Point\"\", \"\"coordinates\"\": [102.0, 0.5]}\"\n" + ); + + /* new lines */ + common_test( + "a,b,c\n" + "1,2,3\n" + "\"Once upon \n" + "a time\",5,6\n" + "7,8,9\n" + ); + + /* new lines with quotes */ + common_test( + "a,b\n" + "1,\"ha\n" + "\"\"ha\"\"\n" + "ha\"\n" + "3,4\n" + ); + + /* utf8 */ + common_test( + " a,b,c\n" + "1,2,3\n" + "4,5,а нет ли ошибок?\n" + ); + /* ending spaces */ + common_test(" www , \"aa\"a , \"tt \" \n"); + + + //iterator tests + iter_test1(); + iter_test2(); + + //output test + csv_out(); + return 0; +} diff --git a/test/unit/csv.result b/test/unit/csv.result new file mode 100644 index 0000000000000000000000000000000000000000..d27ee8f7a6b19c29e28ba2afcd9ac83b4bfd4859 --- /dev/null +++ b/test/unit/csv.result @@ -0,0 +1,99 @@ + *** test1 *** +|1| +|| +|1| |2| |3| +|123| + *** test1: done *** + *** test2 *** +|123| |456| |abcac| |'multiword field 4'| +|none| |none| |0| +|| || || +|| || || + *** test2: done *** + *** test3 *** +|1| || |2| + *** test3: done *** + *** test4 *** +|123| |5| |92| |0| |0| +|1| |12 34| |56| |quote , | |66| +|ok| + *** test4: done *** + *** test5 *** +|abc| |longlonglonglonglonglonglonglonglonglonglonglonglonglonglonglonglonglong| |0| +|123| |456| || +|0| || || +valid: yes + *** test5: done *** + *** test6 *** +|| +|| +|abc| +|c"| |d| |de| +|| +|k| +|e| +|| +|| +| | +valid: NO + *** test6: done *** + *** big_chunk_separated_test *** 
+line_cnt=10000, fieldsizes_cnt=1920000, 1920000 + *** big_chunk_separated_test: done *** + *** random_generated_test *** +line_cnt=40, fieldsizes_cnt=183 +valid: yes + *** random_generated_test: done *** + *** common_test *** +|first| |last| |address| |city| |zip| +|John| |Doe| |120 any st.| |Anytown, WW| |08123| + *** common_test: done *** + *** common_test *** +|a| |b| |c| +|1| |"| |"| +|2| |3| |4| + *** common_test: done *** + *** common_test *** +|a| |b| +|1| |ha "ha" ha| +|3| |4| + *** common_test: done *** + *** common_test *** +|key| |val| +|1| |{"type": "Point", "coordinates": [102.0, 0.5]}| + *** common_test: done *** + *** common_test *** +|a| |b| |c| +|1| |2| |3| +|Once upon +a time| |5| |6| +|7| |8| |9| + *** common_test: done *** + *** common_test *** +|a| |b| +|1| |ha +"ha" +ha| +|3| |4| + *** common_test: done *** + *** common_test *** +|a| |b| |c| +|1| |2| |3| +|4| |5| |а нет ли ошибок?| + *** common_test: done *** + *** common_test *** +|www| |aaa| |tt | + *** common_test: done *** + *** iter_test1 *** +|| |d| |e| +|12| |42| |3| +|o| + *** iter_test1: done *** + *** iter_test2 *** +|1| +|23| + *** iter_test2: done *** + *** csv_out *** +abc<len=3>,"with,comma"<len=12>,""in quotes""<len=13>,1 "" quote<len=10> + *** csv_out: done *** + \ No newline at end of file