diff --git a/CMakeLists.txt b/CMakeLists.txt
index 20812e926c2f7d3dd1bab8b3fb833aa38acd4edc..59cf71196f5ba7b302e1230f60c41daf39c7b1df 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -77,7 +77,7 @@ if (HAVE_SENDFILE)
     endif()
 endif()
 check_function_exists(open_memstream HAVE_OPEN_MEMSTREAM)
-
+check_function_exists(fmemopen HAVE_FMEMOPEN)
 check_function_exists(funopen HAVE_FUNOPEN)
 
 #
@@ -305,6 +305,12 @@ include(BuildLibYAML)
 libyaml_build()
 add_dependencies(build_bundled_libs yaml)
 
+#
+# LibUUID
+#
+
+find_package(LibUUID REQUIRED)
+
 #
 # Third-Party misc
 #
diff --git a/cmake/FindLibUUID.cmake b/cmake/FindLibUUID.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..0db8ae7e5e155eb0b371a02909566c449b2ff255
--- /dev/null
+++ b/cmake/FindLibUUID.cmake
@@ -0,0 +1,55 @@
+# Locate libuuid, or the UUID routines that some libc builds provide.
+#
+# Result variables:
+#   LIBUUID_FOUND        - true when a usable implementation was detected
+#   LIBUUID_INCLUDE_DIRS - directory containing uuid.h
+#   LIBUUID_LIBRARIES    - libraries to link (unset when libc provides UUIDs)
+include(CheckLibraryExists)
+
+# find_package() exports <PackageName>_FIND_*; honor that spelling as well
+# as the legacy upper-case variables that existing callers set by hand.
+set(_libuuid_quiet OFF)
+if(LibUUID_FIND_QUIETLY OR LIBUUID_FIND_QUIETLY)
+    set(_libuuid_quiet ON)
+endif()
+set(_libuuid_required OFF)
+if(LibUUID_FIND_REQUIRED OR LIBUUID_FIND_REQUIRED)
+    set(_libuuid_required ON)
+endif()
+
+if(NOT LIBUUID_FOUND)
+    find_path(LIBUUID_INCLUDE_DIR
+        NAMES uuid.h
+        PATH_SUFFIXES uuid
+    )
+    if(LIBUUID_INCLUDE_DIR)
+        if(NOT _libuuid_quiet)
+            message(STATUS "Found libuuid includes: ${LIBUUID_INCLUDE_DIR}")
+        endif()
+        set(LIBUUID_INCLUDE_DIRS ${LIBUUID_INCLUDE_DIR})
+        check_library_exists(uuid uuid_is_null "" HAVE_LIBUUID_LINUX)
+        if(HAVE_LIBUUID_LINUX)
+            # Assign before reporting so the message shows the library name.
+            set(LIBUUID_LIBRARIES uuid)
+            set(LIBUUID_FOUND ON)
+            if(NOT _libuuid_quiet)
+                message(STATUS "Found libuuid library: ${LIBUUID_LIBRARIES}")
+            endif()
+        else()
+            check_library_exists(c uuid_is_nil "" HAVE_LIBUUID_BSD)
+            if(HAVE_LIBUUID_BSD)
+                set(LIBUUID_FOUND ON)
+            elseif(_libuuid_required)
+                message(FATAL_ERROR "Could not find uuid libraries")
+            endif()
+        endif()
+    elseif(_libuuid_required)
+        message(FATAL_ERROR "Could not find uuid development files")
+    endif()
+endif()
+
+unset(_libuuid_quiet)
+unset(_libuuid_required)
+
+# Only the find_path() result is a cache entry.
+mark_as_advanced(LIBUUID_INCLUDE_DIR)
diff --git a/debian/control b/debian/control
index c54bb43cb8fb2355f1167b3300ef5544458bd119..a75e10a13487acba64280c921aa5710728623e52 100644
--- a/debian/control
+++ b/debian/control
@@ -7,7 +7,8 @@ Build-Depends: cdbs,
debhelper (>= 8), libncurses5-dev, libiberty-dev | binutils-dev, libmysqlclient-dev, - libpq-dev + libpq-dev, + uuid-dev Section: database Standards-Version: 3.9.5 Homepage: http://tarantool.org/ diff --git a/extra/schema_erase.lua b/extra/schema_erase.lua index 6ee9acfee3d7e9cb7e70b39e1a94d3e6277ff820..b37853908d319efd742b3fe7072bc1fcda2080cf 100644 --- a/extra/schema_erase.lua +++ b/extra/schema_erase.lua @@ -4,6 +4,7 @@ _index = box.space[box.schema.INDEX_ID] _user = box.space[box.schema.USER_ID] _func = box.space[box.schema.FUNC_ID] _priv = box.space[box.schema.PRIV_ID] +_cluster = box.space[box.schema.CLUSTER_ID] -- destroy everything - save snapshot produces an empty snapshot now _schema:run_triggers(false) _schema:truncate() @@ -17,3 +18,5 @@ _func:run_triggers(false) _func:truncate() _priv:run_triggers(false) _priv:truncate() +_cluster:run_triggers(false) +_cluster:truncate() diff --git a/extra/schema_fill.lua b/extra/schema_fill.lua index 975caba8ce9f031ef7482e112631f84c067d44ba..b32f5f1ccfa298f9424047f933987b6720681cec 100644 --- a/extra/schema_fill.lua +++ b/extra/schema_fill.lua @@ -7,6 +7,7 @@ _index = box.space[box.schema.INDEX_ID] _func = box.space[box.schema.FUNC_ID] _user = box.space[box.schema.USER_ID] _priv = box.space[box.schema.PRIV_ID] +_cluster = box.space[box.schema.CLUSTER_ID] -- define schema version _schema:insert{'version', 1, 6} -- define system spaces @@ -16,6 +17,7 @@ _space:insert{_index.n, ADMIN, '_index', 'memtx', 0} _space:insert{_func.n, ADMIN, '_func', 'memtx', 0} _space:insert{_user.n, ADMIN, '_user', 'memtx', 0} _space:insert{_priv.n, ADMIN, '_priv', 'memtx', 0} +_space:insert{_cluster.n, ADMIN, '_cluster', 'memtx', 0} -- define indexes _index:insert{_schema.n, 0, 'primary', 'tree', 1, 1, 0, 'str'} @@ -46,6 +48,11 @@ _index:insert{_func.n, 2, 'name', 'tree', 1, 1, 2, 'str'} _index:insert{_priv.n, 0, 'primary', 'tree', 1, 3, 1, 'num', 2, 'str', 3, 'num'} _index:insert{_priv.n, 1, 'owner', 'tree', 0, 1, 1, 'num'} +-- primary 
key: node id +_index:insert{_cluster.n, 0, 'primary', 'tree', 1, 1, 0, 'num'} +-- node uuid key: node uuid +_index:insert{_cluster.n, 1, 'uuid', 'tree', 1, 1, 1, 'str'} + -- -- Pre-create user and grants _user:insert{GUEST, '', 'guest'} diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 410580ca1c0c9cfc164d38cda1e0c8688616b0ae..fe3e7f5d53324ff828c9b9607ac3e8c13536b2a3 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -13,11 +13,6 @@ include_directories(${LIBCORO_INCLUDE_DIR}) include_directories(${LIBGOPT_INCLUDE_DIR}) include_directories(${READLINE_INCLUDE_DIR}) -# Require pthread globally if compiling with GCC -if (CMAKE_COMPILER_IS_GNUCC) - add_compile_flags("C;CXX" "-pthread") -endif() - # Compile src/lua/*.lua files into src/lua/*.lua.c sources file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/src/lua) set(lua_sources) @@ -75,6 +70,7 @@ set (common_sources cfg.cc cpu_feature.c fiob.c + tt_uuid.c ffisyms.cc lua/init.cc lua/fiber.cc @@ -100,6 +96,7 @@ endif() set_source_files_compile_flags(${common_sources}) add_library(core STATIC ${common_sources}) +target_link_libraries(core pthread) set (common_libraries core small salad misc bitset msgpuck) @@ -142,6 +139,7 @@ if (ENABLE_BACKTRACE AND HAVE_BFD) endif() endif() +set (common_libraries ${common_libraries} ${LIBUUID_LIBRARIES}) set (common_libraries ${common_libraries} PARENT_SCOPE) add_subdirectory(lib) diff --git a/src/admin.cc b/src/admin.cc index 130e0538218070e3c159c094cfee499ab799cb7b..7def35ec83272c31cf9eab4c313795ef3f7df595 100644 --- a/src/admin.cc +++ b/src/admin.cc @@ -97,7 +97,7 @@ admin_handler(va_list ap) for (;;) { if (admin_dispatch(&coio, iobuf, L) < 0) return; - iobuf_gc(iobuf); + iobuf_reset(iobuf); fiber_gc(); } } diff --git a/src/bootstrap.snap b/src/bootstrap.snap index 6ee1bc4150098ff3370e4627f37e8d35b969682b..3b4a138c38523678c62295af347db6f62900a7f7 100644 Binary files a/src/bootstrap.snap and b/src/bootstrap.snap differ diff --git a/src/box/CMakeLists.txt 
b/src/box/CMakeLists.txt index 0bf3dcc0f313fd23db9034f214f380edff0b20bb..996e4fb7f13d38bf60097cd8302c55a218c25bb3 100644 --- a/src/box/CMakeLists.txt +++ b/src/box/CMakeLists.txt @@ -36,6 +36,7 @@ add_library(box box.cc access.cc authentication.cc + cluster.cc ${lua_sources} lua/call.cc lua/tuple.cc diff --git a/src/box/alter.cc b/src/box/alter.cc index aec77035a1e5d7d81ecda2b3d535f7646ddc24fb..8af36e69279eb7a7a36582dd099ceb887621af0d 100644 --- a/src/box/alter.cc +++ b/src/box/alter.cc @@ -38,6 +38,7 @@ #include <new> /* for placement new */ #include <stdio.h> /* snprintf() */ #include <ctype.h> +#include "cluster.h" /* for cluster_set_uuid() */ /** _space columns */ #define ID 0 @@ -1282,7 +1283,7 @@ func_cache_remove_func(struct trigger * /* trigger */, void *event) static struct trigger drop_func_trigger = { rlist_nil, func_cache_remove_func, NULL, NULL }; -/** Remove a function from function cache */ +/** Replace a function in the function cache */ static void func_cache_replace_func(struct trigger * /* trigger */, void *event) { @@ -1495,6 +1496,105 @@ on_replace_dd_priv(struct trigger * /* trigger */, void *event) /* }}} access control */ +/* {{{ cluster configuration */ + +/** + * Parse a tuple field which is expected to contain a string + * representation of UUID, and return a 16-byte representation. + */ +tt_uuid +tuple_field_uuid(struct tuple *tuple, int fieldno) +{ + const char *value = tuple_field_cstr(tuple, fieldno); + tt_uuid uuid; + if (tt_uuid_from_string(value, &uuid) != 0) + tnt_raise(ClientError, ER_INVALID_UUID, value); + return uuid; +} + +/** + * This trigger is normally invoked only upon initial recovery. + * + * Before a cluster is assigned a cluster id it's read only. + */ +static void +on_replace_dd_schema(struct trigger * /* trigger */, void *event) +{ + struct txn *txn = (struct txn *) event; + struct tuple *old_tuple = txn->old_tuple; + struct tuple *new_tuple = txn->new_tuple; + const char *key = tuple_field_cstr(new_tuple ? 
+ new_tuple : old_tuple, 0); + if (strcmp(key, "cluster") == 0) { + if (old_tuple != NULL || new_tuple == NULL) + tnt_raise(ClientError, ER_CLUSTER_ID_IS_RO); + tt_uuid uu = tuple_field_uuid(new_tuple, 1); + cluster_set_id(&uu); + } +} + +/** + * A record with id of the new node has been synced to the + * write ahead log. Update the cluster configuration with + * a new node. + */ +static void +on_commit_dd_cluster(struct trigger *trigger, void *event) +{ + (void) trigger; + struct txn *txn = (struct txn *) event; + uint32_t node_id = tuple_field_u32(txn->new_tuple, 0); + tt_uuid node_uuid = tuple_field_uuid(txn->new_tuple, 1); + + cluster_add_node(&node_uuid, node_id); +} + +static struct trigger commit_cluster_trigger = + { rlist_nil, on_commit_dd_cluster, NULL, NULL }; + +/** + * A trigger invoked on replace in the space _cluster, + * which contains cluster configuration. + * + * This space is modified by JOIN command in IPROTO + * protocol. + * + * The trigger updates the cluster configuration cache + * with uuid of the newly joined node. + * + * During recovery, it acts the same way, loading identifiers + * of all nodes into the node cache. Node globally unique + * identifiers are used to keep track of cluster configuration, + * so that a node that previously joined the cluster can + * follow updates, and a node that belongs to a different + * cluster can not by mistake join/follow another cluster + * without first being reset (emptied). 
+ */ +static void +on_replace_dd_cluster(struct trigger *trigger, void *event) +{ + (void) trigger; + struct txn *txn = (struct txn *) event; + struct tuple *old_tuple = txn->old_tuple; + struct tuple *new_tuple = txn->new_tuple; + if (old_tuple != NULL || new_tuple == NULL) + tnt_raise(ClientError, ER_NODE_ID_IS_RO); + + /* Check fields */ + uint32_t node_id = tuple_field_u32(new_tuple, 0); + if (cnode_id_is_reserved(node_id)) + tnt_raise(ClientError, ER_NODE_ID_IS_RESERVED, + (unsigned) node_id); + tt_uuid node_uuid = tuple_field_uuid(new_tuple, 1); + if (tt_uuid_is_nil(&node_uuid)) + tnt_raise(ClientError, ER_INVALID_UUID, + tt_uuid_str(&node_uuid)); + + trigger_set(&txn->on_commit, &commit_cluster_trigger); +} + +/* }}} cluster configuration */ + struct trigger alter_space_on_replace_space = { rlist_nil, on_replace_dd_space, NULL, NULL }; @@ -1503,6 +1603,10 @@ struct trigger alter_space_on_replace_index = { rlist_nil, on_replace_dd_index, NULL, NULL }; +struct trigger on_replace_schema = { + rlist_nil, on_replace_dd_schema, NULL, NULL +}; + struct trigger on_replace_user = { rlist_nil, on_replace_dd_user, NULL, NULL }; @@ -1515,4 +1619,8 @@ struct trigger on_replace_priv = { rlist_nil, on_replace_dd_priv, NULL, NULL }; +struct trigger on_replace_cluster = { + rlist_nil, on_replace_dd_cluster, NULL, NULL +}; + /* vim: set foldmethod=marker */ diff --git a/src/box/alter.h b/src/box/alter.h index a563c3771e1fe06e7ba2f19c05d6093b298d7860..d66e810df2c32ffa81127c8c1a07158c5fa25897 100644 --- a/src/box/alter.h +++ b/src/box/alter.h @@ -32,8 +32,10 @@ extern struct trigger alter_space_on_replace_space; extern struct trigger alter_space_on_replace_index; +extern struct trigger on_replace_schema; extern struct trigger on_replace_user; extern struct trigger on_replace_func; extern struct trigger on_replace_priv; +extern struct trigger on_replace_cluster; #endif /* INCLUDES_TARANTOOL_BOX_ALTER_H */ diff --git a/src/box/box.cc b/src/box/box.cc index 
ae93421c29fe0eb0e76484e2101c0c6904fea6e8..5b9c895d36d03969d545efa0eed80646a3fb03ad 100644 --- a/src/box/box.cc +++ b/src/box/box.cc @@ -56,7 +56,6 @@ #include "cfg.h" #include "iobuf.h" -static void process_replica(struct port *port, struct request *request); static void process_ro(struct port *port, struct request *request); static void process_rw(struct port *port, struct request *request); box_process_func box_process = process_ro; @@ -88,7 +87,6 @@ static void process_rw(struct port *port, struct request *request) { struct txn *txn = txn_begin(); - try { stat_collect(stat_base, request->code, 1); request->execute(request, txn, port); @@ -102,16 +100,6 @@ process_rw(struct port *port, struct request *request) } } -static void -process_replica(struct port *port, struct request *request) -{ - if (!iproto_request_is_select(request->code)) { - tnt_raise(ClientError, ER_NONMASTER, - cfg_gets("replication_source")); - } - return process_rw(port, request); -} - static void process_ro(struct port *port, struct request *request) { @@ -120,37 +108,25 @@ process_ro(struct port *port, struct request *request) return process_rw(port, request); } -static int -recover_row(void *param __attribute__((unused)), - struct iproto_packet *packet) +static void +recover_row(void *param __attribute__((unused)), struct iproto_packet *packet) { - try { - assert(packet->bodycnt == 1); /* always 1 for read */ - struct request request; - request_create(&request, packet->code); - request_decode(&request, (const char *) packet->body[0].iov_base, - packet->body[0].iov_len); - request.packet = packet; - process_rw(&null_port, &request); - } catch (Exception *e) { - e->log(); - return -1; - } - - return 0; + assert(packet->bodycnt == 1); /* always 1 for read */ + struct request request; + request_create(&request, packet->code); + request_decode(&request, (const char *) packet->body[0].iov_base, + packet->body[0].iov_len); + request.packet = packet; + process_rw(&null_port, &request); } static 
void box_enter_master_or_replica_mode(const char *replication_source) { + box_process = process_rw; if (replication_source != NULL) { - box_process = process_replica; - - recovery_wait_lsn(recovery_state, recovery_state->lsn); recovery_follow_remote(recovery_state, replication_source); - } else { - box_process = process_rw; title("primary", NULL); say_info("I am primary"); } @@ -300,6 +276,63 @@ box_leave_local_standby_mode(void *data __attribute__((unused))) box_enter_master_or_replica_mode(cfg_gets("replication_source")); } +/** + * @brief Called when recovery/replication wants to add a new node + * to cluster. + * cluster_add_node() is called as a commit trigger on _cluster + * space and actually adds the node to the cluster. + * @param node_uuid + */ +static void +box_on_cluster_join(const tt_uuid *node_uuid) +{ + struct space *space = space_cache_find(SC_CLUSTER_ID); + class Index *index = index_find(space, 0); + struct iterator *it = index->position(); + index->initIterator(it, ITER_LE, NULL, 0); + struct tuple *tuple = it->next(it); + uint32_t node_id = tuple ? 
tuple_field_u32(tuple, 0) + 1 : 1; + + struct request req; + request_create(&req, IPROTO_INSERT); + req.space_id = SC_CLUSTER_ID; + char buf[128]; + char *data = buf; + data = mp_encode_array(data, 2); + data = mp_encode_uint(data, node_id); + data = mp_encode_str(data, tt_uuid_str(node_uuid), UUID_STR_LEN); + assert(data <= buf + sizeof(buf)); + req.tuple = buf; + req.tuple_end = data; + process_rw(&null_port, &req); +} + +static void +box_set_cluster_uuid(struct recovery_state *r) +{ + /* Save Cluster-UUID to _schema space */ + tt_uuid cluster_uuid; + tt_uuid_create(&cluster_uuid); + + const char *key = "cluster"; + struct request req; + request_create(&req, IPROTO_INSERT); + req.space_id = SC_SCHEMA_ID; + char buf[128]; + char *data = buf; + data = mp_encode_array(data, 2); + data = mp_encode_str(data, key, strlen(key)); + data = mp_encode_str(data, tt_uuid_str(&cluster_uuid), UUID_STR_LEN); + assert(data <= buf + sizeof(buf)); + req.tuple = buf; + req.tuple_end = data; + + process_rw(&null_port, &req); + + /* Cluster-UUID was be updated by a _schema trigger */ + assert(tt_uuid_cmp(&r->cluster_uuid, &cluster_uuid) == 0); +} + void box_free(void) { @@ -342,7 +375,7 @@ box_init() /* recovery initialization */ recovery_init(cfg_gets("snap_dir"), cfg_gets("wal_dir"), - recover_row, NULL, box_snapshot_cb, + recover_row, NULL, box_snapshot_cb, box_on_cluster_join, cfg_geti("rows_per_wal")); recovery_update_io_rate_limit(recovery_state, cfg_getd("snap_io_rate_limit")); @@ -353,9 +386,28 @@ box_init() stat_base = stat_register(iproto_request_type_strs, IPROTO_DML_REQUEST_MAX); - recover_snap(recovery_state, cfg_gets("replication_source")); + const char *replication_source = cfg_gets("replication_source"); + if (recovery_has_data(recovery_state)) { + /* Process existing snapshot */ + recover_snap(recovery_state); + recovery_fix_lsn(recovery_state, false); + } else if (replication_source != NULL) { + /* Initialize replica */ + replica_bootstrap(recovery_state, 
replication_source); + recovery_fix_lsn(recovery_state, false); + snapshot_save(recovery_state); + } else { + /* Initialize cluster */ + cluster_bootstrap(recovery_state); + box_set_cluster_uuid(recovery_state); + recovery_fix_lsn(recovery_state, true); + snapshot_save(recovery_state); + } + + if (tt_uuid_is_nil(&recovery_state->cluster_uuid)) + tnt_raise(ClientError, ER_INVALID_CLUSTER); + space_end_recover_snapshot(); - recover_existing_wals(recovery_state); space_end_recover(); stat_cleanup(stat_base, IPROTO_DML_REQUEST_MAX); @@ -480,14 +532,6 @@ box_snapshot(void) return 0; } -void -box_init_storage(const char *dirname) -{ - struct log_dir dir = snap_dir; - dir.dirname = (char *) dirname; - init_storage_on_master(&dir); -} - void box_info(struct tbuf *out) { diff --git a/src/box/cluster.cc b/src/box/cluster.cc new file mode 100644 index 0000000000000000000000000000000000000000..b4be84de8eb289c203c6e80e1090971b21df037b --- /dev/null +++ b/src/box/cluster.cc @@ -0,0 +1,75 @@ +/* + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * 1. Redistributions of source code must retain the above + * copyright notice, this list of conditions and the + * following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY <COPYRIGHT HOLDER> ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL + * <COPYRIGHT HOLDER> OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF + * THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +#include "cluster.h" +#include "recovery.h" +#include "exception.h" + +void +cluster_set_id(const tt_uuid *uu) +{ + /* Set cluster UUID. */ + assert(tt_uuid_is_nil(&recovery_state->cluster_uuid)); + recovery_state->cluster_uuid = *uu; +} + +void +cluster_add_node(const tt_uuid *node_uuid, cnode_id_t node_id) +{ + struct recovery_state *r = recovery_state; + + assert(!tt_uuid_is_nil(node_uuid)); + assert(!cnode_id_is_reserved(node_id)); + + /* Add node */ + struct node *node = (struct node *) calloc(1, sizeof(*node)); + if (node == NULL) { + tnt_raise(ClientError, ER_MEMORY_ISSUE, sizeof(*node), + "recovery", "r->cluster"); + } + node->id = node_id; + node->uuid = *node_uuid; + uint32_t k = mh_cluster_put(recovery_state->cluster, + (const struct node **) &node, NULL, NULL); + if (k == mh_end(recovery_state->cluster)) { + free(node); + tnt_raise(ClientError, ER_MEMORY_ISSUE, sizeof(*node), + "recovery", "r->cluster"); + } + + say_debug("confirm node: {uuid = %s, id = %u}", + tt_uuid_str(node_uuid), node_id); + + /* Confirm Local node */ + if (tt_uuid_cmp(&r->node_uuid, node_uuid) == 0) { + /* Confirm Local Node */ + say_info("synchronized with cluster"); + assert(r->local_node == NULL || r->local_node->id == 0); + r->local_node = node; + } +} diff --git a/src/box/cluster.h b/src/box/cluster.h new file mode 100644 index 0000000000000000000000000000000000000000..40d631410455e1dc066cc9886c839ceaef90fc27 --- /dev/null +++ 
b/src/box/cluster.h @@ -0,0 +1,107 @@ +#ifndef INCLUDES_BOX_CLUSTER_H +#define INCLUDES_BOX_CLUSTER_H +/* + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * 1. Redistributions of source code must retain the above + * copyright notice, this list of conditions and the + * following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY <COPYRIGHT HOLDER> ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * <COPYRIGHT HOLDER> OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF + * THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +#include "tt_uuid.h" +#include <stdint.h> +/** + * @module cluster - global state of multi-master + * replicated database. + * + * Right now the cluster can only consist of instances + * connected with asynchronous master-master replication. + * + * Each cluster has a globally unique identifier. Each + * node in the cluster is identified as well. A node + * which is part of one cluster can not join another + * cluster. + * + * Cluster and node identifiers are stored in a system + * space _cluster on all nodes. 
The node identifier + * is also stored in each snapshot header, this is how + * the node knows which node id in the cluster belongs + * to it. + * + * Cluster and node identifiers are globally unique + * (UUID, universally unique identifiers). In addition + * to a long UUID, which is stored in _cluster system + * space for each node, a short integer id is used for + * pervasive node identification in a replication stream, + * a snapshot, or internal data structures. + * The mapping between 16-byte node globally unique id and + * 4 byte cluster local id is stored in _cluster table. When + * a node joins the cluster, it sends its globally unique + * identifier to one of the masters, and gets its cluster + * local identifier as part of the reply to the JOIN request + * (in fact, it gets it as a REPLACE request in _cluster + * system space along with the rest of the replication + * stream). + * + * Cluster state on each node is represented by a table + * like below: + * + * ---------------------------------- + * | node id | confirmed lsn | + * ---------------------------------- + * | 1 | 1258 | <-- changes of the local node + * ---------------------------------- + * | 2 | 1292 | <-- changes received from + * ---------------------------------- a remote node + */ + +/** Cluster-local node identifier. */ +typedef uint32_t cnode_id_t; + +static inline bool +cnode_id_is_reserved(cnode_id_t id) +{ + return id == 0; +} + +/** + * Bootstrap a new cluster consisting of this node by + * assigning it a new globally unique cluster id. Used + * during bootstrapping in an empty data directory when no + * existing cluster for joining has been provided in the + * database configuration. + */ +void +cluster_set_id(const tt_uuid *uu); + +/** + * Register the universally unique identifier of a remote node and + * a matching cluster-local identifier in the cluster registry. + * Called when a remote master joins the cluster. + * + * The node is added to the cluster lsn table with LSN 0. 
+ */ +void +cluster_add_node(const tt_uuid *node_uu, cnode_id_t id); + +#endif diff --git a/src/box/schema.cc b/src/box/schema.cc index e93d744b4a28d764fbbe9916eccc54db378b7a63..ac32fa5c7897559106ec56b9d3735691e6084d8d 100644 --- a/src/box/schema.cc +++ b/src/box/schema.cc @@ -255,7 +255,7 @@ schema_init() true /* unique */, 1 /* part count */); key_def_set_part(key_def, 0 /* part no */, 0 /* field no */, STRING); - (void) sc_space_new(&def, key_def, NULL); + (void) sc_space_new(&def, key_def, &on_replace_schema); /* _space - home for all spaces. */ key_def->space_id = def.id = SC_SPACE_ID; @@ -280,6 +280,13 @@ schema_init() key_def->space_id = def.id = SC_PRIV_ID; snprintf(def.name, sizeof(def.name), "_priv"); (void) sc_space_new(&def, key_def, &on_replace_priv); + /* + * _cluster - association server uuid <-> server id + * The real index is defined in the snapshot. + */ + key_def->space_id = def.id = SC_CLUSTER_ID; + snprintf(def.name, sizeof(def.name), "_cluster"); + (void) sc_space_new(&def, key_def, &on_replace_cluster); key_def_delete(key_def); /* _index - definition of indexes in all spaces */ diff --git a/src/box/schema.h b/src/box/schema.h index 097cbc7417d6a073017a3679efcbd2e93d3f2db0..eadc0f9c0016ebe756231e93d6c6f0aaa5c8f5d6 100644 --- a/src/box/schema.h +++ b/src/box/schema.h @@ -45,6 +45,8 @@ enum schema_id { SC_USER_ID = 304, /** Space id of _priv. */ SC_PRIV_ID = 312, + /** Space id of _cluster. */ + SC_CLUSTER_ID = 320, /** End of the reserved range of system spaces. 
*/ SC_SYSTEM_ID_MAX = 511 }; diff --git a/src/box/txn.cc b/src/box/txn.cc index 62831885d7e9593511f99738afe0375ff8da6887..91f43ab93be822a4ef31b9462d74132ce69fa58a 100644 --- a/src/box/txn.cc +++ b/src/box/txn.cc @@ -30,7 +30,8 @@ #include "tuple.h" #include "space.h" #include <tarantool.h> -#include <recovery.h> +#include "cluster.h" +#include "recovery.h" #include <fiber.h> #include "request.h" /* for request_name */ @@ -39,18 +40,17 @@ double too_long_threshold; void txn_add_redo(struct txn *txn, struct request *request) { - if (recovery_state->wal_mode == WAL_NONE) + txn->packet = request->packet; + if (recovery_state->wal_mode == WAL_NONE || request->packet != NULL) return; - if (request->packet == NULL) { - /* Generate binary body for Lua requests */ - struct iproto_packet *packet = (struct iproto_packet *) - region_alloc0(&fiber()->gc, sizeof(*packet)); - packet->code = request->code; - packet->bodycnt = request_encode(request, packet->body); - txn->packet = packet; - } else { - txn->packet = request->packet; - } + + /* Generate binary body for Lua requests */ + struct iproto_packet *packet = (struct iproto_packet *) + region_alloc0(&fiber()->gc, sizeof(*packet)); + assert(packet->node_id == 0); /* local request */ + packet->code = request->code; + packet->bodycnt = request_encode(request, packet->body); + txn->packet = packet; } void @@ -93,27 +93,19 @@ txn_commit(struct txn *txn) { if ((txn->old_tuple || txn->new_tuple) && !space_is_temporary(txn->space)) { - struct iproto_packet *packet = txn->packet; - int64_t lsn = next_lsn(recovery_state); - int res = 0; - if (recovery_state->wal_mode != WAL_NONE) { - /* txn_commit() must be done after txn_add_redo() */ - assert(txn->packet != NULL); - packet->lsn = lsn; - ev_tstamp start = ev_now(loop()), stop; - res = wal_write(recovery_state, packet); - stop = ev_now(loop()); + /* txn_commit() must be done after txn_add_redo() */ + assert(recovery_state->wal_mode == WAL_NONE || txn->packet != NULL); + ev_tstamp start 
= ev_now(loop()), stop; + res = wal_write(recovery_state, txn->packet); + stop = ev_now(loop()); - if (stop - start > too_long_threshold) { - say_warn("too long %s: %.3f sec", - iproto_request_name(packet->code), - stop - start); - } + if (stop - start > too_long_threshold && txn->packet != NULL) { + say_warn("too long %s: %.3f sec", + iproto_request_name(txn->packet->code), + stop - start); } - confirm_lsn(recovery_state, lsn, res == 0); - if (res) tnt_raise(LoggedError, ER_WAL_IO); } diff --git a/src/errcode.h b/src/errcode.h index 586bb944a02de0872bd07424a31ab22e37fd0809..51dcb096d5a3c7d1e5338ac61384ed8e57b6c94e 100644 --- a/src/errcode.h +++ b/src/errcode.h @@ -110,7 +110,13 @@ enum { TNT_ERRMSG_MAX = 512 }; /* 58 */_(ER_RELOAD_CFG, 2, "Can't set option '%s' dynamically") \ /* 59 */_(ER_CFG, 2, "Incorrect option value: %s") \ /* 60 */_(ER_SOPHIA, 2, "%s") \ - + /* 61 */_(ER_LOCAL_NODE_IS_NOT_ACTIVE, 2, "Local node is not active") \ + /* 62 */_(ER_UNKNOWN_NODE, 2, "Unknown node %u") \ + /* 63 */_(ER_INVALID_CLUSTER, 2, "Invalid cluster id") \ + /* 64 */_(ER_INVALID_UUID, 2, "Invalid UUID: %s") \ + /* 65 */_(ER_CLUSTER_ID_IS_RO, 2, "Can't reset cluster id: it is already assigned") \ + /* 66 */_(ER_NODE_ID_IS_RO, 2, "Can't reset node id") \ + /* 67 */_(ER_NODE_ID_IS_RESERVED, 2, "Can't initialize node id with a reserved value %u") \ /* * !IMPORTANT! 
Please follow instructions at start of the file diff --git a/src/iobuf.cc b/src/iobuf.cc index f47e33dc5ea854a24b409c77ad655f02b3d36a6f..d4837d673a03e3124e69cc5a6df657ff2d354a94 100644 --- a/src/iobuf.cc +++ b/src/iobuf.cc @@ -320,7 +320,7 @@ iobuf_flush(struct iobuf *iobuf, struct ev_io *coio) ssize_t total = coio_writev(coio, iobuf->out.iov, obuf_iovcnt(&iobuf->out), obuf_size(&iobuf->out)); - iobuf_gc(iobuf); + iobuf_reset(iobuf); /* * If there is some residue in the input buffer, move it * but only in case if we don't have iobuf_readahead @@ -335,7 +335,7 @@ iobuf_flush(struct iobuf *iobuf, struct ev_io *coio) } void -iobuf_gc(struct iobuf *iobuf) +iobuf_reset(struct iobuf *iobuf) { /* * If we happen to have fully processed the input, diff --git a/src/iobuf.h b/src/iobuf.h index 0bd43b3cd2218378f59eec463b9decb2194b2473..25d97babc89a8c2e689955175c4da0692d79d03e 100644 --- a/src/iobuf.h +++ b/src/iobuf.h @@ -226,7 +226,7 @@ iobuf_flush(struct iobuf *iobuf, struct ev_io *coio); * Is automatically called by iobuf_flush(). */ void -iobuf_gc(struct iobuf *iobuf); +iobuf_reset(struct iobuf *iobuf); /** Return true if there is no input and no output. 
*/ static inline bool diff --git a/src/iproto.cc b/src/iproto.cc index 83fa53b011ae36a015722706102106f80b72e5e2..7f1be78104102f8b34f5c5ff1c96b45cab8296da 100644 --- a/src/iproto.cc +++ b/src/iproto.cc @@ -98,6 +98,9 @@ iproto_process_disconnect(struct iproto_request *request); static void iproto_process_dml(struct iproto_request *request); +static void +iproto_process_admin(struct iproto_request *request); + struct IprotoRequestGuard { struct iproto_request *ireq; IprotoRequestGuard(struct iproto_request *ireq_arg):ireq(ireq_arg) {} @@ -457,30 +460,6 @@ iproto_connection_input_iobuf(struct iproto_connection *con) return newbuf; } -static void -iproto_process_admin(struct iproto_request *ireq, - struct iproto_connection *con) -{ - switch (ireq->packet.code) { - case IPROTO_PING: - iproto_reply_ping(&ireq->iobuf->out, ireq->packet.sync); - break; - case IPROTO_SUBSCRIBE: - if (ireq->packet.bodycnt != 0) { - tnt_raise(ClientError, ER_INVALID_MSGPACK, - "subscribe request body"); - } - subscribe(con->input.fd, ireq->packet.lsn, ireq->packet.sync); - tnt_raise(IprotoConnectionShutdown); - default: - tnt_raise(ClientError, ER_UNKNOWN_REQUEST_TYPE, - (uint32_t) ireq->packet.code); - } - if (! ev_is_active(&con->output)) - ev_feed_event(con->loop, &con->output, EV_WRITE); -} - - /** Enqueue all requests which were read up. 
*/ static inline void iproto_enqueue_batch(struct iproto_connection *con, struct ibuf *in) @@ -519,6 +498,9 @@ iproto_enqueue_batch(struct iproto_connection *con, struct ibuf *in) iproto_packet_decode(&ireq->packet, &pos, reqend); ireq->total_len = pos - reqstart; /* total request length */ + /* Mark this request as local (see fill_lsn()) */ + ireq->packet.node_id = 0; + /* * sic: in case of exception con->parse_size * as well as in->pos must not be advanced, to @@ -533,14 +515,12 @@ iproto_enqueue_batch(struct iproto_connection *con, struct ibuf *in) pos = (const char *) ireq->packet.body[0].iov_base; request_decode(&ireq->request, pos, ireq->packet.body[0].iov_len); - ireq->request.packet = &ireq->packet; - iproto_queue_push(&request_queue, guard.release()); - /* Request will be discarded in iproto_process_dml */ } else { - iproto_process_admin(ireq, con); - /* Entire request can be discarded. */ - in->pos += ireq->packet.body[0].iov_len; + ireq->process = iproto_process_admin; } + ireq->request.packet = &ireq->packet; + iproto_queue_push(&request_queue, guard.release()); + /* Request will be discarded in iproto_process_XXX */ /* Request is parsed */ con->parse_size -= reqend - reqstart; @@ -635,7 +615,7 @@ iproto_flush(struct iobuf *iobuf, int fd, struct obuf_svp *svp) if (nwr > 0) { if (svp->size + nwr == obuf_size(&iobuf->out)) { - iobuf_gc(iobuf); + iobuf_reset(iobuf); *svp = obuf_create_svp(&iobuf->out); return 0; } @@ -711,6 +691,57 @@ iproto_process_dml(struct iproto_request *ireq) } } +static void +iproto_process_admin(struct iproto_request *ireq) +{ + struct iobuf *iobuf = ireq->iobuf; + struct iproto_connection *con = ireq->connection; + + auto scope_guard = make_scoped_guard([=]{ + /* Discard request (see iproto_enqueue_batch()) */ + iobuf->in.pos += ireq->total_len; + + if (evio_is_active(&con->output)) { + if (! 
ev_is_active(&con->output)) + ev_feed_event(con->loop, + &con->output, + EV_WRITE); + } else if (iproto_connection_is_idle(con)) { + iproto_connection_delete(con); + } + }); + + if (unlikely(! evio_is_active(&con->output))) + return; + + try { + switch (ireq->packet.code) { + case IPROTO_PING: + iproto_reply_ping(&ireq->iobuf->out, ireq->packet.sync); + break; + case IPROTO_JOIN: + /* TODO: replication authorization */ + session_set_user(con->session, ADMIN, ADMIN); + replication_join(con->input.fd, &ireq->packet); + /* TODO: check requests in `con' queue */ + iproto_connection_shutdown(con); + return; + case IPROTO_SUBSCRIBE: + /* TODO: replication authorization */ + replication_subscribe(con->input.fd, &ireq->packet); + /* TODO: check requests in `con' queue */ + iproto_connection_shutdown(con); + return; + default: + tnt_raise(ClientError, ER_UNKNOWN_REQUEST_TYPE, + (uint32_t) ireq->packet.code); + } + } catch (ClientError *e) { + say_error("admin command error: %s", e->errmsg()); + iproto_reply_error(&iobuf->out, e, ireq->packet.sync); + } +} + static struct iproto_request * iproto_request_new(struct iproto_connection *con, iproto_request_f process) diff --git a/src/iproto_constants.cc b/src/iproto_constants.cc index 7aa1d975e417fbdc3f2477ac370808fa57206ff8..d7b886bb5e2a40b22caf7cec8c40cb3b3648b1c0 100644 --- a/src/iproto_constants.cc +++ b/src/iproto_constants.cc @@ -37,7 +37,7 @@ const unsigned char iproto_key_type[IPROTO_KEY_MAX] = /* {{{ header */ /* 0x00 */ MP_UINT, /* IPROTO_CODE */ /* 0x01 */ MP_UINT, /* IPROTO_SYNC */ - /* 0x02 */ MP_UINT, /* IPROTO_SERVER_ID */ + /* 0x02 */ MP_UINT, /* IPROTO_NODE_ID */ /* 0x03 */ MP_UINT, /* IPROTO_LSN */ /* 0x04 */ MP_DOUBLE, /* IPROTO_TIMESTAMP */ /* }}} */ @@ -83,6 +83,9 @@ const unsigned char iproto_key_type[IPROTO_KEY_MAX] = /* 0x21 */ MP_ARRAY, /* IPROTO_TUPLE */ /* 0x22 */ MP_STR, /* IPROTO_FUNCTION_NAME */ /* 0x23 */ MP_STR, /* IPROTO_USER_NAME */ + /* 0x24 */ MP_STR, /* IPROTO_NODE_UUID */ + /* 0x25 */ 
MP_STR, /* IPROTO_CLUSTER_UUID */ + /* 0x26 */ MP_MAP, /* IPROTO_LSNMAP */ /* }}} */ }; @@ -94,7 +97,8 @@ const char *iproto_request_type_strs[] = "REPLACE", "UPDATE", "DELETE", - "CALL" + "CALL", + "AUTH" }; void @@ -125,6 +129,9 @@ iproto_packet_decode(struct iproto_packet *packet, const char **pos, case IPROTO_SYNC: packet->sync = mp_decode_uint(pos); break; + case IPROTO_NODE_ID: + packet->node_id = mp_decode_uint(pos); + break; case IPROTO_LSN: packet->lsn = mp_decode_uint(pos); break; @@ -169,6 +176,12 @@ iproto_packet_encode(const struct iproto_packet *packet, struct iovec *iov) map_size++; } + if (packet->node_id) { + d = mp_encode_uint(d, IPROTO_NODE_ID); + d = mp_encode_uint(d, packet->node_id); + map_size++; + } + if (packet->lsn) { d = mp_encode_uint(d, IPROTO_LSN); d = mp_encode_uint(d, packet->lsn); @@ -191,3 +204,34 @@ iproto_packet_encode(const struct iproto_packet *packet, struct iovec *iov) assert(1 + packet->bodycnt <= IPROTO_PACKET_IOVMAX); return 1 + packet->bodycnt; /* new iovcnt */ } + +int +iproto_encode_row(const struct iproto_packet *packet, struct iovec *iov, + char fixheader[IPROTO_FIXHEADER_SIZE]) +{ + int iovcnt = iproto_packet_encode(packet, iov + 1) + 1; + uint32_t len = 0; + for (int i = 1; i < iovcnt; i++) + len += iov[i].iov_len; + + /* Encode length */ + char *data = fixheader; + data = mp_encode_uint(data, len); + /* Encode padding */ + ssize_t padding = IPROTO_FIXHEADER_SIZE - (data - fixheader); + if (padding > 0) { + data = mp_encode_strl(data, padding - 1); +#if defined(NDEBUG) + data += padding - 1; +#else + while (--padding > 0) + *(data++) = 0; /* valgrind */ +#endif + } + assert(data == fixheader + IPROTO_FIXHEADER_SIZE); + iov[0].iov_base = fixheader; + iov[0].iov_len = IPROTO_FIXHEADER_SIZE; + + assert(iovcnt <= IPROTO_ROW_IOVMAX); + return iovcnt; +} diff --git a/src/iproto_constants.h b/src/iproto_constants.h index 3309d043b46ca7bf95390aa19d185bf2ce956df8..07b0180ac76b9d6e9c3bc6dec4cefb5289efce6d 100644 --- 
a/src/iproto_constants.h +++ b/src/iproto_constants.h @@ -49,8 +49,8 @@ enum { enum iproto_key { IPROTO_CODE = 0x00, IPROTO_SYNC = 0x01, - /* replication keys */ - IPROTO_SERVER_ID = 0x02, + /* Replication keys (header) */ + IPROTO_NODE_ID = 0x02, IPROTO_LSN = 0x03, IPROTO_TIMESTAMP = 0x04, /* Leave a gap for other keys in the header. */ @@ -64,6 +64,10 @@ enum iproto_key { IPROTO_TUPLE = 0x21, IPROTO_FUNCTION_NAME = 0x22, IPROTO_USER_NAME = 0x23, + /* Replication keys (body) */ + IPROTO_NODE_UUID = 0x24, + IPROTO_CLUSTER_UUID = 0x25, + IPROTO_LSNMAP = 0x26, /* Leave a gap between request keys and response keys */ IPROTO_DATA = 0x30, IPROTO_ERROR = 0x31, @@ -72,7 +76,7 @@ enum iproto_key { #define bit(c) (1ULL<<IPROTO_##c) -#define IPROTO_HEAD_BMAP (bit(CODE) | bit(SYNC) | bit(SERVER_ID) | bit(LSN)) +#define IPROTO_HEAD_BMAP (bit(CODE) | bit(SYNC) | bit(NODE_ID) | bit(LSN)) #define IPROTO_BODY_BMAP (bit(SPACE_ID) | bit(INDEX_ID) | bit(LIMIT) |\ bit(OFFSET) | bit(KEY) | bit(TUPLE) | \ bit(FUNCTION_NAME) | bit(USER_NAME)) @@ -104,7 +108,9 @@ enum iproto_request_type { IPROTO_AUTH = 7, IPROTO_DML_REQUEST_MAX = 8, IPROTO_PING = 64, - IPROTO_SUBSCRIBE = 66 + IPROTO_JOIN = 65, + IPROTO_SUBSCRIBE = 66, + IPROTO_SETLSN = 67 }; extern const char *iproto_request_type_strs[]; @@ -138,6 +144,7 @@ enum { struct iproto_packet { uint32_t code; + uint32_t node_id; uint64_t sync; uint64_t lsn; double tm; @@ -151,26 +158,11 @@ iproto_packet_decode(struct iproto_packet *packet, const char **pos, const char int iproto_packet_encode(const struct iproto_packet *packet, struct iovec *out); -struct iproto_subscribe { - uint8_t m_len; /* MP_STR */ - uint32_t v_len; /* length */ - uint8_t m_header; /* MP_MAP */ - uint8_t k_code; /* IPROTO_CODE */ - uint8_t v_code; /* response status */ - uint8_t k_sync; /* IPROTO_SYNC */ - uint8_t m_sync; /* MP_UINT64 */ - uint64_t sync; /* sync */ - uint8_t k_lsn; /* IPROTO_LSN */ - uint8_t m_lsn; /* MP_UINT64 */ - uint64_t lsn; /* lsn */ -} 
__attribute__((packed)); - -static const struct iproto_subscribe iproto_subscribe_stub = { - 0xce, mp_bswap_u32(sizeof(struct iproto_subscribe) - 5), 0x83, - IPROTO_CODE, IPROTO_SUBSCRIBE, - IPROTO_SYNC, 0xcf, 0, - IPROTO_LSN, 0xcf, 0 -}; +enum { IPROTO_ROW_IOVMAX = IPROTO_PACKET_IOVMAX + 1 }; + +int +iproto_encode_row(const struct iproto_packet *packet, struct iovec *iov, + char fixheader[IPROTO_FIXHEADER_SIZE]); #if defined(__cplusplus) } /* extern "C" */ diff --git a/src/log_io.cc b/src/log_io.cc index 475ec3c83b85f03d222eef237b3afd191efd7246..eb291c7b4918e7a73a751ba9fc761c82ae87715f 100644 --- a/src/log_io.cc +++ b/src/log_io.cc @@ -33,12 +33,13 @@ #include "fiber.h" #include "crc32.h" #include "fio.h" -#include "tarantool_eio.h" +#include "third_party/tarantool_eio.h" #include "fiob.h" #include "msgpuck/msgpuck.h" #include "iproto_constants.h" - -const uint32_t xlog_format = 12; +#include "scoped_guard.h" +#define MH_UNDEF 1 /* conflicts with mh_nodeids_t */ +#include "recovery.h" /* for mh_cluster */ /* * marker is MsgPack fixext2 @@ -51,47 +52,223 @@ const log_magic_t eof_marker = mp_bswap_u32(0xd510aded); /* host byte order */ const char inprogress_suffix[] = ".inprogress"; const char v12[] = "0.12\n"; -struct log_dir snap_dir = { - /* .panic_if_error = */ false, - /* .sync_is_async = */ false, - /* .open_wflags = */ "wxd", - /* .filetype = */ "SNAP\n", - /* .filename_ext = */ ".snap", - /* .dirname = */ NULL, - /* .mode = */ 0660 -}; - -struct log_dir wal_dir = { - /* .panic_if_error = */ false, - /* .sync_is_async = */ true, - /* .open_wflags = */ "wx", - /* .filetype = */ "XLOG\n", - /* .filename_ext = */ ".xlog", - /* .dirname = */ NULL, - /* .mode = */ 0660 -}; +/* {{{ struct log_dir */ -static int -cmp_i64(const void *_a, const void *_b) +static inline int +log_dir_map_cmp(const struct log_meta *a, const struct log_meta *b) { - const int64_t *a = (const int64_t *) _a, *b = (const int64_t *) _b; - if (*a == *b) + if (a->lsnsum != b->lsnsum) + return 
a->lsnsum - b->lsnsum; + return 0; +} + +rb_gen(, log_dir_map_, log_dir_map_t, struct log_meta, link, log_dir_map_cmp) + +static inline int +log_dir_lsnmap_cmp(const struct log_meta_lsn *a, const struct log_meta_lsn *b) +{ + if (a->node_id != b->node_id) + return a->node_id - b->node_id; + if (a->lsn != b->lsn) + return a->lsn - b->lsn; + + if (a->meta == NULL) /* a is a key */ return 0; - return (*a > *b) ? 1 : -1; + + /* logs with smaller lsnsum are first */ + if (a->meta->lsnsum != b->meta->lsnsum) + return a->meta->lsnsum - b->meta->lsnsum; + + return 0; +} + +rb_gen(, log_dir_lsnmap_, log_dir_lsnmap_t, struct log_meta_lsn, link, + log_dir_lsnmap_cmp) + +#define mh_name _nodeids +#define mh_key_t uint32_t +#define mh_node_t uint32_t +#define mh_arg_t void * +#define mh_hash(a, arg) ((*a)) +#define mh_hash_key(a, arg) (a) +#define mh_eq(a, b, arg) ((*a) == (*b)) +#define mh_eq_key(key, node, arg) (key == (*node)) +#define MH_SOURCE 1 +#include "salad/mhash.h" + +int +log_dir_create(struct log_dir *dir) +{ + memset(dir, 0, sizeof(*dir)); + dir->nodeids = mh_nodeids_new(); + if (dir->nodeids == NULL) + return -1; + log_dir_map_new(&dir->map); + log_dir_lsnmap_new(&dir->lsnmap); + return 0; +} + +static struct log_meta * +log_meta_clean(log_dir_map_t *t, struct log_meta *meta, void *arg); + +void +log_dir_destroy(struct log_dir *dir) +{ + mh_nodeids_delete(dir->nodeids); + free(dir->dirname); + log_dir_map_iter(&dir->map, NULL, log_meta_clean, dir); +} + +void +log_dir_remove_from_index(struct log_dir *dir, struct log_meta *meta) +{ + for (uint32_t i = 0; i < meta->lsn_count; i++) { + log_dir_lsnmap_remove(&dir->lsnmap, &meta->lsns[i]); + } + log_dir_map_remove(&dir->map, meta); + free(meta); } -static ssize_t -scan_dir(struct log_dir *dir, int64_t **ret_lsn) +int +log_dir_add_to_index(struct log_dir *dir, int64_t lsnsum) +{ + struct log_meta key; + key.lsnsum = lsnsum; + struct log_meta *meta = log_dir_map_search(&dir->map, &key); + if (meta != NULL) { + 
meta->remove_flag = false; + return 0; + } + + /* + * Open xlog to find SETLSN + */ + tt_uuid uuid; + struct log_io *wal = log_io_open_for_read(dir, lsnsum, &uuid, + INPROGRESS); + if (wal == NULL) + return -1; + auto log_guard = make_scoped_guard([&]{ + log_io_close(&wal); + }); + + /* + * Find SETLSN command for xlogs (must be the first) + */ + struct log_io_cursor cur; + log_io_cursor_open(&cur, wal); + struct iproto_packet packet; + if (log_io_cursor_next(&cur, &packet) != 0 || + packet.code != IPROTO_SETLSN) + return -2; + + /* + * Parse SETLSN + */ + uint32_t row_count = 0; + struct log_setlsn_row *rows = log_decode_setlsn(&packet, &row_count); + auto rows_guard = make_scoped_guard([=]{ + free(rows); + }); + + /* + * Update indexes + */ + meta = (struct log_meta *) calloc(1, sizeof(*meta) + + sizeof(*meta->lsns) * row_count); + if (meta == NULL) { + tnt_raise(ClientError, ER_MEMORY_ISSUE, sizeof(*meta), + "log_dir", "meta"); + } + auto meta_guard = make_scoped_guard([=]{ + log_dir_remove_from_index(dir, meta); + free(meta); + }); + + meta->lsnsum = lsnsum; + log_dir_map_insert(&dir->map, meta); + + meta->lsn_count = row_count; + int64_t lsnsum_check = 0; + for (uint32_t i = 0; i < row_count; i++) { + struct log_meta_lsn *meta_lsn = &meta->lsns[i]; + meta_lsn->meta = meta; + meta_lsn->node_id = rows[i].node_id; + meta_lsn->lsn = rows[i].lsn; + lsnsum_check += rows[i].lsn; + log_dir_lsnmap_insert(&dir->lsnmap, meta_lsn); + + uint32_t k; + k = mh_nodeids_find(dir->nodeids, rows[i].node_id, NULL); + if (k != mh_end(dir->nodeids)) + continue; + + /* Update the set of node_ids */ + k = mh_nodeids_put(dir->nodeids, &rows[i].node_id, NULL, NULL); + if (k == mh_end(dir->nodeids)) { + tnt_raise(ClientError, ER_MEMORY_ISSUE, sizeof(*meta), + "log_dir", "meta->nodeids"); + } + } + + /* + * Snapshots have empty starting SETLSN table. Don't check lsnsum and + * use the information derived from xlog name. 
+ */ + if (lsnsum_check != lsnsum && !dir->ignore_initial_setlsn) + tnt_raise(IllegalParams, "Invalid xlog name"); + + meta_guard.is_active = false; + return 0; +} + +static struct log_meta * +log_meta_mark(log_dir_map_t *t, struct log_meta *meta, void *arg) +{ + (void) t; + (void) arg; + meta->remove_flag = true; + return meta; +} + +static struct log_meta * +log_meta_delete(log_dir_map_t *t, struct log_meta *meta, void *arg) +{ + (void) t; + struct log_dir *dir = (struct log_dir *) arg; + if (meta->remove_flag) { + log_dir_remove_from_index(dir, meta); + return NULL; + } + + return meta; +} + +static struct log_meta * +log_meta_clean(log_dir_map_t *t, struct log_meta *meta, void *arg) +{ + (void) t; + struct log_dir *dir = (struct log_dir *) arg; + log_dir_remove_from_index(dir, meta); + return NULL; +} + +int +log_dir_scan(struct log_dir *dir) { - ssize_t result = -1; - size_t i = 0, size = 1000; ssize_t ext_len = strlen(dir->filename_ext); - int64_t *lsn = (int64_t *) region_alloc(&fiber()->gc, - sizeof(int64_t) * size); DIR *dh = opendir(dir->dirname); - if (lsn == NULL || dh == NULL) - goto out; + if (dh == NULL) { + say_syserror("error reading directory `%s'", dir->dirname); + return -1; + } + auto log_guard = make_scoped_guard([&]{ + closedir(dh); + }); + + /* Mark all items to delete */ + log_dir_map_iter(&dir->map, NULL, log_meta_mark, dir); errno = 0; struct dirent *dent; @@ -117,80 +294,128 @@ scan_dir(struct log_dir *dir, int64_t **ret_lsn) if (!ext_is_ok) continue; - lsn[i] = strtoll(dent->d_name, &ext, 10); + long long lsnsum = strtoll(dent->d_name, &ext, 10); if (strncmp(ext, dir->filename_ext, ext_len) != 0) { /* d_name doesn't parse entirely, ignore it */ say_warn("can't parse `%s', skipping", dent->d_name); continue; } - if (lsn[i] == LLONG_MAX || lsn[i] == LLONG_MIN) { + if (lsnsum == LLONG_MAX || lsnsum == LLONG_MIN) { say_warn("can't parse `%s', skipping", dent->d_name); continue; } - i++; - if (i == size) { - int64_t *n = (int64_t *) 
region_alloc(&fiber()->gc, sizeof(int64_t) * size * 2); - if (n == NULL) - goto out; - memcpy(n, lsn, sizeof(int64_t) * size); - lsn = n; - size = size * 2; - } + int rc = log_dir_add_to_index(dir, lsnsum); + if (rc != 0) + return rc; } - qsort(lsn, i, sizeof(int64_t), cmp_i64); - - *ret_lsn = lsn; - result = i; -out: - if (errno != 0) - say_syserror("error reading directory `%s'", dir->dirname); + /* Delete marked items */ + log_dir_map_iter(&dir->map, NULL, log_meta_delete, dir); - if (dh != NULL) - closedir(dh); - return result; + return 0; } int64_t -greatest_lsn(struct log_dir *dir) +log_dir_greatest(struct log_dir *dir) { - int64_t *lsn; - ssize_t count = scan_dir(dir, &lsn); + struct log_meta *meta = log_dir_map_last(&dir->map); + if (meta == NULL) + return -1; + return meta->lsnsum; +} - if (count <= 0) - return count; +static inline struct log_meta_lsn * +log_dir_lsnmap_lesearch(log_dir_lsnmap_t *tree, struct log_meta_lsn *key) +{ + struct log_meta_lsn *node = log_dir_lsnmap_psearch(tree, key); + if (node == NULL || node->node_id != key->node_id) + return NULL; + + int64_t lsn = node->lsn; + while (1) { + struct log_meta_lsn *next = log_dir_lsnmap_next(tree, node); + if (next == NULL || next->node_id != key->node_id || + next->lsn != lsn) + break; + node = next; + }; + return node; +} - return lsn[count - 1]; +static inline struct log_meta_lsn * +log_dir_lsnmap_gtsearch(log_dir_lsnmap_t *tree, struct log_meta_lsn *key) +{ + struct log_meta_lsn *node = log_dir_lsnmap_nsearch(tree, key); + if (node == NULL || node->node_id != key->node_id) + return NULL; + + int64_t lsn = node->lsn; + while (1) { + struct log_meta_lsn *prev = log_dir_lsnmap_prev(tree, node); + if (prev == NULL || prev->node_id != key->node_id || + prev->lsn != lsn) + break; + node = prev; + }; + return node; } int64_t -find_including_file(struct log_dir *dir, int64_t target_lsn) +log_dir_next(struct log_dir *dir, struct mh_cluster_t *cluster) { - int64_t *lsn; - ssize_t count = scan_dir(dir, 
&lsn); + int64_t result = INT64_MAX; + uint32_t k; + mh_foreach(dir->nodeids, k) { + /* + * Find file where lsn <= key.lsn for given node_id + */ + struct log_meta_lsn key; + key.node_id = *mh_nodeids_node(dir->nodeids, k); + key.lsn = 0; + key.meta = NULL; /* this node is a key */ + uint32_t m = mh_cluster_find(cluster, key.node_id, NULL); + if (m != mh_end(cluster)) { + struct node *node = *mh_cluster_node(cluster, m); + key.lsn = node->current_lsn; + } - if (count <= 0) - return count; + struct log_meta *meta = NULL; - while (count > 1) { - if (*lsn <= target_lsn && target_lsn < *(lsn + 1)) { - goto out; - return *lsn; + /* + * Find tree node with greatest node.meta.lsnsum where + * node.node_id == key.node_id, node.lsn <= key.lsn + */ + struct log_meta_lsn *meta_lsn = + log_dir_lsnmap_lesearch(&dir->lsnmap, &key); + if (meta_lsn == NULL) { + /* + * Find tree node with smallest node.meta.lsnsum where + * node.node_id == key.node_id, node.lsn > key.lsn + */ + meta_lsn = log_dir_lsnmap_gtsearch(&dir->lsnmap, &key); + if (meta_lsn == NULL) + return INT64_MAX; /* Not found */ + + /* + * Take a previous file + */ + meta = log_dir_map_prev(&dir->map, meta_lsn->meta); + if (meta == NULL) + return INT64_MAX; /* Not found */ + } else { + meta = meta_lsn->meta; } - lsn++; - count--; - } - /* - * we can't check here for sure will or will not last file - * contain record with desired lsn since number of rows in file - * is not known beforehand. so, we simply return the last one. 
- */ + /* + * Find min([file.lsnsum]) + */ + if (meta->lsnsum < result) + result = meta->lsnsum; + } - out: - return *lsn; + return result; } char * @@ -203,6 +428,95 @@ format_filename(struct log_dir *dir, int64_t lsn, enum log_suffix suffix) return filename; } +void +log_encode_setlsn(struct iproto_packet *packet, struct mh_cluster_t *cluster) +{ + memset(packet, 0, sizeof(*packet)); + packet->code = IPROTO_SETLSN; + /* node_id and lsn should be set to zero for SETLSN command */ + assert(packet->node_id == 0 && packet->lsn == 0); + + uint32_t cluster_size = cluster != NULL ? mh_size(cluster) : 0; + size_t size = 128 + cluster_size * + (mp_sizeof_uint(UINT32_MAX) + mp_sizeof_uint(UINT64_MAX)); + char *buf = (char *) region_alloc(&fiber()->gc, size); + char *data = buf; + data = mp_encode_map(data, 1); + data = mp_encode_uint(data, IPROTO_LSNMAP); + data = mp_encode_map(data, cluster_size); + if (cluster != NULL) { + uint32_t k; + mh_foreach(cluster, k) { + struct node *node = *mh_cluster_node(cluster, k); + data = mp_encode_uint(data, node->id); + data = mp_encode_uint(data, node->current_lsn); + } + } + assert(data <= buf + size); + packet->body[0].iov_base = buf; + packet->body[0].iov_len = (data - buf); + packet->bodycnt = 1; +} + +struct log_setlsn_row * +log_decode_setlsn(struct iproto_packet *packet, uint32_t *p_row_count) +{ + if (packet->bodycnt == 0) + tnt_raise(ClientError, ER_INVALID_MSGPACK, "SETLSN body"); + const char *data = (const char *) packet->body[0].iov_base; + const char *d = data; + if (mp_typeof(*data) != MP_MAP) { + tnt_raise(ClientError, ER_INVALID_MSGPACK, + "SETLSN request body"); + } + const char *lsnmap = NULL; + uint32_t map_size = mp_decode_map(&d); + for (uint32_t i = 0; i < map_size; i++) { + if (mp_typeof(*d) != MP_UINT) { + mp_next(&d); /* key */ + mp_next(&d); /* value */ + continue; + } + uint8_t key = mp_decode_uint(&d); + switch (key) { + case IPROTO_LSNMAP: + if (mp_typeof(*d) != MP_MAP) { + tnt_raise(ClientError, 
ER_INVALID_MSGPACK, + "invalid LSN Map"); + } + lsnmap = d; + mp_next(&d); + break; + default: + mp_next(&d); /* value */ + } + } + + if (lsnmap == NULL) + tnt_raise(ClientError, ER_INVALID_MSGPACK, "missing LSNMAP"); + + d = lsnmap; + uint32_t row_count = mp_decode_map(&d); + struct log_setlsn_row *rows = (struct log_setlsn_row *) + calloc(row_count, sizeof(*rows)); + if (rows == NULL) { + tnt_raise(LoggedError, ER_MEMORY_ISSUE, sizeof(*rows), + "log_index", "meta"); + } + + for (uint32_t i = 0; i < row_count; i++) { + if (mp_typeof(*d) != MP_UINT) + tnt_raise(ClientError, ER_INVALID_MSGPACK, "LSNMAP"); + rows[i].node_id = mp_decode_uint(&d); + if (mp_typeof(*d) != MP_UINT) + tnt_raise(ClientError, ER_INVALID_MSGPACK, "LSNMAP"); + rows[i].lsn = mp_decode_uint(&d); + } + + *p_row_count = row_count; + return rows; +} + /* }}} */ /* {{{ struct log_io_cursor */ @@ -554,12 +868,18 @@ log_io_sync(struct log_io *l) return 0; } +#define NODE_UUID_KEY "Node" + static int -log_io_write_header(struct log_io *l) +log_io_write_meta(struct log_io *l, const tt_uuid *node_uuid) { - int ret = fprintf(l->f, "%s%s\n", l->dir->filetype, v12); + if (fprintf(l->f, "%s%s", l->dir->filetype, v12) < 0 || + fprintf(l->f, NODE_UUID_KEY ": %s\n\n", + tt_uuid_str(node_uuid)) < 0) { + return -1; + } - return ret < 0 ? -1 : 0; + return 0; } /** @@ -571,7 +891,8 @@ log_io_write_header(struct log_io *l) * @return 0 if success, -1 on error. 
*/ static int -log_io_verify_meta(struct log_io *l, const char **errmsg) +log_io_verify_meta(struct log_io *l, tt_uuid *node_uuid, + const char **errmsg) { char filetype[32], version[32], buf[256]; struct log_dir *dir = l->dir; @@ -596,8 +917,30 @@ log_io_verify_meta(struct log_io *l, const char **errmsg) *errmsg = "failed to read log file header"; goto error; } - if (strcmp(buf, "\n") == 0 || strcmp(buf, "\r\n") == 0) + if (strcmp(buf, "\n") == 0) break; + + /* Parse RFC822-like string */ + char *end = buf + strlen(buf); + if (end > buf && *(end - 1) == '\n') *(--end) = 0; /* skip \n */ + char *key = buf; + char *val = strchr(buf, ':'); + if (val == NULL) { + *errmsg = "invalid meta"; + goto error; + } + *(val++) = 0; + while (*val == ' ') ++val; /* skip starting spaces */ + + if (strcmp(key, NODE_UUID_KEY) == 0) { + if ((end - val) != UUID_STR_LEN || + tt_uuid_from_string(val, node_uuid) != 0) { + *errmsg = "can't parse node uuid"; + goto error; + } + } else { + /* Skip unknown key */ + } } return 0; error: @@ -605,8 +948,8 @@ log_io_verify_meta(struct log_io *l, const char **errmsg) } struct log_io * -log_io_open(struct log_dir *dir, enum log_mode mode, - const char *filename, enum log_suffix suffix, FILE *file) +log_io_open(struct log_dir *dir, enum log_mode mode, const char *filename, + tt_uuid *node_uuid, enum log_suffix suffix, FILE *file) { struct log_io *l = NULL; int save_errno; @@ -630,11 +973,11 @@ log_io_open(struct log_dir *dir, enum log_mode mode, l->dir = dir; l->is_inprogress = suffix == INPROGRESS; if (mode == LOG_READ) { - if (log_io_verify_meta(l, &errmsg) != 0) + if (log_io_verify_meta(l, node_uuid, &errmsg) != 0) goto error; } else { /* LOG_WRITE */ setvbuf(l->f, NULL, _IONBF, 0); - if (log_io_write_header(l) != 0) { + if (log_io_write_meta(l, node_uuid) != 0) { errmsg = strerror(errno); goto error; } @@ -652,13 +995,16 @@ log_io_open(struct log_dir *dir, enum log_mode mode, } struct log_io * -log_io_open_for_read(struct log_dir *dir, int64_t 
lsn, enum log_suffix suffix) +log_io_open_for_read(struct log_dir *dir, int64_t lsnsum, + tt_uuid *node_uuid, enum log_suffix suffix) { - assert(lsn != 0); - - const char *filename = format_filename(dir, lsn, suffix); + const char *filename = format_filename(dir, lsnsum, suffix); FILE *f = fopen(filename, "r"); - return log_io_open(dir, LOG_READ, filename, suffix, f); + if (suffix == INPROGRESS && f == NULL) { + filename = format_filename(dir, lsnsum, NONE); + f = fopen(filename, "r"); + } + return log_io_open(dir, LOG_READ, filename, node_uuid, suffix, f); } /** @@ -666,7 +1012,8 @@ log_io_open_for_read(struct log_dir *dir, int64_t lsn, enum log_suffix suffix) * and sets errno. */ struct log_io * -log_io_open_for_write(struct log_dir *dir, int64_t lsn, enum log_suffix suffix) +log_io_open_for_write(struct log_dir *dir, int64_t lsn, tt_uuid *node_uuid, + enum log_suffix suffix) { char *filename; FILE *f; @@ -692,7 +1039,7 @@ log_io_open_for_write(struct log_dir *dir, int64_t lsn, enum log_suffix suffix) if (!f) goto error; say_info("creating `%s'", filename); - return log_io_open(dir, LOG_WRITE, filename, suffix, f); + return log_io_open(dir, LOG_WRITE, filename, node_uuid, suffix, f); error: say_syserror("%s: failed to open `%s'", __func__, filename); return NULL; diff --git a/src/log_io.h b/src/log_io.h index 764d767091f11a60bd7531788748c9e2610b54b6..adc783c7252f3635bf50e29325deb63a9d136c63 100644 --- a/src/log_io.h +++ b/src/log_io.h @@ -33,13 +33,12 @@ #include <stdbool.h> #include <sys/uio.h> #include "trivia/util.h" -#include "tarantool_ev.h" +#include "third_party/tarantool_ev.h" #include "iproto_constants.h" +#include "tt_uuid.h" extern const uint32_t xlog_format; -enum log_format { WAL = 65534 }; - enum log_mode { LOG_READ, LOG_WRITE @@ -47,6 +46,48 @@ enum log_mode { enum log_suffix { NONE, INPROGRESS }; +struct log_meta; +struct log_meta_lsn; + +#define RB_COMPACT 1 +#include <third_party/rb.h> + +/* Used by internal functions */ +struct log_meta_lsn { + 
rb_node(struct log_meta_lsn) link; + int32_t node_id; + int64_t lsn; + struct log_meta *meta; +}; + +/* Used by internal functions */ +struct log_meta { + rb_node(struct log_meta) link; + int64_t lsnsum; + bool remove_flag; /* used internally */ + uint32_t lsn_count; + struct log_meta_lsn lsns[0]; /* [0] is better for clang */ +}; + +/* + * Map: (lsnsum) => (struct log_meta) + */ + +typedef rb_tree(struct log_meta) log_dir_map_t; +rb_proto(, log_dir_map_, log_dir_map_t, struct log_meta) + +/* + * Map: (node_id, lsn) => (struct log_meta) + */ + +typedef rb_tree(struct log_meta_lsn) log_dir_lsnmap_t; +rb_proto(, log_dir_lsnmap_, log_dir_lsnmap_t, struct log_meta_lsn) + +/* + * Set: (node_id) - defined in .cc + */ +struct mh_nodeids_t; + struct log_dir { bool panic_if_error; /** @@ -54,6 +95,8 @@ struct log_dir { * in a separate thread. */ bool sync_is_async; + /* don't check that sum(setlsn) == lsnsum in filename (for snaps) */ + bool ignore_initial_setlsn; /* Additional flags to apply at fopen(2) to write. */ char open_wflags[6]; @@ -62,17 +105,40 @@ struct log_dir { char *dirname; /** File create mode in this directory. 
*/ mode_t mode; + + /* Directory indexes for log_dir_next() */ + log_dir_lsnmap_t lsnmap; + log_dir_map_t map; + struct mh_nodeids_t *nodeids; }; -extern struct log_dir snap_dir; -extern struct log_dir wal_dir; +int +log_dir_create(struct log_dir *dir); +void +log_dir_destroy(struct log_dir *dir); + +int +log_dir_scan(struct log_dir *dir); + +int64_t +log_dir_greatest(struct log_dir *dir); int64_t -greatest_lsn(struct log_dir *dir); +log_dir_next(struct log_dir *dir, struct mh_cluster_t *cluster); + char * format_filename(struct log_dir *dir, int64_t lsn, enum log_suffix suffix); -int64_t -find_including_file(struct log_dir *dir, int64_t target_lsn); + +void +log_encode_setlsn(struct iproto_packet *packet, struct mh_cluster_t *cluster); + +struct log_setlsn_row { + uint32_t node_id; + int64_t lsn; +}; + +struct log_setlsn_row * +log_decode_setlsn(struct iproto_packet *packet, uint32_t *p_size); struct log_io { struct log_dir *dir; @@ -87,12 +153,14 @@ struct log_io { }; struct log_io * -log_io_open_for_read(struct log_dir *dir, int64_t lsn, enum log_suffix suffix); +log_io_open_for_read(struct log_dir *dir, int64_t lsn, tt_uuid *node_uuid, + enum log_suffix suffix); struct log_io * -log_io_open_for_write(struct log_dir *dir, int64_t lsn, enum log_suffix suffix); +log_io_open_for_write(struct log_dir *dir, int64_t lsn, + tt_uuid *node_uuid, enum log_suffix suffix); struct log_io * -log_io_open(struct log_dir *dir, enum log_mode mode, - const char *filename, enum log_suffix suffix, FILE *file); +log_io_open(struct log_dir *dir, enum log_mode mode, const char *filename, + tt_uuid *node_uuid, enum log_suffix suffix, FILE *file); int log_io_sync(struct log_io *l); int diff --git a/src/lua/info.cc b/src/lua/info.cc index e4e4599a0175bcb7c39e407860579f06f00f24bf..4c523db4adc619cf6dfffdfdd2326f64e1933adb 100644 --- a/src/lua/info.cc +++ b/src/lua/info.cc @@ -63,9 +63,25 @@ lbox_info_recovery_last_update_tstamp(struct lua_State *L) } static int -lbox_info_lsn(struct 
lua_State *L) +lbox_info_node(struct lua_State *L) { - luaL_pushnumber64(L, recovery_state->confirmed_lsn); + lua_pushlstring(L, tt_uuid_str(&recovery_state->node_uuid), UUID_STR_LEN); + return 1; +} + +static int +lbox_info_cluster(struct lua_State *L) +{ + uint32_t cluster_size = mh_size(recovery_state->cluster); + lua_createtable(L, 0, cluster_size); + uint32_t k; + mh_foreach(recovery_state->cluster, k) { + struct node *node = *mh_cluster_node(recovery_state->cluster,k); + lua_pushlstring(L, tt_uuid_str(&node->uuid), UUID_STR_LEN); + luaL_pushnumber64(L, node->confirmed_lsn); + lua_settable(L, -3); + } + return 1; } @@ -103,7 +119,8 @@ lbox_info_dynamic_meta [] = { {"recovery_lag", lbox_info_recovery_lag}, {"recovery_last_update", lbox_info_recovery_last_update_tstamp}, - {"lsn", lbox_info_lsn}, + {"cluster", lbox_info_cluster}, + {"node", lbox_info_node}, {"status", lbox_info_status}, {"uptime", lbox_info_uptime}, {"snapshot_pid", lbox_info_snapshot_pid}, diff --git a/src/lua/uuid.lua b/src/lua/uuid.lua index 0fe76633baef4368bfb45413d851c24dfb5840b3..72c50e811e6dfe680796e440f1ce9d770fdea563 100644 --- a/src/lua/uuid.lua +++ b/src/lua/uuid.lua @@ -10,22 +10,15 @@ int snprintf(char *str, size_t size, const char *format, ...); ]] - local libuuid = nil local builtin = ffi.C - function check_libs() - if libuuid then return end - libuuid = ffi.load('uuid.so.1') - end box.uuid = function() - check_libs() local uuid = ffi.new('uuid_t') - libuuid.uuid_generate(uuid) + builtin.uuid_generate(uuid) return ffi.string(uuid, 16) end box.uuid_hex = function() - check_libs() local uuid = ffi.new('uuid_t') - libuuid.uuid_generate(uuid) + builtin.uuid_generate(uuid) local uuid_hex = ffi.new('char[33]') for i = 0,ffi.sizeof('uuid_t'),1 do builtin.snprintf(uuid_hex + i * 2, 3, "%02x", diff --git a/src/recovery.cc b/src/recovery.cc index 078ca53786981faa20423120337ab002b5a8f123..4a06d01cc73e8b9e1b485f755e9cb032031573a8 100644 --- a/src/recovery.cc +++ b/src/recovery.cc @@ -26,6 
+26,7 @@ * THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ +#define MH_SOURCE 1 #include "recovery.h" #include <fcntl.h> @@ -43,6 +44,8 @@ #include "msgpuck/msgpuck.h" #include "iproto_constants.h" #include "crc32.h" +#include "scoped_guard.h" +#include "box/cluster.h" /* * Recovery subsystem @@ -110,90 +113,111 @@ struct recovery_state *recovery_state; const char *wal_mode_STRS[] = { "none", "write", "fsync", "fsync_delay", NULL }; -/* {{{ LSN API */ +/* {{{ mh_cluster definition */ +/** Removes all nodes from mhash */ void -wait_lsn_set(struct wait_lsn *wait_lsn, int64_t lsn) +mh_cluster_clean(struct mh_cluster_t *hash) { - assert(wait_lsn->waiter == NULL); - wait_lsn->waiter = fiber(); - wait_lsn->lsn = lsn; + while (mh_size(hash) > 0) { + mh_int_t k = mh_first(hash); + struct node *node = *mh_cluster_node(hash, k); + mh_cluster_del(hash, k, NULL); + free(node); + } } +/** Gets or creates a node */ +struct node * +mh_cluster_fetch(struct mh_cluster_t *hash, uint32_t node_id) +{ + uint32_t k = mh_cluster_find(hash, node_id, NULL); + if (k != mh_end(hash)) + return *mh_cluster_node(hash, k); + + /* Create node if it doesn't exist */ + struct node *node = (struct node *) calloc(1, sizeof(*node)); + if (node == NULL) + return NULL; + node->id = node_id; + k = mh_cluster_put(hash, (const struct node **) &node, NULL, NULL); + if (k == mh_end(hash)) + return NULL; + return node; +} -/* Alert the waiter, if any. 
*/ -static inline void -wakeup_lsn_waiter(struct recovery_state *r) +/** Calculates sum([node.current_lsn]) */ +static int64_t +mh_cluster_current_sum(struct mh_cluster_t *cluster) { - if (r->wait_lsn.waiter && r->confirmed_lsn >= r->wait_lsn.lsn) { - fiber_wakeup(r->wait_lsn.waiter); + int64_t sum = 0; + uint32_t k; + mh_foreach(cluster, k) { + struct node *node = *mh_cluster_node(cluster, k); + sum += node->current_lsn; } + + return sum; } -void -confirm_lsn(struct recovery_state *r, int64_t lsn, bool is_commit) +/* }}} */ + +/* {{{ LSN API */ + +static struct node * +fill_lsn(struct recovery_state *r, struct iproto_packet *packet) { - assert(r->confirmed_lsn <= r->lsn); + struct node *node = r->local_node; + assert(packet != NULL || node != NULL); + if (packet == NULL || packet->node_id == 0) { + /* Local request */ + if (node == NULL) + tnt_raise(ClientError, ER_LOCAL_NODE_IS_NOT_ACTIVE); + ++node->current_lsn; + if (packet != NULL) { + packet->lsn = node->current_lsn; + packet->node_id = node->id; + } + } else { + /* Remote request */ + uint32_t k = mh_cluster_find(r->cluster, packet->node_id, NULL); + if (k == mh_end(r->cluster)) + tnt_raise(ClientError, ER_UNKNOWN_NODE, packet->node_id); + node = *mh_cluster_node(r->cluster, k); + node->current_lsn = packet->lsn; + } + + return node; +} - if (r->confirmed_lsn < lsn) { +static void +confirm_lsn(struct node *node, int64_t lsn, bool is_commit) +{ + if (node->confirmed_lsn < lsn) { if (is_commit) { - if (r->confirmed_lsn + 1 != lsn) - say_warn("non consecutive LSN, confirmed: %jd, " - " new: %jd, diff: %jd", - (intmax_t) r->confirmed_lsn, + if (node->confirmed_lsn + 1 != lsn) { + say_warn("non consecutive LSN for node %u (%s) " + "confirmed: %jd, new: %jd, diff: %jd", + (unsigned) node->id, + tt_uuid_str(&node->uuid), + (intmax_t) node->confirmed_lsn, (intmax_t) lsn, - (intmax_t) (lsn - r->confirmed_lsn)); - r->confirmed_lsn = lsn; + (intmax_t) (lsn - node->confirmed_lsn)); + } + node->confirmed_lsn = lsn; } } 
else { - /* + /* * There can be holes in * confirmed_lsn, in case of disk write failure, but * wal_writer never confirms LSNs out order. */ - assert(false); - say_error("LSN is used twice or COMMIT order is broken: " - "confirmed: %jd, new: %jd", - (intmax_t) r->confirmed_lsn, (intmax_t) lsn); - } - wakeup_lsn_waiter(r); -} - -void -set_lsn(struct recovery_state *r, int64_t lsn) -{ - r->lsn = lsn; - r->confirmed_lsn = lsn; - say_debug("set_lsn(%p, %" PRIi64, r, r->lsn); - wakeup_lsn_waiter(r); -} - -/** Wait until the given LSN makes its way to disk. */ -void -recovery_wait_lsn(struct recovery_state *r, int64_t lsn) -{ - while (lsn < r->confirmed_lsn) { - wait_lsn_set(&r->wait_lsn, lsn); - try { - fiber_yield(); - wait_lsn_clear(&r->wait_lsn); - } catch (Exception *e) { - wait_lsn_clear(&r->wait_lsn); - throw; - } + panic("LSN for %s is used twice or COMMIT order is broken: " + "confirmed: %jd, new: %jd", tt_uuid_str(&node->uuid), + (intmax_t) node->confirmed_lsn, (intmax_t) lsn); } } - -int64_t -next_lsn(struct recovery_state *r) -{ - r->lsn++; - say_debug("next_lsn(%p, %" PRIi64, r, r->lsn); - return r->lsn; -} - - /* }}} */ /* {{{ Initial recovery */ @@ -208,7 +232,8 @@ recovery_stop_local(struct recovery_state *r); void recovery_init(const char *snap_dirname, const char *wal_dirname, row_handler row_handler, void *row_handler_param, - snapshot_handler snapshot_handler, int rows_per_wal) + snapshot_handler snapshot_handler, join_handler join_handler, + int rows_per_wal) { assert(recovery_state == NULL); recovery_state = (struct recovery_state *) calloc(1, sizeof(struct recovery_state)); @@ -222,16 +247,52 @@ recovery_init(const char *snap_dirname, const char *wal_dirname, r->row_handler_param = row_handler_param; r->snapshot_handler = snapshot_handler; + r->join_handler = join_handler; + + log_dir_create(&r->snap_dir); + r->snap_dir.panic_if_error = false; + r->snap_dir.sync_is_async = false; + strcpy(r->snap_dir.open_wflags, "wxd"); + r->snap_dir.filetype = 
"SNAP\n"; + r->snap_dir.filename_ext = ".snap"; + r->snap_dir.dirname = strdup(snap_dirname); + r->snap_dir.mode = 0660; + r->snap_dir.ignore_initial_setlsn = true; + + log_dir_create(&r->wal_dir); + r->wal_dir.panic_if_error = false; + r->wal_dir.sync_is_async = true; + strcpy(r->wal_dir.open_wflags, "wx"); + r->wal_dir.filetype = "XLOG\n"; + r->wal_dir.filename_ext = ".xlog"; + r->wal_dir.dirname = strdup(wal_dirname); + r->wal_dir.mode = 0660; - r->snap_dir = &snap_dir; - r->snap_dir->dirname = strdup(snap_dirname); - r->wal_dir = &wal_dir; - r->wal_dir->dirname = strdup(wal_dirname); if (r->wal_mode == WAL_FSYNC) { - (void) strcat(r->wal_dir->open_wflags, "s"); + (void) strcat(r->wal_dir.open_wflags, "s"); } r->rows_per_wal = rows_per_wal; - wait_lsn_clear(&r->wait_lsn); + + r->cluster = mh_cluster_new(); + if (r->cluster == NULL) + panic("cannot reallocate r->cluster"); + + /* Add a fake node for snapshot/bootstrap */ + struct node *node = (struct node *) calloc(1, sizeof(*node)); + if (node == NULL) + panic("cannot allocate struct node"); + node->id = 0; + assert(tt_uuid_is_nil(&node->uuid)); + uint32_t k = mh_cluster_put(r->cluster, + (const struct node **) &node, NULL, NULL); + if (k == mh_end(r->cluster)) + panic("cannot reallocate r->cluster"); + r->local_node = node; + + if (log_dir_scan(&r->snap_dir) != 0) + panic("can't scan snap directory"); + if (log_dir_scan(&r->wal_dir) != 0) + panic("can't scan wal directory"); } void @@ -274,8 +335,8 @@ recovery_free() if (r->writer) wal_writer_stop(r); - free(r->snap_dir->dirname); - free(r->wal_dir->dirname); + log_dir_destroy(&r->snap_dir); + log_dir_destroy(&r->wal_dir); if (r->current_wal) { /* * Possible if shutting down a replication @@ -284,50 +345,123 @@ recovery_free() log_io_close(&r->current_wal); } + mh_cluster_clean(r->cluster); + mh_cluster_delete(r->cluster); + recovery_state = NULL; } void recovery_setup_panic(struct recovery_state *r, bool on_snap_error, bool on_wal_error) { - 
r->wal_dir->panic_if_error = on_wal_error; - r->snap_dir->panic_if_error = on_snap_error; + r->wal_dir.panic_if_error = on_wal_error; + r->snap_dir.panic_if_error = on_snap_error; +} + +static void +recovery_process_setlsn(struct recovery_state *r, struct iproto_packet *packet) +{ + say_debug("SETLSN"); + uint32_t row_count; + struct log_setlsn_row *rows = log_decode_setlsn(packet, &row_count); + auto rows_guard = make_scoped_guard([=]{ + free(rows); + }); + + for (uint32_t i = 0; i < row_count; i++) { + uint32_t k = mh_cluster_find(r->cluster, rows[i].node_id, NULL); + if (k == mh_end(r->cluster)) + tnt_raise(ClientError, ER_UNKNOWN_NODE, rows[i].node_id); + + struct node *node = *mh_cluster_node(r->cluster, k); + assert(node->confirmed_lsn == node->current_lsn); + + if (node->current_lsn <= rows[i].lsn) { + say_debug("setting\t(%2u, %020lld)", + node->id, (long long) rows[i].lsn); + node->confirmed_lsn = node->current_lsn = rows[i].lsn; + } else { + /* Ignore outdated SETLSN rows */ + say_debug("skipping\t(%2u, %020lld)", + node->id, (long long) rows[i].lsn); + } + } + say_debug("--"); } -/** Write the bootstrap snapshot. - * - * @return panics on error - * Errors are logged to the log file. 
- */ void -init_storage_on_master(struct log_dir *dir) +recovery_process(struct recovery_state *r, struct iproto_packet *packet) { - const char *filename = format_filename(dir, 1 /* lsn */, NONE); - int fd = open(filename, O_EXCL|O_CREAT|O_WRONLY, dir->mode); - say_info("saving snapshot `%s'", filename); - if (fd == -1) { - panic_syserror("failed to open snapshot file `%s' for " - "writing", filename); + if (r->relay) + return r->row_handler(r->row_handler_param, packet); + + if (!iproto_request_is_dml(packet->code)) { + /* Process admin commands (node_id, lsn are ignored) */ + switch (packet->code) { + case IPROTO_SETLSN: + recovery_process_setlsn(r, packet); + break; + default: + tnt_raise(ClientError, ER_UNKNOWN_REQUEST_TYPE, + packet->code); + } + return; } - if (write(fd, bootstrap_bin, sizeof(bootstrap_bin)) != - sizeof(bootstrap_bin)) { - panic_syserror("failed to write to snapshot file `%s'", - filename); + + /* Check node_id and lsn */ + uint32_t k = mh_cluster_find(r->cluster, packet->node_id, NULL); + if (k != mh_end(r->cluster)) { + struct node *node = *mh_cluster_node(r->cluster, k); + if (packet->lsn <= node->confirmed_lsn) { + say_debug("skipping too young row"); + return; + } + } else { + say_warn("skipping row with unknown node_id"); + return; } - close(fd); + + return r->row_handler(r->row_handler_param, packet); +} + +void +cluster_bootstrap(struct recovery_state *r) +{ + /* Generate Node-UUID */ + tt_uuid_create(&r->node_uuid); + + /* Recover from bootstrap.snap */ + say_info("initializing cluster"); + FILE *f = fmemopen((void *) &bootstrap_bin, + sizeof(bootstrap_bin), "r"); + tt_uuid bootstrap_uuid; /* ignored */ + struct log_io *snap = log_io_open(&r->snap_dir, LOG_READ, + "bootstrap.snap", &bootstrap_uuid, NONE, f); + assert(snap != NULL); + auto snap_guard = make_scoped_guard([&]{ + log_io_close(&snap); + }); + + int rc = recover_wal(r, snap); + + if (rc != 0) + panic("failed to bootstrap data directory"); + + /* Initialize local node */ + 
r->join_handler(&r->node_uuid); + assert(r->local_node != NULL); + assert(r->local_node->id == 1); + assert(tt_uuid_cmp(&r->local_node->uuid, &r->node_uuid) == 0); + say_info("done"); } -/** - * Read a snapshot and call row_handler for every snapshot row. - * Panic in case of error. - */ /** * Read a snapshot and call row_handler for every snapshot row. * Panic in case of error. */ void -recover_snap(struct recovery_state *r, const char *replication_source) +recover_snap(struct recovery_state *r) { /* current_wal isn't open during initial recover. */ assert(r->current_wal == NULL); @@ -335,57 +469,32 @@ recover_snap(struct recovery_state *r, const char *replication_source) struct log_io *snap; int64_t lsn; - int rc = 0; - - lsn = greatest_lsn(r->snap_dir); - if (lsn == 0 && greatest_lsn(r->wal_dir) == 0) { - say_info("found an empty data directory, initializing..."); - if (replication_source) { - /* play rows and save snapshot */ - replica_bootstrap(r, replication_source); - snapshot_save(r); - assert(r->lsn == greatest_lsn(r->snap_dir)); - return; - } else { - init_storage_on_master(r->snap_dir); - lsn = greatest_lsn(r->snap_dir); - } - } + if (log_dir_scan(&r->snap_dir) != 0) { + say_error("can't find snapshot"); + goto error; + } + lsn = log_dir_greatest(&r->snap_dir); if (lsn <= 0) { say_error("can't find snapshot"); goto error; } - snap = log_io_open_for_read(r->snap_dir, lsn, NONE); + snap = log_io_open_for_read(&r->snap_dir, lsn, &r->node_uuid, NONE); if (snap == NULL) { say_error("can't find/open snapshot"); goto error; } - say_info("recover from `%s'", snap->filename); - struct log_io_cursor i; - log_io_cursor_open(&i, snap); - - struct iproto_packet packet; - while (log_io_cursor_next(&i, &packet) == 0) { - if (r->row_handler(r->row_handler_param, &packet) < 0) { - say_error("can't apply row"); - if (snap->dir->panic_if_error) - break; - rc = 1; - } + if (tt_uuid_is_nil(&r->node_uuid)) { + say_error("can't find node uuid in snapshot"); + goto error; } - 
log_io_cursor_close(&i); - log_io_close(&snap); - if (rc == 0) { - r->lsn = r->confirmed_lsn = lsn; - say_info("snapshot recovered, confirmed lsn: %" - PRIi64, r->confirmed_lsn); + say_info("recover from `%s'", snap->filename); + if (recover_wal(r, snap) == 0) return; - } error: - if (greatest_lsn(r->snap_dir) <= 0) { + if (log_dir_greatest(&r->snap_dir) <= 0) { say_crit("didn't you forget to initialize storage with --init-storage switch?"); _exit(1); } @@ -409,20 +518,20 @@ recover_wal(struct recovery_state *r, struct log_io *l) struct iproto_packet packet; while (log_io_cursor_next(&i, &packet) == 0) { - if (packet.lsn <= r->confirmed_lsn) { - say_debug("skipping too young row"); - continue; - } /* * After handler(row) returned, row may be * modified, do not use it. */ - if (r->row_handler(r->row_handler_param, &packet) < 0) { - say_error("can't apply row"); + try { + recovery_process(r, &packet); + } catch (SocketError *e) { + say_error("can't apply row: %s", e->errmsg()); + goto end; + } catch (Exception *e) { + say_error("can't apply row: %s", e->errmsg()); if (l->dir->panic_if_error) goto end; } - set_lsn(r, packet.lsn); } res = i.eof_read ? LOG_EOF : 1; end: @@ -448,35 +557,29 @@ recover_remaining_wals(struct recovery_state *r) char *filename; enum log_suffix suffix; - current_lsn = r->confirmed_lsn + 1; - wal_greatest_lsn = greatest_lsn(r->wal_dir); + if (log_dir_scan(&r->wal_dir) != 0) + return -1; + wal_greatest_lsn = log_dir_greatest(&r->wal_dir); /* if the caller already opened WAL for us, recover from it first */ if (r->current_wal != NULL) goto recover_current_wal; - while (r->confirmed_lsn < wal_greatest_lsn) { - /* - * If a newer WAL appeared in the directory before - * current_wal was fully read, try re-reading - * one last time. 
*/ - if (r->current_wal != NULL) { - if (r->current_wal->retry++ < 3) { - say_warn("`%s' has no EOF marker, yet a newer WAL file exists:" - " trying to re-read (attempt #%d)", - r->current_wal->filename, r->current_wal->retry); - goto recover_current_wal; - } else { - say_warn("WAL `%s' wasn't correctly closed", - r->current_wal->filename); - log_io_close(&r->current_wal); + while (1) { +find_next_wal: + current_lsn = log_dir_next(&r->wal_dir, r->cluster); + if (current_lsn == INT64_MAX) + break; /* No more WALs */ + + if (current_lsn == r->lsnsum) { + if (current_lsn != wal_greatest_lsn) { + say_error("missing xlog between %020lld and %020lld", + (long long) current_lsn, + (long long) wal_greatest_lsn); } + break; } - /* TODO: find a better way of finding the next xlog */ - current_lsn = r->confirmed_lsn; -find_next_wal: - current_lsn++; /* * For the last WAL, first try to open .inprogress * file: if it doesn't exist, we can safely try an @@ -487,13 +590,13 @@ recover_remaining_wals(struct recovery_state *r) suffix = INPROGRESS; if (current_lsn == wal_greatest_lsn) { /* Last WAL present at the time of rescan. */ - filename = format_filename(r->wal_dir, + filename = format_filename(&r->wal_dir, current_lsn, suffix); f = fopen(filename, "r"); } if (f == NULL) { suffix = NONE; - filename = format_filename(r->wal_dir, + filename = format_filename(&r->wal_dir, current_lsn, suffix); f = fopen(filename, "r"); /* @@ -504,12 +607,14 @@ recover_remaining_wals(struct recovery_state *r) current_lsn < wal_greatest_lsn) goto find_next_wal; } - next_wal = log_io_open(r->wal_dir, LOG_READ, filename, suffix, f); + next_wal = log_io_open(&r->wal_dir, LOG_READ, filename, + &r->node_uuid, suffix, f); /* * When doing final recovery, and dealing with the * last file, try opening .<ext>.inprogress. 
*/ if (next_wal == NULL) { + say_warn("open fail: %lld", (long long) current_lsn); /* same LSN format as the other messages in this function */ if (r->finalize && suffix == INPROGRESS) { /* * There is an .inprogress file, but @@ -524,6 +629,7 @@ recover_remaining_wals(struct recovery_state *r) break; } assert(r->current_wal == NULL); + r->lsnsum = current_lsn; r->current_wal = next_wal; say_info("recover from `%s'", r->current_wal->filename); @@ -547,9 +653,28 @@ recover_remaining_wals(struct recovery_state *r) break; } if (result == LOG_EOF) { - say_info("done `%s' confirmed_lsn: %" PRIi64, - r->current_wal->filename, - r->confirmed_lsn); + say_info("done `%s'", r->current_wal->filename); + log_io_close(&r->current_wal); + /* goto find_next_wal; */ + } else if (r->lsnsum == wal_greatest_lsn) { + /* last file is not finished */ + break; + } else if (r->finalize && r->current_wal->is_inprogress) { + say_warn("fail to find eof on inprogress"); + /* Let recovery_finalize deal with last file */ + break; + } else if (r->current_wal->retry++ < 3) { + /* + * If a newer WAL appeared in the directory before + * current_wal was fully read, try re-reading + * one last time. */ + say_warn("`%s' has no EOF marker, yet a newer WAL file exists:" + " trying to re-read (attempt #%d)", + r->current_wal->filename, r->current_wal->retry); + goto recover_current_wal; + } else { + say_warn("WAL `%s' wasn't correctly closed", + r->current_wal->filename); log_io_close(&r->current_wal); } } @@ -558,7 +683,7 @@ recover_remaining_wals(struct recovery_state *r) * It's not a fatal error when last WAL is empty, but if * we lose some logs it is a fatal error. */ - if (wal_greatest_lsn > r->confirmed_lsn + 1) { + if (wal_greatest_lsn > r->lsnsum) { say_error("not all WALs have been successfully read"); result = -1; } @@ -569,30 +694,23 @@ recover_remaining_wals(struct recovery_state *r) return result; } -/** - * Recover all WALs created after the last snapshot. Panic if - * error. 
- */ void -recover_existing_wals(struct recovery_state *r) +recovery_fix_lsn(struct recovery_state *r, bool master_bootstrap) { - int64_t next_lsn = r->confirmed_lsn + 1; - int64_t wal_lsn = find_including_file(r->wal_dir, next_lsn); - if (wal_lsn <= 0) { - /* No WALs to recover from. */ - goto out; + /* Remove fake snapshot/bootstrap node (id 0); on master bootstrap its LSN is first added to the local node */ + uint32_t k = mh_cluster_find(r->cluster, 0, NULL); + assert(k != mh_end(r->cluster)); + struct node *node = *mh_cluster_node(r->cluster, k); + if (master_bootstrap) { + assert(r->local_node != NULL); + assert(r->local_node->confirmed_lsn == r->local_node->current_lsn); /* check only: no side effects inside assert() */ + r->local_node->current_lsn += node->current_lsn; + r->local_node->confirmed_lsn = r->local_node->current_lsn; } - r->current_wal = log_io_open_for_read(r->wal_dir, wal_lsn, NONE); - if (r->current_wal == NULL) - goto out; - if (recover_remaining_wals(r) < 0) - panic("recover failed"); - say_info("WALs recovered, confirmed lsn: %" PRIi64, r->confirmed_lsn); -out: -#if 0 - region_free(&fiber()->gc); -#endif - ; + mh_cluster_del(r->cluster, k, NULL); + if (r->local_node == node) + r->local_node = NULL; + free(node); } void @@ -606,6 +724,7 @@ recovery_finalize(struct recovery_state *r) r->finalize = true; result = recover_remaining_wals(r); + if (result < 0) panic("unable to successfully finalize recovery"); @@ -622,7 +741,7 @@ recovery_finalize(struct recovery_state *r) say_warn("unlink broken %s WAL", r->current_wal->filename); if (inprogress_log_unlink(r->current_wal->filename) != 0) panic("can't unlink 'inprogress' WAL"); - } else if (r->current_wal->rows == 1) { + } else if (r->current_wal->rows <= 2 /* SETLSN + one row */) { /* Rename inprogress wal with one row */ say_warn("rename unfinished %s WAL", r->current_wal->filename); if (inprogress_log_rename(r->current_wal) != 0) @@ -708,9 +827,7 @@ recovery_rescan_file(ev_loop * loop, ev_stat *w, int /* revents */) if (result < 0) panic("recover failed"); if (result == LOG_EOF) { - say_info("done `%s' 
confirmed_lsn: %" PRIi64, - r->current_wal->filename, - r->confirmed_lsn); + say_info("done `%s'", r->current_wal->filename); log_io_close(&r->current_wal); recovery_stop_file(watcher); /* Don't wait for wal_dir_rescan_delay. */ @@ -764,6 +881,7 @@ struct wal_write_request { struct fiber *fiber; struct iproto_packet *packet; char wal_fixheader[XLOG_FIXHEADER_SIZE]; + struct node *node; }; /* Context of the WAL writer thread. */ @@ -781,6 +899,7 @@ struct wal_writer bool is_shutdown; bool is_rollback; ev_loop *txn_loop; + struct mh_cluster_t *cluster; }; static pthread_once_t wal_writer_once = PTHREAD_ONCE_INIT; @@ -879,7 +998,7 @@ wal_schedule(ev_loop * /* loop */, ev_async *watcher, int /* event */) * more writers in the future. */ static void -wal_writer_init(struct wal_writer *writer) +wal_writer_init(struct wal_writer *writer, struct mh_cluster_t *cluster) { /* I. Initialize the state. */ pthread_mutexattr_t errorcheck; @@ -908,6 +1027,20 @@ wal_writer_init(struct wal_writer *writer) if (writer->batch == NULL) panic_syserror("fio_batch_alloc"); + + /* Create and fill writer->cluster hash */ + writer->cluster = mh_cluster_new(); + if (writer->cluster == NULL) + panic_syserror("can't reallocate writer->cluster"); + uint32_t k; + mh_foreach(cluster, k) { + struct node *node = *mh_cluster_node(cluster, k); + struct node *wnode = mh_cluster_fetch(writer->cluster, + node->id); + if (wnode == NULL) + panic_syserror("can't reallocate writer->cluster"); + wnode->current_lsn = node->current_lsn; + } } /** Destroy a WAL writer structure. */ @@ -917,6 +1050,8 @@ wal_writer_destroy(struct wal_writer *writer) (void) tt_pthread_mutex_destroy(&writer->mutex); (void) tt_pthread_cond_destroy(&writer->cond); free(writer->batch); + mh_cluster_clean(writer->cluster); + mh_cluster_delete(writer->cluster); } /** WAL writer thread routine. */ @@ -945,7 +1080,7 @@ wal_writer_start(struct recovery_state *r) assert(STAILQ_EMPTY(&wal_writer.commit)); /* I. Initialize the state. 
*/ - wal_writer_init(&wal_writer); + wal_writer_init(&wal_writer, r->cluster); r->writer = &wal_writer; ev_async_start(wal_writer.txn_loop, &wal_writer.write_event); @@ -1001,6 +1136,26 @@ wal_writer_pop(struct wal_writer *writer, struct wal_fifo *input) } } +int +wal_write_setlsn(struct log_io *wal, struct fio_batch *batch, + struct mh_cluster_t *cluster) +{ + /* Write SETLSN command */ + struct iproto_packet setlsn; + char fixheader[XLOG_FIXHEADER_SIZE]; + struct iovec iov[XLOG_ROW_IOVMAX]; + log_encode_setlsn(&setlsn, cluster); + int iovcnt = xlog_encode_row(&setlsn, iov, fixheader); + fio_batch_start(batch, 1); + fio_batch_add(batch, iov, iovcnt); + if (fio_batch_write(batch, fileno(wal->f)) != 1) { + say_error("wal_write_setlsn failed"); + return -1; + } + + return 0; +} + /** * If there is no current WAL, try to open it, and close the * previous WAL. We close the previous WAL only after opening @@ -1016,14 +1171,14 @@ wal_writer_pop(struct wal_writer *writer, struct wal_fifo *input) * @return 0 in case of success, -1 on error. */ static int -wal_opt_rotate(struct log_io **wal, int rows_per_wal, struct log_dir *dir, - int64_t lsn) +wal_opt_rotate(struct log_io **wal, struct fio_batch *batch, + struct recovery_state *r, struct mh_cluster_t *cluster) { struct log_io *l = *wal, *wal_to_close = NULL; ERROR_INJECT_RETURN(ERRINJ_WAL_ROTATE); - if (l != NULL && (l->rows >= rows_per_wal || lsn % rows_per_wal == 0)) { + if (l != NULL && l->rows >= r->rows_per_wal) { /* * if l->rows == 1, log_io_close() does * inprogress_log_rename() for us. @@ -1033,7 +1188,13 @@ wal_opt_rotate(struct log_io **wal, int rows_per_wal, struct log_dir *dir, } if (l == NULL) { /* Open WAL with '.inprogress' suffix. 
*/ - l = log_io_open_for_write(dir, lsn, INPROGRESS); + int64_t lsnsum = mh_cluster_current_sum(cluster); + l = log_io_open_for_write(&r->wal_dir, lsnsum, &r->node_uuid, + INPROGRESS); + if (l != NULL) { + if (wal_write_setlsn(l, batch, cluster) != 0) + log_io_close(&l); + } /* * Close the file *after* we create the new WAL, since * this is when replication relays get an inotify alarm @@ -1048,6 +1209,7 @@ wal_opt_rotate(struct log_io **wal, int rows_per_wal, struct log_dir *dir, * A warning is written to the server * log file. */ + wal_write_setlsn(wal_to_close, batch, cluster); log_io_close(&wal_to_close); } } else if (l->rows == 1) { @@ -1086,7 +1248,7 @@ wal_opt_sync(struct log_io *wal, double sync_delay) static struct wal_write_request * wal_fill_batch(struct log_io *wal, struct fio_batch *batch, int rows_per_wal, - struct wal_write_request *req) + struct wal_write_request *req, struct mh_cluster_t *cluster) { int max_rows = wal->is_inprogress ? 1 : rows_per_wal - wal->rows; /* Post-condition of successful wal_opt_rotate(). 
*/ @@ -1095,6 +1257,11 @@ wal_fill_batch(struct log_io *wal, struct fio_batch *batch, int rows_per_wal, struct iovec iov[XLOG_ROW_IOVMAX]; while (req != NULL && !fio_batch_has_space(batch, nelem(iov))) { + req->node = mh_cluster_fetch(cluster, req->packet->node_id); + if (req->node == NULL) { + say_syserror("can't reallocate writer->cluster"); + return NULL; + } int iovcnt = xlog_encode_row(req->packet, iov, req->wal_fixheader); fio_batch_add(batch, iov, iovcnt); req = STAILQ_NEXT(req, wal_fifo_entry); @@ -1109,6 +1276,8 @@ wal_write_batch(struct log_io *wal, struct fio_batch *batch, int rows_written = fio_batch_write(batch, fileno(wal->f)); wal->rows += rows_written; while (req != end && rows_written-- != 0) { + assert(req->node->id == req->packet->node_id); + req->node->current_lsn = req->packet->lsn; req->res = 0; req = STAILQ_NEXT(req, wal_fifo_entry); } @@ -1127,11 +1296,11 @@ wal_write_to_disk(struct recovery_state *r, struct wal_writer *writer, struct wal_write_request *write_end = req; while (req) { - if (wal_opt_rotate(wal, r->rows_per_wal, r->wal_dir, - req->packet->lsn) != 0) + if (wal_opt_rotate(wal, batch, r, writer->cluster) != 0) break; struct wal_write_request *batch_end; - batch_end = wal_fill_batch(*wal, batch, r->rows_per_wal, req); + batch_end = wal_fill_batch(*wal, batch, r->rows_per_wal, req, + writer->cluster); write_end = wal_write_batch(*wal, batch, req, batch_end); if (batch_end != write_end) break; @@ -1175,8 +1344,10 @@ wal_writer_thread(void *worker_args) ev_async_send(writer->txn_loop, &writer->write_event); } (void) tt_pthread_mutex_unlock(&writer->mutex); - if (r->current_wal != NULL) + if (r->current_wal != NULL) { + wal_write_setlsn(r->current_wal, writer->batch, writer->cluster); log_io_close(&r->current_wal); + } return NULL; } @@ -1187,8 +1358,14 @@ wal_writer_thread(void *worker_args) int wal_write(struct recovery_state *r, struct iproto_packet *packet) { + struct node *node = fill_lsn(r, packet); + if (r->wal_mode == WAL_NONE) 
{ + confirm_lsn(node, node->current_lsn, true); + return 0; + } + + assert(packet != NULL); assert(r->wal_mode != WAL_NONE); - say_debug("wal_write lsn=%" PRIi64, packet->lsn); ERROR_INJECT_RETURN(ERRINJ_WAL_IO); struct wal_writer *writer = r->writer; @@ -1212,8 +1389,10 @@ wal_write(struct recovery_state *r, struct iproto_packet *packet) (void) tt_pthread_mutex_unlock(&writer->mutex); + int64_t lsn = node->current_lsn; /* save current lsn on the stack */ fiber_yield(); /* Request was inserted. */ + confirm_lsn(node, lsn, req->res == 0); return req->res; } @@ -1231,7 +1410,9 @@ snapshot_write_row(struct log_io *l, struct iproto_packet *packet) ev_loop *loop = loop(); packet->tm = last; - packet->lsn = ++rows; + packet->node_id = 0; + if (iproto_request_is_dml(packet->code)) + packet->lsn = ++rows; packet->sync = 0; /* don't write sync to wal */ char fixheader[XLOG_FIXHEADER_SIZE]; @@ -1296,7 +1477,8 @@ snapshot_save(struct recovery_state *r) { assert(r->snapshot_handler != NULL); struct log_io *snap; - snap = log_io_open_for_write(r->snap_dir, r->confirmed_lsn, + int64_t lsnsum = mh_cluster_current_sum(r->cluster); + snap = log_io_open_for_write(&r->snap_dir, lsnsum, &r->node_uuid, INPROGRESS); if (snap == NULL) panic_status(errno, "Failed to save snapshot: failed to open file in write mode."); @@ -1305,12 +1487,19 @@ snapshot_save(struct recovery_state *r) * <lsn>.snap.inprogress. When done, the snapshot is * renamed to <lsn>.snap. 
*/ - say_info("saving snapshot `%s'", - format_filename(r->snap_dir, r->confirmed_lsn, - NONE)); + say_info("saving snapshot `%s'", snap->filename); + + /* Write starting SETLSN (always empty table for snapshot) */ + struct iproto_packet setlsn; + log_encode_setlsn(&setlsn, NULL); + snapshot_write_row(snap, &setlsn); r->snapshot_handler(snap); + /* Write finishing SETLSN */ + log_encode_setlsn(&setlsn, r->cluster); + snapshot_write_row(snap, &setlsn); + log_io_close(&snap); say_info("done"); diff --git a/src/recovery.h b/src/recovery.h index dc343d47a9c9e509a1ef64f649de7b66011e15ff..2b8a7e9884da7f58bfbcdf365d41aea7c79cb157 100644 --- a/src/recovery.h +++ b/src/recovery.h @@ -31,8 +31,9 @@ #include <stdbool.h> #include "trivia/util.h" -#include "tarantool_ev.h" +#include "third_party/tarantool_ev.h" #include "log_io.h" +#include "tt_uuid.h" #if defined(__cplusplus) extern "C" { @@ -41,28 +42,14 @@ extern "C" { struct fiber; struct tbuf; -typedef int (row_handler)(void *, struct iproto_packet *packet); +typedef void (row_handler)(void *, struct iproto_packet *packet); typedef void (snapshot_handler)(struct log_io *); +typedef void (join_handler)(const tt_uuid *node_uuid); /** A "condition variable" that allows fibers to wait when a given * LSN makes it to disk. */ -struct wait_lsn { - struct fiber *waiter; - int64_t lsn; -}; - -void -wait_lsn_set(struct wait_lsn *wait_lsn, int64_t lsn); - -inline static void -wait_lsn_clear(struct wait_lsn *wait_lsn) -{ - wait_lsn->waiter = NULL; - wait_lsn->lsn = 0LL; -} - struct wal_writer; struct wal_watcher; struct remote; @@ -72,15 +59,44 @@ enum wal_mode { WAL_NONE = 0, WAL_WRITE, WAL_FSYNC, WAL_FSYNC_DELAY, WAL_MODE_MA /** String constants for the supported modes. 
*/ extern const char *wal_mode_STRS[]; +/* + * Cluster Node + */ +struct node { + uint32_t id; + tt_uuid uuid; + int64_t current_lsn; + int64_t confirmed_lsn; +}; + +/* + * Map: (node_id) => (struct node) + */ +#define mh_name _cluster +#define mh_key_t uint32_t +#define mh_node_t struct node * +#define mh_arg_t void * +#define mh_hash(a, arg) ((*a)->id) +#define mh_hash_key(a, arg) (a) +#define mh_eq(a, b, arg) ((*a)->id == (*b)->id) +#define mh_eq_key(key, node, arg) (key == (*node)->id) +#include "salad/mhash.h" + +void +mh_cluster_clean(struct mh_cluster_t *hash); + struct recovery_state { - int64_t lsn, confirmed_lsn; + struct mh_cluster_t *cluster; + struct node *local_node; /* The WAL we're currently reading/writing from/to. */ struct log_io *current_wal; - struct log_dir *snap_dir; - struct log_dir *wal_dir; + struct log_dir snap_dir; + struct log_dir wal_dir; + int64_t lsnsum; /* used to find missing xlog files */ struct wal_writer *writer; struct wal_watcher *watcher; struct remote *remote; + bool relay; /* true if recovery initialized for JOIN/SUBSCRIBE */ /** * row_handler is a module callback invoked during initial * recovery and when reading rows from the master. 
It is @@ -91,11 +107,13 @@ struct recovery_state { row_handler *row_handler; void *row_handler_param; snapshot_handler *snapshot_handler; + join_handler *join_handler; uint64_t snap_io_rate_limit; int rows_per_wal; double wal_fsync_delay; - struct wait_lsn wait_lsn; enum wal_mode wal_mode; + tt_uuid node_uuid; + tt_uuid cluster_uuid; bool finalize; }; @@ -104,28 +122,31 @@ extern struct recovery_state *recovery_state; void recovery_init(const char *snap_dirname, const char *xlog_dirname, row_handler row_handler, void *row_handler_param, - snapshot_handler snapshot_handler, int rows_per_wal); + snapshot_handler snapshot_handler, join_handler join_handler, + int rows_per_wal); void recovery_update_mode(struct recovery_state *r, enum wal_mode mode); void recovery_update_fsync_delay(struct recovery_state *r, double new_delay); void recovery_update_io_rate_limit(struct recovery_state *r, double new_limit); void recovery_free(); -void recover_snap(struct recovery_state *r, const char *replication_source); -void recover_existing_wals(struct recovery_state *); + +static inline bool +recovery_has_data(struct recovery_state *r) +{ + return log_dir_greatest(&r->snap_dir) > 0 || + log_dir_greatest(&r->wal_dir) > 0; +} +void cluster_bootstrap(struct recovery_state *r); +void recover_snap(struct recovery_state *r); void recovery_follow_local(struct recovery_state *r, ev_tstamp wal_dir_rescan_delay); void recovery_finalize(struct recovery_state *r); -int -recover_wal(struct recovery_state *r, struct log_io *l); /* for replication */ +int recover_wal(struct recovery_state *r, struct log_io *l); int wal_write(struct recovery_state *r, struct iproto_packet *packet); void recovery_setup_panic(struct recovery_state *r, bool on_snap_error, bool on_wal_error); - -void confirm_lsn(struct recovery_state *r, int64_t lsn, bool is_commit); -int64_t next_lsn(struct recovery_state *r); -void set_lsn(struct recovery_state *r, int64_t lsn); - -void recovery_wait_lsn(struct recovery_state *r, 
int64_t lsn); +void recovery_process(struct recovery_state *r, struct iproto_packet *packet); +void recovery_fix_lsn(struct recovery_state *r, bool master_bootstrap); struct fio_batch; @@ -133,8 +154,10 @@ void snapshot_write_row(struct log_io *l, struct iproto_packet *packet); void snapshot_save(struct recovery_state *r); -void -init_storage_on_master(struct log_dir *dir); +/* Only for tests */ +int +wal_write_setlsn(struct log_io *wal, struct fio_batch *batch, + struct mh_cluster_t *cluster); #if defined(__cplusplus) } /* extern "C" */ diff --git a/src/replica.cc b/src/replica.cc index 66b20dfee5fc1e6d6544a69155954075674b6ace..cdd3bb12f9face0cfb5c803d090008d999b706ce 100644 --- a/src/replica.cc +++ b/src/replica.cc @@ -28,7 +28,6 @@ */ #include "recovery.h" #include "tarantool.h" - #include <sys/socket.h> #include <netinet/in.h> #include <arpa/inet.h> @@ -45,11 +44,8 @@ #include "replica.h" static void -remote_apply_row(struct recovery_state *r, struct iproto_packet *packet); - -static void -remote_remote_read_row_fd(struct ev_io *coio, struct iobuf *iobuf, - struct iproto_packet *packet) +remote_read_row(struct ev_io *coio, struct iobuf *iobuf, + struct iproto_packet *packet) { struct ibuf *in = &iobuf->in; @@ -120,6 +116,11 @@ remote_read_row_fd(int sock, struct iproto_packet *packet) void replica_bootstrap(struct recovery_state *r, const char *replication_source) { + say_info("bootstrapping replica"); + + /* Generate Node-UUID */ + tt_uuid_create(&r->node_uuid); + char ip_addr[32]; char greeting[IPROTO_GREETING_SIZE]; int port; @@ -141,55 +142,102 @@ replica_bootstrap(struct recovery_state *r, const char *replication_source) int master = sio_socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); FDGuard guard(master); - assert(r->confirmed_lsn == 0 && r->lsn == 0); uint64_t sync = rand(); /* Send JOIN request */ - struct iproto_subscribe subscribe = iproto_subscribe_stub; - subscribe.sync = mp_bswap_u64(sync); + struct iproto_packet packet; + memset(&packet, 0, 
sizeof(packet)); + packet.code = IPROTO_JOIN; + packet.sync = sync; + + char buf[128]; + char *data = buf; + data = mp_encode_map(data, 1); + data = mp_encode_uint(data, IPROTO_NODE_UUID); + data = mp_encode_strl(data, UUID_LEN); + tt_uuid_enc_be(&recovery_state->node_uuid, data); + data += UUID_LEN; + + assert(data <= buf + sizeof(buf)); + packet.body[0].iov_base = buf; + packet.body[0].iov_len = (data - buf); + packet.bodycnt = 1; + char fixheader[IPROTO_FIXHEADER_SIZE]; + struct iovec iov[IPROTO_ROW_IOVMAX]; + int iovcnt = iproto_encode_row(&packet, iov, fixheader); + sio_connect(master, &addr, sizeof(addr)); sio_readn(master, greeting, sizeof(greeting)); - sio_write(master, &subscribe, sizeof(subscribe)); + sio_writev_all(master, iov, iovcnt); while (true) { - struct iproto_packet packet; - remote_read_row_fd(master, &packet); - if (packet.sync != sync) - tnt_raise(IllegalParams, "unexpected packet"); + if (packet.sync != sync) { + tnt_raise(ClientError, ER_INVALID_MSGPACK, + "unexpected packet sync"); + } /* Recv JOIN response (= end of stream) */ - if (packet.code == IPROTO_SUBSCRIBE) { + if (packet.code == IPROTO_JOIN) { if (packet.bodycnt != 0) - tnt_raise(IllegalParams, "subscribe response body"); - set_lsn(r, packet.lsn); + tnt_raise(IllegalParams, "JOIN body"); say_info("done"); break; } - remote_apply_row(r, &packet); + recovery_process(r, &packet); } + say_info("done"); /* master socket closed by guard */ } static void -remote_connect(struct ev_io *coio, struct sockaddr_in *remote_addr, - int64_t initial_lsn, const char **err) +remote_connect(struct recovery_state *r, struct ev_io *coio,const char **err) { char greeting[IPROTO_GREETING_SIZE]; evio_socket(coio, AF_INET, SOCK_STREAM, IPPROTO_TCP); *err = "can't connect to master"; - coio_connect(coio, remote_addr); + coio_connect(coio, &r->remote->addr); coio_readn(coio, greeting, sizeof(greeting)); - /* Send JOIN request */ - struct iproto_subscribe request = iproto_subscribe_stub; - request.lsn = 
mp_bswap_u64(initial_lsn); - coio_write(coio, &request, sizeof(request)); + /* Send SUBSCRIBE request */ + struct iproto_packet packet; + memset(&packet, 0, sizeof(packet)); + packet.code = IPROTO_SUBSCRIBE; + + uint32_t cluster_size = mh_size(r->cluster); + size_t size = 128 + cluster_size * + (mp_sizeof_uint(UINT32_MAX) + mp_sizeof_uint(UINT64_MAX)); + char *buf = (char *) region_alloc(&fiber()->gc, size); + char *data = buf; + data = mp_encode_map(data, 3); + data = mp_encode_uint(data, IPROTO_CLUSTER_UUID); + data = mp_encode_strl(data, UUID_LEN); + tt_uuid_enc_be(&r->cluster_uuid, data); + data += UUID_LEN; + data = mp_encode_uint(data, IPROTO_NODE_UUID); + data = mp_encode_strl(data, UUID_LEN); + tt_uuid_enc_be(&recovery_state->node_uuid, data); + data += UUID_LEN; + data = mp_encode_uint(data, IPROTO_LSNMAP); + data = mp_encode_map(data, cluster_size); + uint32_t k; + mh_foreach(r->cluster, k) { + struct node *node = *mh_cluster_node(r->cluster, k); + data = mp_encode_uint(data, node->id); + data = mp_encode_uint(data, node->current_lsn); + } + assert(data <= buf + size); + packet.body[0].iov_base = buf; + packet.body[0].iov_len = (data - buf); + packet.bodycnt = 1; + char fixheader[IPROTO_FIXHEADER_SIZE]; + struct iovec iov[IPROTO_ROW_IOVMAX]; + int iovcnt = iproto_encode_row(&packet, iov, fixheader); + coio_writev(coio, iov, iovcnt, 0); say_crit("successfully connected to master"); - say_crit("starting replication from lsn: %" PRIi64, initial_lsn); } static void @@ -213,15 +261,14 @@ pull_from_remote(va_list ap) "connecting"); if (iobuf == NULL) iobuf = iobuf_new(fiber_name(fiber())); - remote_connect(&coio, &r->remote->addr, - r->confirmed_lsn + 1, &err); + remote_connect(r, &coio, &err); warning_said = false; title("replica", "%s/%s", r->remote->source, "connected"); } err = "can't read row"; struct iproto_packet packet; - remote_remote_read_row_fd(&coio, iobuf, &packet); + remote_read_row(&coio, iobuf, &packet); fiber_setcancellable(false); err = NULL; 
@@ -229,9 +276,9 @@ pull_from_remote(va_list ap) r->remote->recovery_last_update_tstamp = ev_now(loop); - remote_apply_row(r, &packet); + recovery_process(r, &packet); - iobuf_gc(iobuf); + iobuf_reset(iobuf); fiber_gc(); } catch (FiberCancelException *e) { title("replica", "%s/%s", r->remote->source, "failed"); @@ -268,15 +315,6 @@ pull_from_remote(va_list ap) } } -static void -remote_apply_row(struct recovery_state *r, struct iproto_packet *packet) -{ - if (r->row_handler(r->row_handler_param, packet) < 0) - panic("replication failure: can't apply row"); - - set_lsn(r, packet->lsn); -} - void recovery_follow_remote(struct recovery_state *r, const char *addr) { diff --git a/src/replication.cc b/src/replication.cc index 89b3fde0165507c483ab692e34055316bad24e05..ce303d670704f2f44f53ea6c0eafe861a60fe9e6 100644 --- a/src/replication.cc +++ b/src/replication.cc @@ -135,7 +135,7 @@ spawner_sigchld_handler(int signal __attribute__((unused))); * @return 0 on success, -1 on error */ static int -spawner_create_replication_relay(); +spawner_create_replication_relay(struct relay_data *data); /** Shut down all relays when shutting down the spawner. */ static void @@ -143,7 +143,7 @@ spawner_shutdown_children(); /** Initialize replication relay process. */ static void -replication_relay_loop(); +replication_relay_loop(struct relay_data *data); /* * ------------------------------------------------------------------------ @@ -194,27 +194,202 @@ replication_prefork(const char *snap_dir, const char *wal_dir) /*-----------------------------------------------------------------------------*/ /** State of subscribe request - master process. 
*/ -struct subscribe_request { +struct relay_data { + uint32_t code; + uint64_t sync; + + /* for SUBSCRIBE */ + uint32_t node_id; + uint32_t lsnmap_size; + struct { + uint32_t node_id; + int64_t lsn; + } lsnmap[]; +}; + +struct replication_request { struct ev_io io; int fd; - int64_t lsn; - uint64_t sync; + struct relay_data data; }; /** Replication acceptor fiber handler. */ void -subscribe(int fd, int64_t lsn, uint64_t sync) +replication_join(int fd, struct iproto_packet *packet) { - struct subscribe_request *request = (struct subscribe_request *) - malloc(sizeof(struct subscribe_request)); + assert(packet->code == IPROTO_JOIN); + if (packet->bodycnt == 0) + tnt_raise(ClientError, ER_INVALID_MSGPACK, "JOIN body"); + + const char *data = (const char *) packet->body[0].iov_base; + const char *end = data + packet->body[0].iov_len; + const char *d = data; + if (mp_check(&d, end) != 0 || mp_typeof(*data) != MP_MAP) + tnt_raise(ClientError, ER_INVALID_MSGPACK, "JOIN body"); + + tt_uuid node_uuid = uuid_nil; + d = data; + uint32_t map_size = mp_decode_map(&d); + for (uint32_t i = 0; i < map_size; i++) { + if (mp_typeof(*d) != MP_UINT) { + mp_next(&d); /* key */ + mp_next(&d); /* value */ + continue; + } + uint8_t key = mp_decode_uint(&d); + if (key == IPROTO_NODE_UUID) { + if (mp_typeof(*d) != MP_STR || + mp_decode_strl(&d) != UUID_LEN) { + tnt_raise(ClientError, ER_INVALID_MSGPACK, + "invalid Node-UUID"); + } + tt_uuid_dec_be(d, &node_uuid); + d += UUID_LEN; + } else { + mp_next(&d); /* value */ + } + } + + if (tt_uuid_is_nil(&node_uuid)) { + tnt_raise(ClientError, ER_INVALID_MSGPACK, + "Can't find Node-UUID in JOIN request"); + } + + /* Notify box about new cluster node */ + recovery_state->join_handler(&node_uuid); + + struct replication_request *request = (struct replication_request *) + malloc(sizeof(*request)); if (request == NULL) { - close(fd); - return; + tnt_raise(ClientError, ER_MEMORY_ISSUE, sizeof(*request), + "iproto", "JOIN"); } request->fd = fd; 
request->io.data = request; - request->lsn = lsn; - request->sync = sync; + request->data.code = packet->code; + request->data.sync = packet->sync; + + ev_io_init(&request->io, replication_send_socket, + master_to_spawner_socket, EV_WRITE); + ev_io_start(loop(), &request->io); +} + +/** Replication acceptor fiber handler. */ +void +replication_subscribe(int fd, struct iproto_packet *packet) +{ + assert(packet->code == IPROTO_SUBSCRIBE); + if (packet->bodycnt == 0) + tnt_raise(ClientError, ER_INVALID_MSGPACK, "subscribe body"); + assert(packet->bodycnt == 1); + const char *data = (const char *) packet->body[0].iov_base; + const char *end = data + packet->body[0].iov_len; + const char *d = data; + if (mp_check(&d, end) != 0 || mp_typeof(*data) != MP_MAP) + tnt_raise(ClientError, ER_INVALID_MSGPACK, "subscribe body"); + tt_uuid cluster_uuid = uuid_nil, node_uuid = uuid_nil; + + const char *lsnmap = NULL; + d = data; + uint32_t map_size = mp_decode_map(&d); + for (uint32_t i = 0; i < map_size; i++) { + if (mp_typeof(*d) != MP_UINT) { + mp_next(&d); /* key */ + mp_next(&d); /* value */ + continue; + } + uint8_t key = mp_decode_uint(&d); + switch (key) { + case IPROTO_CLUSTER_UUID: + if (mp_typeof(*d) != MP_STR || + mp_decode_strl(&d) != UUID_LEN) { + tnt_raise(ClientError, ER_INVALID_MSGPACK, + "invalid Cluster-UUID"); + } + tt_uuid_dec_be(d, &cluster_uuid); + d += UUID_LEN; + break; + case IPROTO_NODE_UUID: + if (mp_typeof(*d) != MP_STR || + mp_decode_strl(&d) != UUID_LEN) { + tnt_raise(ClientError, ER_INVALID_MSGPACK, + "invalid Node-UUID"); + } + tt_uuid_dec_be(d, &node_uuid); + d += UUID_LEN; + break; + case IPROTO_LSNMAP: + if (mp_typeof(*d) != MP_MAP) { + tnt_raise(ClientError, ER_INVALID_MSGPACK, + "invalid LSNMAP"); + } + lsnmap = d; + mp_next(&d); + break; + default: + mp_next(&d); /* value */ + } + } + + /* Check Cluster-UUID */ + if (tt_uuid_cmp(&cluster_uuid, &recovery_state->cluster_uuid) != 0) { + tnt_raise(ClientError, ER_INVALID_MSGPACK, + "Unknown 
Cluster-UUID"); + } + /* Check Node-UUID */ + struct node *node = NULL; + uint32_t k; + mh_foreach(recovery_state->cluster, k) { + struct node *n = *mh_cluster_node(recovery_state->cluster, k); + if (tt_uuid_cmp(&n->uuid, &node_uuid) == 0) { + node = n; + break; + } + } + assert(node != NULL); + if (lsnmap == NULL) + tnt_raise(ClientError, ER_INVALID_MSGPACK, "LSNMAP"); + /* Check & save LSNMAP */ + d = lsnmap; + uint32_t lsnmap_size = mp_decode_map(&d); + struct replication_request *request = (struct replication_request *) + calloc(1, sizeof(*request) + sizeof(*request->data.lsnmap) * + (lsnmap_size + 1)); /* use calloc() for valgrind */ + + if (request == NULL) { + tnt_raise(ClientError, ER_MEMORY_ISSUE, sizeof(*request) + + sizeof(*request->data.lsnmap) * (lsnmap_size + 1), + "iproto", "SUBSCRIBE"); + } + + bool remote_found = false; + for (uint32_t i = 0; i < lsnmap_size; i++) { + if (mp_typeof(*d) != MP_UINT) { + map_error: + free(request); + tnt_raise(ClientError, ER_INVALID_MSGPACK, "LSNMAP"); + } + request->data.lsnmap[i].node_id = mp_decode_uint(&d); + if (mp_typeof(*d) != MP_UINT) + goto map_error; + request->data.lsnmap[i].lsn = mp_decode_uint(&d); + if (request->data.lsnmap[i].node_id == node->id) + remote_found = true; + } + if (!remote_found) { + /* Add remote node to the list */ + request->data.lsnmap[lsnmap_size].node_id = node->id; + request->data.lsnmap[lsnmap_size].lsn = 0; + ++lsnmap_size; + } + + request->fd = fd; + request->io.data = request; + request->data.code = packet->code; + request->data.sync = packet->sync; + request->data.node_id = node->id; + request->data.lsnmap_size = lsnmap_size; + ev_io_init(&request->io, replication_send_socket, master_to_spawner_socket, EV_WRITE); ev_io_start(loop(), &request->io); @@ -225,22 +400,27 @@ subscribe(int fd, int64_t lsn, uint64_t sync) static void replication_send_socket(ev_loop *loop, ev_io *watcher, int /* events */) { - struct subscribe_request *request = - (struct subscribe_request *) 
watcher->data; + struct replication_request *request = + (struct replication_request *) watcher->data; struct msghdr msg; - struct iovec iov; + struct iovec iov[2]; char control_buf[CMSG_SPACE(sizeof(int))]; + memset(control_buf, 0, sizeof(control_buf)); /* valgrind */ struct cmsghdr *control_message = NULL; - iov.iov_base = &request->lsn; - iov.iov_len = sizeof(request->lsn) + sizeof(request->sync); + size_t len = sizeof(request->data) + sizeof(*request->data.lsnmap) * + request->data.lsnmap_size; + iov[0].iov_base = &len; + iov[0].iov_len = sizeof(len); + iov[1].iov_base = &request->data; + iov[1].iov_len = len; memset(&msg, 0, sizeof(msg)); msg.msg_name = NULL; msg.msg_namelen = 0; - msg.msg_iov = &iov; - msg.msg_iovlen = 1; + msg.msg_iov = iov; + msg.msg_iovlen = nelem(iov); msg.msg_control = control_buf; msg.msg_controllen = sizeof(control_buf); @@ -347,8 +527,9 @@ spawner_main_loop() struct iovec iov; char control_buf[CMSG_SPACE(sizeof(int))]; - iov.iov_base = &replica.lsn; - iov.iov_len = sizeof(replica.lsn) + sizeof(replica.sync); + size_t len; + iov.iov_base = &len; + iov.iov_len = sizeof(len); msg.msg_name = NULL; msg.msg_namelen = 0; @@ -358,18 +539,37 @@ spawner_main_loop() msg.msg_controllen = sizeof(control_buf); while (!spawner.killed) { - int msglen = recvmsg(spawner.sock, &msg, 0); - if (msglen > 0) { - replica.sock = spawner_unpack_cmsg(&msg); - spawner_create_replication_relay(); - } else if (msglen == 0) { /* orderly master shutdown */ + ssize_t msglen = recvmsg(spawner.sock, &msg, 0); + if (msglen == 0) { /* orderly master shutdown */ say_info("Exiting: master shutdown"); break; - } else { /* msglen == -1 */ - if (errno != EINTR) - say_syserror("recvmsg"); + } else if (msglen == -1) { + if (errno == EINTR) + continue; + say_syserror("recvmsg"); /* continue, the error may be temporary */ + break; } + + replica.sock = spawner_unpack_cmsg(&msg); + struct relay_data *data = (struct relay_data *) malloc(len); + msglen = read(spawner.sock, data, 
len); + if (msglen == 0) { /* orderly master shutdown */ + say_info("Exiting: master shutdown"); + free(data); + break; + } else if (msglen == -1) { + free(data); + if (errno == EINTR) + continue; + say_syserror("recvmsg"); + /* continue, the error may be temporary */ + break; + } + replica.sync = data->sync; + + spawner_create_replication_relay(data); + free(data); } spawner_shutdown(); } @@ -428,7 +628,7 @@ spawner_sigchld_handler(int signo __attribute__((unused))) /** Create replication client handler process. */ static int -spawner_create_replication_relay() +spawner_create_replication_relay(struct relay_data *data) { pid_t pid = fork(); @@ -441,7 +641,7 @@ spawner_create_replication_relay() ev_loop_fork(loop()); ev_run(loop(), EVRUN_NOWAIT); close(spawner.sock); - replication_relay_loop(); + replication_relay_loop(data); } else { spawner.child_count++; close(replica.sock); @@ -543,107 +743,108 @@ replication_relay_recv(ev_loop * /* loop */, struct ev_io *w, int __attribute__( exit(EXIT_FAILURE); } -/* Only for blocked I/O */ -static inline ssize_t -sio_writev_all(int fd, struct iovec *iov, int iovcnt) -{ - ssize_t bytes_total = 0; - struct iovec *iovend = iov + iovcnt; - while(1) { - ssize_t bytes_written = sio_writev(fd, iov, iovend - iov); - bytes_total += bytes_written; - while (bytes_written > 0 && bytes_written >= iov->iov_len) - bytes_written -= (iov++)->iov_len; - if (iov == iovend) - break; - iov->iov_base = (char *) iov->iov_base + bytes_written; - iov->iov_len -= bytes_written; - } - - return bytes_total; -} - - -enum { IPROTO_ROW_IOVMAX = IPROTO_PACKET_IOVMAX + 1 }; - -static int -iproto_encode_row(const struct iproto_packet *packet, struct iovec *iov, - char fixheader[IPROTO_FIXHEADER_SIZE]) -{ - int iovcnt = iproto_packet_encode(packet, iov + 1) + 1; - uint32_t len = 0; - for (int i = 1; i < iovcnt; i++) - len += iov[i].iov_len; - - /* Encode length */ - char *data = fixheader; - data = mp_encode_uint(data, len); - /* Encode padding */ - ssize_t 
padding = IPROTO_FIXHEADER_SIZE - (data - fixheader); - if (padding > 0) - data = mp_encode_strl(data, padding - 1) + padding - 1; - assert(data == fixheader + IPROTO_FIXHEADER_SIZE); - iov[0].iov_base = fixheader; - iov[0].iov_len = IPROTO_FIXHEADER_SIZE; - - assert(iovcnt <= IPROTO_ROW_IOVMAX); - return iovcnt; -} - /** Send a single row to the client. */ -static int +static void replication_relay_send_row(void * /* param */, struct iproto_packet *packet) { - try { + struct recovery_state *r = recovery_state; + + /* Don't duplicate data */ + assert(r->local_node != NULL); + if (r->local_node->id == 0 || packet->node_id != r->local_node->id) { packet->sync = replica.sync; /* Encode length */ struct iovec iov[IPROTO_ROW_IOVMAX]; char fixheader[IPROTO_FIXHEADER_SIZE]; int iovcnt = iproto_encode_row(packet, iov, fixheader); sio_writev_all(replica.sock, iov, iovcnt); - } catch(SocketError *e) { - say_info("the client has closed its replication socket, exiting"); - exit(EXIT_SUCCESS); } - return 0; + /* + * Update LSN table + * This code needed to recover_remaining_wals() logic. 
+ */ + uint32_t k = mh_cluster_find(r->cluster, packet->node_id, NULL); + struct node *node; + if (k != mh_end(r->cluster)) { + node = *mh_cluster_node(r->cluster, k); + } else { + /* Create node if it doesn't exist */ + node = (struct node *) calloc(1, sizeof(*node)); + if (node == NULL) { + tnt_raise(ClientError, ER_MEMORY_ISSUE, sizeof(node), + "r->cluster", "node"); + } + k = mh_cluster_put(r->cluster, (const struct node **) &node, + NULL, NULL); + if (k == mh_end(r->cluster)) { + tnt_raise(ClientError, ER_MEMORY_ISSUE, 0, + "r->cluster", "r->cluster"); + } + node->id = packet->node_id; + } + node->confirmed_lsn = node->current_lsn = packet->lsn; } static void -replication_relay_join(struct recovery_state *r, uint64_t sync) +replication_relay_join(struct recovery_state *r) { FDGuard guard_replica(replica.sock); - int64_t lsn = greatest_lsn(r->snap_dir); - if (lsn <= 0) - panic("can't find snapshot"); + /* Send snapshot */ + recover_snap(r); - struct log_io *snap = log_io_open_for_read(r->snap_dir, lsn, NONE); - if (snap == NULL) - panic("can't open snapshot"); - say_info("sending snapshot `%s'", snap->filename); + /* Send response to JOIN command = end of stream */ + struct iproto_packet packet; + memset(&packet, 0, sizeof(packet)); + packet.code = IPROTO_JOIN; + packet.sync = replica.sync; - /* Send rows */ - int rc = recover_wal(r, snap); - log_io_close(&snap); + char fixheader[IPROTO_FIXHEADER_SIZE]; + struct iovec iov[IPROTO_ROW_IOVMAX]; + int iovcnt = iproto_encode_row(&packet, iov, fixheader); + sio_writev_all(replica.sock, iov, iovcnt); - if (rc != 0) - panic("can't sent snapshot"); + say_info("snapshot sent"); + /* replica.sock closed by guard */ +} - /* Send response to JOIN command = end of stream */ - struct iproto_subscribe response = iproto_subscribe_stub; - response.lsn = mp_bswap_u64(lsn); - response.sync = mp_bswap_u64(sync); - sio_write(replica.sock, &response, sizeof(response)); +static void +replication_relay_subscribe(struct recovery_state 
*r, struct relay_data *data) +{ + assert(data->code == IPROTO_SUBSCRIBE); + /* Set LSNs */ + for (uint32_t i = 0; i < data->lsnmap_size; i++) { + struct node *node = (struct node *) calloc(1, sizeof(*node)); + if (node == NULL) + panic("cannot allocate struct node"); + node->id = data->lsnmap[i].node_id; + node->confirmed_lsn = node->current_lsn = data->lsnmap[i].lsn; + uint32_t k = mh_cluster_put(r->cluster, + (const struct node **) &node, NULL, NULL); + if (k == mh_end(r->cluster)) + panic("cannot reallocate r->cluster"); + } - say_info("snapshot sent, lsn: %" PRIi64, lsn); - exit(EXIT_SUCCESS); - /* replica.sock closed by guard */ + /* Set node */ + uint32_t k = mh_cluster_find(r->cluster, data->node_id, NULL); + assert(k != mh_end(r->cluster)); + r->local_node = *mh_cluster_node(r->cluster, k); + assert(r->local_node->id == data->node_id); + + /* Remove SNAPSHOT_NODE_ID */ + recovery_fix_lsn(r, false); + + say_warn("replication follow local"); + recovery_follow_local(r, 0.1); + ev_run(loop(), 0); + + say_crit("exiting the relay loop"); } /** The main loop of replication client service process. */ static void -replication_relay_loop() +replication_relay_loop(struct relay_data *data) { struct sigaction sa; @@ -699,26 +900,23 @@ replication_relay_loop() /* Initialize the recovery process */ recovery_init(cfg_snap_dir, cfg_wal_dir, replication_relay_send_row, - NULL, NULL, INT32_MAX); - /* - * Note that recovery starts with lsn _NEXT_ to - * the confirmed one. - */ - if (replica.lsn == 0) { - recovery_state->lsn = recovery_state->confirmed_lsn = 0; - replication_relay_join(recovery_state, replica.sync); /* exits */ - } - - recovery_state->lsn = recovery_state->confirmed_lsn = replica.lsn - 1; - recover_existing_wals(recovery_state); - /* Found nothing. 
*/ - if (recovery_state->lsn == replica.lsn - 1) - say_error("can't find WAL containing record with lsn: %" PRIi64, - replica.lsn); - recovery_follow_local(recovery_state, 0.1); - - ev_run(loop(), 0); + NULL, NULL, NULL, INT32_MAX); + recovery_state->relay = true; /* recovery used in relay mode */ - say_crit("exiting the relay loop"); + try { + switch (data->code) { + case IPROTO_JOIN: + replication_relay_join(recovery_state); + break; + case IPROTO_SUBSCRIBE: + replication_relay_subscribe(recovery_state, data); + break; + default: + assert(false); + } + } catch(Exception *e) { + say_error("relay error: %s", e->errmsg()); + exit(EXIT_FAILURE); + } exit(EXIT_SUCCESS); } diff --git a/src/replication.h b/src/replication.h index 291f4306b8f18257b147980f95366878dce584c8..bf6e4a11607e2e33777618db4cda315d35201ad8 100644 --- a/src/replication.h +++ b/src/replication.h @@ -39,13 +39,16 @@ void replication_prefork(const char *snap_dir, const char *wal_dir); +void +replication_join(int fd, struct iproto_packet *packet); + /** * Subscribe a replica to updates. * * @return None. On error, closes the socket. 
*/ void -subscribe(int fd, int64_t lsn, uint64_t sync); +replication_subscribe(int fd, struct iproto_packet *packet); #endif // TARANTOOL_REPLICATION_H_INCLUDED diff --git a/src/sio.h b/src/sio.h index 49a135308be2b8e95a63af5d929e25970d9b35ca..316c08bbd4900d1f173a24034932b5fbabd6a052 100644 --- a/src/sio.h +++ b/src/sio.h @@ -117,6 +117,26 @@ sio_readn(int fd, void *buf, size_t count) ssize_t sio_writen(int fd, const void *buf, size_t count); +/* Only for blocked I/O */ +static inline ssize_t +sio_writev_all(int fd, struct iovec *iov, int iovcnt) +{ + ssize_t bytes_total = 0; + struct iovec *iovend = iov + iovcnt; + while(1) { + ssize_t bytes_written = sio_writev(fd, iov, iovend - iov); + bytes_total += bytes_written; + while (bytes_written > 0 && bytes_written >= iov->iov_len) + bytes_written -= (iov++)->iov_len; + if (iov == iovend) + break; + iov->iov_base = (char *) iov->iov_base + bytes_written; + iov->iov_len -= bytes_written; + } + + return bytes_total; +} + /** * A wrapper over sendfile. * Throw if send file failed. diff --git a/src/trivia/config.h.cmake b/src/trivia/config.h.cmake index 04fa7fe30ffcf5cdd9058c924d390ad4bcd008c6..a89e0548185d34ab5749ec2e3b3c8b2e542e17bb 100644 --- a/src/trivia/config.h.cmake +++ b/src/trivia/config.h.cmake @@ -131,6 +131,10 @@ #cmakedefine HAVE_PRCTL_H 1 #cmakedefine HAVE_OPEN_MEMSTREAM 1 +#cmakedefine HAVE_FMEMOPEN 1 + +#cmakedefine HAVE_LIBUUID_LINUX 1 +#cmakedefine HAVE_LIBUUID_BSD 1 /* * predefined /etc directory prefix. diff --git a/src/trivia/util.h b/src/trivia/util.h index 7a1c9ece86773e42de504aea9175cecbf93ca816..cb071e612a39bd9b482ce4bac599ee8170c0ea24 100644 --- a/src/trivia/util.h +++ b/src/trivia/util.h @@ -193,6 +193,12 @@ FILE * open_memstream(char **ptr, size_t *sizeloc); #endif /* HAVE_OPEN_MEMSTREAM */ +#ifndef HAVE_FMEMOPEN +/* Declare open_memstream(). 
*/ +#include <stdio.h> +FILE * +fmemopen(void *buf, size_t size, const char *mode); +#endif /* HAVE_FMEMOPEN */ #if defined(__cplusplus) } /* extern "C" */ diff --git a/src/tt_uuid.c b/src/tt_uuid.c new file mode 100644 index 0000000000000000000000000000000000000000..6127f5258a0378fe8715d5be4f8f3e18fc24b16f --- /dev/null +++ b/src/tt_uuid.c @@ -0,0 +1,40 @@ +/* + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * 1. Redistributions of source code must retain the above + * copyright notice, this list of conditions and the + * following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY <COPYRIGHT HOLDER> ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * <COPYRIGHT HOLDER> OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF + * THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +#include "tt_uuid.h" +/* Zeroed by the linker. 
*/ +const tt_uuid uuid_nil; + +char * +tt_uuid_str(const tt_uuid *uu) +{ + static __thread char buf[UUID_STR_LEN + 1]; + tt_uuid_to_string(uu, buf); + return buf; +} + diff --git a/src/tt_uuid.h b/src/tt_uuid.h new file mode 100644 index 0000000000000000000000000000000000000000..3f584008ce7b97d323727e68f8a5131bdf179773 --- /dev/null +++ b/src/tt_uuid.h @@ -0,0 +1,166 @@ +#ifndef TARANTOOL_UUID_H_INCLUDED +#define TARANTOOL_UUID_H_INCLUDED +/* + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * 1. Redistributions of source code must retain the above + * copyright notice, this list of conditions and the + * following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY <COPYRIGHT HOLDER> ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * <COPYRIGHT HOLDER> OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF + * THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ +#include <trivia/config.h> +#include <string.h> +#include <stdbool.h> +#include <stdlib.h> +#include <assert.h> + +#if defined(__cplusplus) +extern "C" { +#endif + +enum { UUID_LEN = 16, UUID_STR_LEN = 36 }; + +#if defined(HAVE_LIBUUID_LINUX) + +#include <uuid/uuid.h> + +typedef struct tt_uuid { + uuid_t id; +} tt_uuid; + +static inline void +tt_uuid_create(tt_uuid *uu) +{ + uuid_generate(uu->id); +} + +static inline int +tt_uuid_from_string(const char *in, tt_uuid *uu) +{ + return uuid_parse((char *) in, uu->id); +} + +static inline void +tt_uuid_to_string(const tt_uuid *uu, char *out) +{ + uuid_unparse(uu->id, out); +} + +static inline void +tt_uuid_dec_be(const void *in, tt_uuid *uu) +{ + memcpy(uu->id, in, sizeof(uu->id)); +} + +static inline void +tt_uuid_enc_be(const tt_uuid *uu, void *out) +{ + memcpy(out, uu->id, sizeof(uu->id)); +} + +static inline bool +tt_uuid_is_nil(const tt_uuid *uu) +{ + return uuid_is_null(uu->id); +} + +static inline bool +tt_uuid_cmp(const tt_uuid *lhs, const tt_uuid *rhs) +{ + return uuid_compare(lhs->id, rhs->id); +} + +#elif defined(HAVE_LIBUUID_BSD) + +#include <uuid.h> + +typedef struct uuid tt_uuid; + +static inline int +tt_uuid_create(tt_uuid *uu) +{ + uint32_t status; + uuid_create(uu, &status); + return status == uuid_s_ok; +} + +static inline int +tt_uuid_from_string(const char *in, tt_uuid *uu) +{ + uint32_t status; + uuid_from_string(in, uu, &status); + return status == uuid_s_ok; +} + +static inline void +tt_uuid_to_string(const tt_uuid *uu, char *out) +{ + uint32_t status; + char *buf = NULL; + uuid_to_string(uu, &buf, &status); + assert(status == uuid_s_ok); + strncpy(out, buf, UUID_STR_LEN); + out[UUID_STR_LEN] = '\0'; + free(buf); +} + +static inline bool +tt_uuid_cmp(const tt_uuid *lhs, const tt_uuid *rhs) +{ + uint32_t status; + return uuid_compare(lhs, rhs, &status); +} + +static inline bool +tt_uuid_is_nil(const tt_uuid *uu) +{ + uint32_t status; + return uuid_is_nil(uu, &status); +} + +static inline void 
+tt_uuid_dec_be(const void *in, tt_uuid *uu) +{ + uuid_dec_be(in, uu); + +} + +static inline void +tt_uuid_enc_be(const tt_uuid *uu, void *out) +{ + uuid_enc_be(out, uu); +} +#else +#error Unsupported libuuid +#endif /* HAVE_LIBUUID_XXX */ + +extern const tt_uuid uuid_nil; + +char * +tt_uuid_str(const tt_uuid *uu); + +#if defined(__cplusplus) +} /* extern "C" */ +#endif + +#endif /* TARANTOOL_UUID_H_INCLUDED */ diff --git a/src/util.cc b/src/util.cc index 339e5a10b5d92293193c2416082409afccf21b38..1cf563abf2e350b049c7f29cf91e3da9c9505968 100644 --- a/src/util.cc +++ b/src/util.cc @@ -202,6 +202,19 @@ fdprintf(int fd, const char *format, ...) return total; } +#ifndef HAVE_FMEMOPEN +FILE * +fmemopen(void *buf, size_t size, const char *mode) +{ + assert(strcmp(mode, "r") == 0); + + FILE *ret = tmpfile(); + fwrite(buf, 1, size, ret); + rewind(ret); + return ret; +} +#endif /* HAVE_FMEMOPEN */ + #ifdef ENABLE_BACKTRACE /* diff --git a/test/box/admin.result b/test/box/admin.result index ff9063f99d78994b0042ec57745da5f262eb8632..b115f2bbbd276306d622e949dac0aac2f634f67b 100644 --- a/test/box/admin.result +++ b/test/box/admin.result @@ -11,16 +11,19 @@ space:create_index('primary', { type = 'hash' }) box.stat() --- - DELETE: - total: 0 - rps: 0 + total: 34 + rps: 6 SELECT: total: 1 rps: 0 REPLACE: - total: 0 + total: 2 rps: 0 INSERT: - total: 2 + total: 34 + rps: 6 + AUTH: + total: 0 rps: 0 CALL: total: 0 @@ -69,16 +72,19 @@ box.cfg box.stat() --- - DELETE: - total: 0 - rps: 0 + total: 34 + rps: 6 SELECT: total: 1 rps: 0 REPLACE: - total: 0 + total: 2 rps: 0 INSERT: - total: 2 + total: 34 + rps: 6 + AUTH: + total: 0 rps: 0 CALL: total: 0 @@ -112,10 +118,10 @@ function test_box_info() local buildstr = {'flags', 'target', 'compiler', 'options'} local str = {'version', 'status' } local failed = {} - if check_type(tmp.lsn, 'cdata') == false then - table.insert(failed1, 'box.info().lsn') + if check_type(tmp.cluster, 'table') == false then + table.insert(failed1, 
'box.info().cluster') else - tmp.lsn = nil + tmp.cluster = nil end for k, v in ipairs(num) do if check_type(tmp[v], 'number') == false then diff --git a/test/box/admin.test.lua b/test/box/admin.test.lua index 3b1a1f2e8496b54549b2553e21acec8ffe612116..20e454340ee04970410eeec0ad04732ace71c049 100644 --- a/test/box/admin.test.lua +++ b/test/box/admin.test.lua @@ -26,10 +26,10 @@ function test_box_info() local buildstr = {'flags', 'target', 'compiler', 'options'} local str = {'version', 'status' } local failed = {} - if check_type(tmp.lsn, 'cdata') == false then - table.insert(failed1, 'box.info().lsn') + if check_type(tmp.cluster, 'table') == false then + table.insert(failed1, 'box.info().cluster') else - tmp.lsn = nil + tmp.cluster = nil end for k, v in ipairs(num) do if check_type(tmp[v], 'number') == false then diff --git a/test/box/alter.result b/test/box/alter.result index 59bfceb3c28b222ebedfd0d72f709a4fa974b9b6..744100e6d9ba6d13c86da5454ade376dea39ed97 100644 --- a/test/box/alter.result +++ b/test/box/alter.result @@ -89,7 +89,7 @@ space = box.space[t[0]] ... space.n --- -- 313 +- 321 ... space.arity --- @@ -104,23 +104,23 @@ space.index[0] -- space:select{0} --- -- error: 'No index #0 is defined in space 313' +- error: 'No index #0 is defined in space 321' ... space:insert{0, 0} --- -- error: 'No index #0 is defined in space 313' +- error: 'No index #0 is defined in space 321' ... space:replace{0, 0} --- -- error: 'No index #0 is defined in space 313' +- error: 'No index #0 is defined in space 321' ... space:update({0}, {{'+', 0, 1}}) --- -- error: 'No index #0 is defined in space 313' +- error: 'No index #0 is defined in space 321' ... space:delete{0} --- -- error: 'No index #0 is defined in space 313' +- error: 'No index #0 is defined in space 321' ... t = _space:delete{space.n} --- @@ -134,7 +134,7 @@ space_deleted ... space:replace{0} --- -- error: Space 313 does not exist +- error: Space 321 does not exist ... 
_index:insert{_space.n, 0, 'primary', 'tree', 1, 1, 0, 'num'} --- @@ -167,6 +167,8 @@ _index:select{} - [304, 2, 'name', 'tree', 1, 1, 2, 'str'] - [312, 0, 'primary', 'tree', 1, 3, 1, 'num', 2, 'str', 3, 'num'] - [312, 1, 'owner', 'tree', 0, 1, 1, 'num'] + - [320, 0, 'primary', 'tree', 1, 1, 0, 'num'] + - [320, 1, 'uuid', 'tree', 1, 1, 1, 'str'] ... -- modify indexes of a system space _index:delete{_index.n, 0} diff --git a/test/box/dup_key1.xlog b/test/box/dup_key1.xlog deleted file mode 100644 index 2821f7db0b489ca138f7ce3d8869f384c1c96471..0000000000000000000000000000000000000000 Binary files a/test/box/dup_key1.xlog and /dev/null differ diff --git a/test/box/dup_key2.xlog b/test/box/dup_key2.xlog deleted file mode 100644 index 052bfac1cd3392ee0609a962720aaeebf04eb2ef..0000000000000000000000000000000000000000 Binary files a/test/box/dup_key2.xlog and /dev/null differ diff --git a/test/box/info.result b/test/box/info.result index f998b0f30d8e07d00f28dbe82b5d9d5947b6a73c..0ded19458af80e18fe432b3a746495d0c578e373 100644 --- a/test/box/info.result +++ b/test/box/info.result @@ -24,9 +24,9 @@ string.match(box.info.logger_pid, '^[0-9]+$') ~= nil --- - true ... -box.info.lsn > 0 +#box.info.cluster > 0 --- -- true +- false ... 
box.info.recovery_lag --- @@ -76,8 +76,9 @@ table.sort(t) t --- - - build + - cluster - logger_pid - - lsn + - node - pid - recovery_lag - recovery_last_update diff --git a/test/box/info.test.lua b/test/box/info.test.lua index 716e7bd1c6e140e68007f8ae39f5babffccb2c2b..7fbdf7893e134ed59ad33f198fe7fed232dd3e15 100644 --- a/test/box/info.test.lua +++ b/test/box/info.test.lua @@ -6,7 +6,7 @@ box.info['unknown_variable'] string.match(box.info.version, '^[1-9]') ~= nil string.match(box.info.pid, '^[1-9][0-9]*$') ~= nil string.match(box.info.logger_pid, '^[0-9]+$') ~= nil -box.info.lsn > 0 +#box.info.cluster > 0 box.info.recovery_lag box.info.recovery_last_update box.info.status diff --git a/test/box/misc.result b/test/box/misc.result index a6c0f62361a4bd710eb261e607e3f9d3ee525da8..eb279fe24f5c9d61070ba891de4500dafb339524 100644 --- a/test/box/misc.result +++ b/test/box/misc.result @@ -117,6 +117,7 @@ t; - SELECT - REPLACE - INSERT + - AUTH - CALL - UPDATE - total @@ -193,66 +194,72 @@ end; t; --- - - 'box.error.ER_CREATE_FUNCTION : 50' - - 'box.error.ER_NO_SUCH_INDEX : 35' - - 'box.error.ER_TUPLE_FOUND : 3' - - 'box.error.ER_CREATE_SPACE : 9' - 'box.error.ER_PROC_RET : 21' - 'box.error.ER_TUPLE_FORMAT_LIMIT : 16' - - 'box.error.ER_FIELD_TYPE : 23' - - 'box.error.ER_CFG : 59' - - 'box.error.ER_UNKNOWN_SCHEMA_OBJECT : 49' - - 'box.error.ER_OK : 0' - - 'box.error.ER_NO_SUCH_ENGINE : 57' + - 'box.error.ER_FUNCTION_MAX : 54' - 'box.error.ER_TUPLE_NOT_FOUND : 4' - - 'box.error.ER_INDEX_ARITY : 39' - - 'box.error.ER_WAL_IO : 40' - - 'box.error.ER_USER_MAX : 56' - - 'box.error.ER_NO_SUCH_FUNCTION : 51' - - 'box.error.ER_INJECTION : 8' - - 'box.error.ER_DROP_PRIMARY_KEY : 17' - - 'box.error.ER_INDEX_TYPE : 13' + - 'box.error.ER_PASSWORD_MISMATCH : 47' + - 'box.error.ER_LAST_DROP : 15' - 'box.error.ER_ARG_TYPE : 26' - - 'box.error.ER_FUNCTION_MAX : 54' - - 'box.error.ER_FUNCTION_ACCESS_DENIED : 53' - - 'box.error.ER_SPACE_ARITY : 38' + - 'box.error.ER_INVALID_CLUSTER : 63' - 
'box.error.ER_INVALID_MSGPACK : 20' - - 'box.error.ER_SPACE_ACCESS_DENIED : 55' - - 'box.error.ER_KEY_PART_COUNT : 31' - 'box.error.ER_RELOAD_CFG : 58' - 'box.error.ER_USER_EXISTS : 46' - 'box.error.ER_MEMORY_ISSUE : 2' - 'box.error.ER_ILLEGAL_PARAMS : 1' - 'box.error.ER_KEY_FIELD_TYPE : 18' - 'box.error.ER_NONMASTER : 6' - - 'box.error.ER_UNKNOWN_REQUEST_TYPE : 48' - - 'box.error.ER_FIELD_TYPE_MISMATCH : 24' - - 'box.error.ER_MODIFY_INDEX : 14' - - 'box.error.ER_PASSWORD_MISMATCH : 47' - - 'box.error.ER_EXACT_MATCH : 19' - 'box.error.ER_NO_SUCH_USER : 45' - - 'box.error.ER_SECONDARY : 7' - - 'box.error.ER_FUNCTION_EXISTS : 52' + - 'box.error.ER_EXACT_MATCH : 19' - 'box.error.ER_CREATE_USER : 43' - - 'box.error.ER_ACCESS_DENIED : 42' - - 'box.error.ER_LAST_DROP : 15' - - 'box.error.ER_UPDATE_FIELD : 29' + - 'box.error.ER_FUNCTION_EXISTS : 52' + - 'box.error.ER_NO_SUCH_FUNCTION : 51' - 'box.error.ER_FIBER_STACK : 30' - - 'box.error.ER_UNKNOWN_UPDATE_OP : 28' - - 'box.error.ER_DROP_USER : 44' - - 'box.error.ER_UNSUPPORTED : 5' - - 'box.error.ER_NO_SUCH_FIELD : 37' + - 'box.error.ER_FUNCTION_ACCESS_DENIED : 53' + - 'box.error.ER_CFG : 59' - 'box.error.ER_TUPLE_NOT_ARRAY : 22' - - 'box.error.ER_NO_SUCH_SPACE : 36' + - 'box.error.ER_CLUSTER_ID_IS_RO : 65' - 'box.error.ER_MORE_THAN_ONE_TUPLE : 41' - - 'box.error.ER_ALTER_SPACE : 12' - - 'box.error.ER_NO_SUCH_PROC : 33' + - 'box.error.ER_NO_SUCH_SPACE : 36' + - 'box.error.ER_NO_SUCH_INDEX : 35' + - 'box.error.ER_TUPLE_FOUND : 3' + - 'box.error.ER_CREATE_SPACE : 9' + - 'box.error.ER_FIELD_TYPE : 23' + - 'box.error.ER_OK : 0' + - 'box.error.ER_INDEX_ARITY : 39' + - 'box.error.ER_WAL_IO : 40' + - 'box.error.ER_INJECTION : 8' + - 'box.error.ER_NO_SUCH_ENGINE : 57' + - 'box.error.ER_INDEX_TYPE : 13' + - 'box.error.ER_UNKNOWN_SCHEMA_OBJECT : 49' + - 'box.error.ER_SPACE_ACCESS_DENIED : 55' + - 'box.error.ER_KEY_PART_COUNT : 31' - 'box.error.ER_SPACE_EXISTS : 10' + - 'box.error.ER_UNKNOWN_NODE : 62' + - 'box.error.ER_MODIFY_INDEX 
: 14' + - 'box.error.ER_SECONDARY : 7' + - 'box.error.ER_NODE_ID_IS_RO : 66' + - 'box.error.ER_INVALID_UUID : 64' + - 'box.error.ER_FIELD_TYPE_MISMATCH : 24' + - 'box.error.ER_SPLICE : 25' + - 'box.error.ER_TUPLE_IS_TOO_LONG : 27' + - 'box.error.ER_DROP_SPACE : 11' + - 'box.error.ER_SPACE_ARITY : 38' + - 'box.error.ER_LOCAL_NODE_IS_NOT_ACTIVE : 61' + - 'box.error.ER_UNSUPPORTED : 5' + - 'box.error.ER_ACCESS_DENIED : 42' - 'box.error.ER_PROC_LUA : 32' + - 'box.error.ER_UPDATE_FIELD : 29' + - 'box.error.ER_NO_SUCH_FIELD : 37' + - 'box.error.ER_ALTER_SPACE : 12' + - 'box.error.ER_DROP_USER : 44' + - 'box.error.ER_UNKNOWN_UPDATE_OP : 28' + - 'box.error.ER_NO_SUCH_PROC : 33' - 'box.error.ER_SOPHIA : 60' - 'box.error.ER_NO_SUCH_TRIGGER : 34' - - 'box.error.ER_TUPLE_IS_TOO_LONG : 27' - - 'box.error.ER_SPLICE : 25' - - 'box.error.ER_DROP_SPACE : 11' + - 'box.error.ER_UNKNOWN_REQUEST_TYPE : 48' + - 'box.error.ER_DROP_PRIMARY_KEY : 17' + - 'box.error.ER_USER_MAX : 56' ... --# setopt delimiter '' -- A test case for Bug#901674 diff --git a/test/box/snapshot.test.py b/test/box/snapshot.test.py index 746b2ca35998a0b21c1eca4f07fafd77ecb6f72f..0ec7969434d83d11dded75017bceb3511db77cdb 100644 --- a/test/box/snapshot.test.py +++ b/test/box/snapshot.test.py @@ -49,7 +49,7 @@ print """ admin("space:insert{1, 'Test tuple'}") pid = int(yaml.load(admin("box.info.pid", silent=True))[0]) -lsn = yaml.load(admin("box.info.lsn", silent=True))[0] +lsn = yaml.load(admin("next(box.info.cluster)", silent=True))[1] snapshot = str(lsn).zfill(20) + ".snap" snapshot = os.path.join(server.vardir, snapshot) @@ -57,7 +57,6 @@ snapshot = os.path.join(server.vardir, snapshot) iteration = 0 MAX_ITERATIONS = 100 - while not os.access(snapshot, os.F_OK) and iteration < MAX_ITERATIONS: if iteration % 10 == 0: os.kill(pid, SIGUSR1) diff --git a/test/box/sophia.result b/test/box/sophia.result index 7cae8cef6e6fc90c5528fb2ef51db96264e78d82..0e2a3c921d3269978c0ecf19f635ad0dbec1abba 100644 --- 
a/test/box/sophia.result +++ b/test/box/sophia.result @@ -1,3 +1,9 @@ +os.execute("rm -rf sophia") +--- +- 0 +... +--# stop server default +--# start server default space = box.schema.create_space('tweedledum', { id = 123, engine = 'sophia' }) --- ... diff --git a/test/box/sophia.test.lua b/test/box/sophia.test.lua index 759fe750abd7e9d79163701fadeb2c02606a263f..d4bd1f220b8930db03d6f969f6a7187f5cdd173d 100644 --- a/test/box/sophia.test.lua +++ b/test/box/sophia.test.lua @@ -1,3 +1,6 @@ +os.execute("rm -rf sophia") +--# stop server default +--# start server default space = box.schema.create_space('tweedledum', { id = 123, engine = 'sophia' }) space:create_index('primary', { type = 'tree', parts = {0, 'num'} }) diff --git a/test/box/stat.result b/test/box/stat.result index a6bf3953ff026edc5174c2423131a02277a821a3..21d9e24590d41c49b6ae93f0ab7fb85fa7d1077b 100644 --- a/test/box/stat.result +++ b/test/box/stat.result @@ -15,8 +15,8 @@ for i=1,10 do space:insert{i, 'tuple'..tostring(i)} end box.stat() --- - DELETE: - total: 0 - rps: 0 + total: 14 + rps: 2 SELECT: total: 1 rps: 0 @@ -24,8 +24,11 @@ box.stat() total: 0 rps: 0 INSERT: - total: 12 - rps: 2 + total: 34 + rps: 6 + AUTH: + total: 0 + rps: 0 CALL: total: 0 rps: 0 @@ -39,8 +42,8 @@ box.stat() box.stat() --- - DELETE: - total: 0 - rps: 0 + total: 14 + rps: 2 SELECT: total: 0 rps: 0 @@ -48,6 +51,9 @@ box.stat() total: 0 rps: 0 INSERT: + total: 34 + rps: 6 + AUTH: total: 0 rps: 0 CALL: diff --git a/test/box/unfinished.xlog b/test/box/unfinished.xlog deleted file mode 100644 index 5ebfa99f87438a2dbf7f67843b2b6e22f7a88e68..0000000000000000000000000000000000000000 Binary files a/test/box/unfinished.xlog and /dev/null differ diff --git a/test/box/xlog.result b/test/box/xlog.result index ad4de5f8ec3b26afc3ea2ad5841dfc9256485797..b17f6d3d482b010082e5c53bb6acd4281edb1418 100644 --- a/test/box/xlog.result +++ b/test/box/xlog.result @@ -4,11 +4,11 @@ space = box.schema.create_space('tweedledum', { id = 0 }) --- ... 
-00000000000000000002.xlog.inprogress exists +.xlog.inprogress exists space:create_index('primary', { type = 'hash' }) --- ... -00000000000000000002.xlog.inprogress has been successfully renamed +.xlog.inprogress has been successfully renamed # Inprogress xlog must be renamed during regular termination. @@ -16,22 +16,72 @@ box.space[0]:insert{3, 'third tuple'} --- - [3, 'third tuple'] ... -00000000000000000004.xlog.inprogress exists -00000000000000000004.xlog.inprogress has been successfully renamed +.xlog.inprogress exists +.xlog.inprogress has been successfully renamed # An inprogress xlog file with one record must be renamed during recovery. -00000000000000000005.xlog.inprogress hash been successfully renamed - -# Empty (zero size) inprogress xlog must be deleted during recovery. - -00000000000000000006.xlog.inprogress has been successfully deleted - -# Empty (header only, no records) inprogress xlog must be deleted -# during recovery. - -00000000000000000006.xlog.inprogress has been successfully deleted +box.space[0]:insert{4, 'fourth tuple'} +--- +- [4, 'fourth tuple'] +... +box.space[0]:insert{5, 'Unfinished record'} +--- +- [5, 'Unfinished record'] +... +.xlog exists +.xlog.inprogress hash been successfully renamed +space = box.schema.create_space('test') +--- +... +box.space['test']:create_index('primary') +--- +... +box.space['test']:insert{1, 'first tuple'} +--- +- [1, 'first tuple'] +... +box.space['test']:insert{2, 'second tuple'} +--- +- [2, 'second tuple'] +... +.xlog exists +space = box.schema.create_space('test') +--- +... +box.space['test']:create_index('primary') +--- +... +box.space['test']:insert{1, 'first tuple'} +--- +- [1, 'first tuple'] +... +box.space['test']:delete{1} +--- +- [1, 'first tuple'] +... +box.space['test']:insert{1, 'third tuple'} +--- +- [1, 'third tuple'] +... +box.space['test']:insert{2, 'fourth tuple'} +--- +- [2, 'fourth tuple'] +... 
+.xlog exists +check log line for 'Duplicate key' -# Inprogress xlog with bad record must be deleted during recovery. +'Duplicate key' exists in server log -00000000000000000006.xlog.inprogress has been successfully deleted +box.space['test']:get{1} +--- +- [1, 'first tuple'] +... +box.space['test']:get{2} +--- +- [2, 'second tuple'] +... +box.space['test']:len() +--- +- 2 +... diff --git a/test/box/xlog.test.py b/test/box/xlog.test.py index db67d2d6e115b59bc131c7971cda6cc744846cfc..231c4e632c9ccc8f0675662cf4315e787bd638cb 100644 --- a/test/box/xlog.test.py +++ b/test/box/xlog.test.py @@ -1,153 +1,202 @@ import os import shutil +import yaml +import re from os.path import abspath # cleanup server.vardir server.stop() server.deploy() +lsn = yaml.load(server.admin("next(box.info.cluster)", silent=True))[1] server.stop() print """ # Inprogress xlog must be renamed before second insert. """ -wal_inprogress = os.path.join(server.vardir, "00000000000000000002.xlog.inprogress") -wal = os.path.join(server.vardir, "00000000000000000002.xlog") +filename = str(lsn).zfill(20) + ".xlog" +wal_inprogress = os.path.join(server.vardir, filename + ".inprogress") +wal = os.path.join(server.vardir, filename) server.start() server.admin("space = box.schema.create_space('tweedledum', { id = 0 })") if os.access(wal_inprogress, os.F_OK): - print "00000000000000000002.xlog.inprogress exists" + print ".xlog.inprogress exists" server.admin("space:create_index('primary', { type = 'hash' })") if os.access(wal, os.F_OK) and not os.access(wal_inprogress, os.F_OK): - print "00000000000000000002.xlog.inprogress has been successfully renamed" + print ".xlog.inprogress has been successfully renamed" server.stop() +lsn += 2 print """ # Inprogress xlog must be renamed during regular termination. 
""" +filename = str(lsn).zfill(20) + ".xlog" server.start() -wal_inprogress = os.path.join(server.vardir, "00000000000000000004.xlog.inprogress") -wal = os.path.join(server.vardir, "00000000000000000004.xlog") +wal_inprogress = os.path.join(server.vardir, filename + ".inprogress") +wal = os.path.join(server.vardir, filename) server.admin("box.space[0]:insert{3, 'third tuple'}") if os.access(wal_inprogress, os.F_OK): - print "00000000000000000004.xlog.inprogress exists" + print ".xlog.inprogress exists" server.stop() if os.access(wal, os.F_OK) and not os.access(wal_inprogress, os.F_OK): - print "00000000000000000004.xlog.inprogress has been successfully renamed" + print ".xlog.inprogress has been successfully renamed" +lsn += 1 print """ # An inprogress xlog file with one record must be renamed during recovery. """ -wal_inprogress = os.path.join(server.vardir, "00000000000000000005.xlog.inprogress") -wal = os.path.join(server.vardir, "00000000000000000005.xlog") +server.start() +filename = str(lsn).zfill(20) + ".xlog" +wal_inprogress = os.path.join(server.vardir, filename + ".inprogress") +wal = os.path.join(server.vardir, filename) +server.admin("box.space[0]:insert{4, 'fourth tuple'}") +server.admin("box.space[0]:insert{5, 'Unfinished record'}") +pid = int(yaml.load(server.admin("box.info.pid", silent=True))[0]) +from signal import SIGKILL +if pid > 0: + os.kill(pid, SIGKILL) +server.stop() -os.symlink(abspath("box/unfinished.xlog"), wal_inprogress) +if os.access(wal, os.F_OK): + print ".xlog exists" + # Remove last byte from xlog + f = open(wal, "a") + size = f.tell() + f.truncate(size - 1) + f.close() + os.rename(wal, wal_inprogress) server.start() if os.access(wal, os.F_OK) and not os.access(wal_inprogress, os.F_OK): - print "00000000000000000005.xlog.inprogress hash been successfully renamed" + print ".xlog.inprogress hash been successfully renamed" server.stop() +lsn += 1 -print """ -# Empty (zero size) inprogress xlog must be deleted during recovery. 
-""" - -wal_inprogress = os.path.join(server.vardir, "00000000000000000006.xlog.inprogress") -wal = os.path.join(server.vardir, "00000000000000000006.xlog") +# print """ +# # Empty (zero size) inprogress xlog must be deleted during recovery. +# """ +# +# wal_inprogress = os.path.join(server.vardir, "00000000000000000006.xlog.inprogress") +# wal = os.path.join(server.vardir, "00000000000000000006.xlog") +# +# os.symlink(abspath("box/empty.xlog"), wal_inprogress) +# server.start() +# +# if not os.access(wal_inprogress, os.F_OK) and not os.access(wal, os.F_OK): +# print "00000000000000000006.xlog.inprogress has been successfully deleted" +# server.stop() + +# print """ +# # Empty (header only, no records) inprogress xlog must be deleted +# # during recovery. +# """ +# +# # If the previous test has failed, there is a dangling link +# # and symlink fails. +# try: +# os.symlink(abspath("box/just_header.xlog"), wal_inprogress) +# except OSError as e: +# print e +# +# server.start() +# +# if not os.access(wal_inprogress, os.F_OK) and not os.access(wal, os.F_OK): +# print "00000000000000000006.xlog.inprogress has been successfully deleted" +# server.stop() + +# print """ +# # Inprogress xlog with bad record must be deleted during recovery. +# """ +# +# # If the previous test has failed, there is a dangling link +# # and symlink fails. 
+# try: +# os.symlink(abspath("box/bad_record.xlog"), wal_inprogress) +# except OSError as e: +# print e +# +# server.start() +# +# if not os.access(wal_inprogress, os.F_OK) and not os.access(wal, os.F_OK): +# print "00000000000000000006.xlog.inprogress has been successfully deleted" -os.symlink(abspath("box/empty.xlog"), wal_inprogress) -server.start() +#print """ +#A test case for https://bugs.launchpad.net/tarantool/+bug/1052018 +#panic_on_wal_error doesn't work for duplicate key errors +#""" -if not os.access(wal_inprogress, os.F_OK) and not os.access(wal, os.F_OK): - print "00000000000000000006.xlog.inprogress has been successfully deleted" +server.stop() +server.cfgfile_source = "box/panic_on_wal_error.cfg" +server.deploy() +lsn = yaml.load(server.admin("next(box.info.cluster)", silent=True))[1] +filename = str(lsn).zfill(20) + ".xlog" +wal_old = os.path.join(server.vardir, "old_" + filename) +wal = os.path.join(server.vardir, filename) + +# Create wal#1 +server.admin("space = box.schema.create_space('test')") +server.admin("box.space['test']:create_index('primary')") +server.admin("box.space['test']:insert{1, 'first tuple'}") +server.admin("box.space['test']:insert{2, 'second tuple'}") server.stop() -print """ -# Empty (header only, no records) inprogress xlog must be deleted -# during recovery. -""" +# Save wal #1 +if os.access(wal, os.F_OK): + print ".xlog exists" + os.rename(wal, wal_old) -# If the previous test has failed, there is a dangling link -# and symlink fails. 
-try: - os.symlink(abspath("box/just_header.xlog"), wal_inprogress) -except OSError as e: - print e +lsn += 4 +# Create another wal#1 server.start() - -if not os.access(wal_inprogress, os.F_OK) and not os.access(wal, os.F_OK): - print "00000000000000000006.xlog.inprogress has been successfully deleted" +server.admin("space = box.schema.create_space('test')") +server.admin("box.space['test']:create_index('primary')") +server.admin("box.space['test']:insert{1, 'first tuple'}") +server.admin("box.space['test']:delete{1}") server.stop() -print """ -# Inprogress xlog with bad record must be deleted during recovery. -""" - -# If the previous test has failed, there is a dangling link -# and symlink fails. -try: - os.symlink(abspath("box/bad_record.xlog"), wal_inprogress) -except OSError as e: - print e - +# Create wal#2 server.start() +server.admin("box.space['test']:insert{1, 'third tuple'}") +server.admin("box.space['test']:insert{2, 'fourth tuple'}") +server.stop() -if not os.access(wal_inprogress, os.F_OK) and not os.access(wal, os.F_OK): - print "00000000000000000006.xlog.inprogress has been successfully deleted" +if os.access(wal, os.F_OK): + print ".xlog exists" + # Replace wal#1 with saved copy + os.unlink(wal) + os.rename(wal_old, wal) -#print """ -#A test case for https://bugs.launchpad.net/tarantool/+bug/1052018 -#panic_on_wal_error doesn't work for duplicate key errors -#""" +f = open(server.logfile, "r") +f.seek(0, 2) -# Step-by-step instruction for log files preparation -# needed for bugtest #1052018. -# -# -# 1. box.schema.create_space('test') -# 2. box.space['test']:create_index('primary') -# 3. box.space['test']:insert{1, 'first tuple} -# 4. box.space['test']:insert{2, 'second tuple} -# 5. stop tarantool -# 6. copy xlog to dup_key1.xlog -# 7. remove xlog -# 8. start tarantool -# 9. box.schema.create_space('test') -# 10. box.space['test']:create_index('primary') -# 11. box.space['test']:insert{1, 'first tuple} -# 12. box.space['test']:delete{1} -# 13. 
stop tarantool -# 14. start tarantool -# 15. box.space['test']:insert{1, 'third tuple'} -# 16. box.space['test']:insert{2, 'fourth tuple'} -# 17. stop tarantool -# 18. copy xlog to dup_key2.xlog -# +server.start() -#server.stop() -#server.cfgfile_source = "box/panic_on_wal_error.cfg" -#server.deploy() -#server.stop() -#shutil.copy(abspath("box/dup_key1.xlog"), - #os.path.join(server.vardir, "00000000000000000002.xlog")) -#shutil.copy(abspath("box/dup_key2.xlog"), - #os.path.join(server.vardir, "00000000000000000004.xlog")) -#server.start() -#admin("box.space['test']:get{1}") -#admin("box.space['test']:get{2}") -#admin("box.space['test']:len()") +check="Duplicate key" +print "check log line for '%s'" % check +print +line = f.readline() +while line: + if re.search(r'(%s)' % check, line): + print "'%s' exists in server log" % check + break + line = f.readline() +print + +server.admin("box.space['test']:get{1}") +server.admin("box.space['test']:get{2}") +server.admin("box.space['test']:len()") # cleanup server.stop() diff --git a/test/lib/tarantool_server.py b/test/lib/tarantool_server.py index aa71a80ba9fd93a9d16078ec012293185d7ac6a6..cf11b0829fc88af2cdb33ac8034d9d050a503a2c 100644 --- a/test/lib/tarantool_server.py +++ b/test/lib/tarantool_server.py @@ -575,8 +575,15 @@ class TarantoolServer(Server): return yaml.load(self.admin("box.info." 
+ param, silent=True))[0] return yaml.load(self.admin("box.info", silent=True)) - def wait_lsn(self, lsn): - while (int(self.get_param("lsn")) < lsn): + def get_lsn(self, node_uuid): + nodes = self.get_param("cluster") + if node_uuid in nodes: + return int(nodes[node_uuid]) + else: + return -1 + + def wait_lsn(self, node_uuid, lsn): + while (self.get_lsn(node_uuid) < lsn): time.sleep(0.01) def version(self): diff --git a/test/replication/hot_standby.result b/test/replication/hot_standby.result index 3b3060400dd93ab85bc9bca35a3e88ac4c2e0847..836ee4ccd1a1541ccf70d38d267dedf88ec22c9b 100644 --- a/test/replication/hot_standby.result +++ b/test/replication/hot_standby.result @@ -12,14 +12,20 @@ while box.space['_priv']:len() < 1 do box.fiber.sleep(0.01) end; --- ... do - begin_lsn = box.info.lsn + local pri_uuid = '' + local begin_lsn = 0 - function _set_pri_lsn(_lsn) + function _set_pri_lsn(_uuid, _lsn) + pri_uuid = _uuid begin_lsn = _lsn end + function _get_pri_lsn() + return box.info.cluster[pri_uuid] + end + function _print_lsn() - return (box.info.lsn - begin_lsn + 1) + return (_get_pri_lsn() - begin_lsn + 1) end function _insert(_begin, _end) @@ -39,7 +45,7 @@ do end function _wait_lsn(_lsnd) - while box.info.lsn < _lsnd + begin_lsn do + while _get_pri_lsn() < _lsnd + begin_lsn do box.fiber.sleep(0.001) end begin_lsn = begin_lsn + _lsnd @@ -51,13 +57,10 @@ end; --# set connection default -- set begin lsn on master, replica and hot_standby. --# set variable replica_port to 'replica.primary_port' -begin_lsn = box.info.lsn ---- -... a = box.net.box.new('127.0.0.1', replica_port) --- ... -a:call('_set_pri_lsn', box.info.lsn) +a:call('_set_pri_lsn', box.info.node, box.info.cluster[box.info.node]) --- - [] ... @@ -124,7 +127,7 @@ box.fiber.sleep(0.2) a = box.net.box.new('127.0.0.1', hot_standby_port) --- ... -a:call('_set_pri_lsn', box.info.lsn) +a:call('_set_pri_lsn', box.info.node, box.info.cluster[box.info.node]) --- - [] ... 
diff --git a/test/replication/hot_standby.test.lua b/test/replication/hot_standby.test.lua index 883f2796d0216e15bf602ee180125c0a5d5ea1a9..72e2120545b6ef8d9706a58dabeb1472948cbfbd 100644 --- a/test/replication/hot_standby.test.lua +++ b/test/replication/hot_standby.test.lua @@ -9,14 +9,20 @@ box.schema.user.grant('guest', 'read,write,execute', 'universe') --# set connection default, hot_standby, replica while box.space['_priv']:len() < 1 do box.fiber.sleep(0.01) end; do - begin_lsn = box.info.lsn + local pri_uuid = '' + local begin_lsn = 0 - function _set_pri_lsn(_lsn) + function _set_pri_lsn(_uuid, _lsn) + pri_uuid = _uuid begin_lsn = _lsn end + function _get_pri_lsn() + return box.info.cluster[pri_uuid] + end + function _print_lsn() - return (box.info.lsn - begin_lsn + 1) + return (_get_pri_lsn() - begin_lsn + 1) end function _insert(_begin, _end) @@ -36,7 +42,7 @@ do end function _wait_lsn(_lsnd) - while box.info.lsn < _lsnd + begin_lsn do + while _get_pri_lsn() < _lsnd + begin_lsn do box.fiber.sleep(0.001) end begin_lsn = begin_lsn + _lsnd @@ -47,10 +53,8 @@ end; -- set begin lsn on master, replica and hot_standby. 
--# set variable replica_port to 'replica.primary_port' -begin_lsn = box.info.lsn - a = box.net.box.new('127.0.0.1', replica_port) -a:call('_set_pri_lsn', box.info.lsn) +a:call('_set_pri_lsn', box.info.node, box.info.cluster[box.info.node]) a:close() space = box.schema.create_space('tweedledum') @@ -70,7 +74,7 @@ box.fiber.sleep(0.2) -- uses MASTER_PORT environment variable for its primary_port --# set variable hot_standby_port to 'hot_standby.master_port' a = box.net.box.new('127.0.0.1', hot_standby_port) -a:call('_set_pri_lsn', box.info.lsn) +a:call('_set_pri_lsn', box.info.node, box.info.cluster[box.info.node]) a:close() --# set connection hot_standby diff --git a/test/replication/init_storage.test.py b/test/replication/init_storage.test.py index 84599a8c31a8c089862e76004c6774720caaf2e5..4d13de50b5c9831d507267aa36381a25f9f1413e 100644 --- a/test/replication/init_storage.test.py +++ b/test/replication/init_storage.test.py @@ -31,7 +31,8 @@ replica.cleanup(True) master.admin('box.snapshot()') master.restart() master.admin('for k = 10, 19 do box.space[42]:insert{k, k*k*k} end') -lsn = master.get_param('lsn') +master_uuid = master.get_param('node') +lsn = master.get_lsn(master_uuid) print '-------------------------------------------------------------' print 'replica test 2 (must be ok)' print '-------------------------------------------------------------' @@ -43,7 +44,7 @@ replica.rpl_master = master replica.deploy() replica.admin('space = box.space.test'); -replica.wait_lsn(lsn) +replica.wait_lsn(master_uuid, lsn) for i in range(1, 20): replica.admin('space:get{%d}' % i) diff --git a/test/replication/status.test.py b/test/replication/status.test.py index 111259af87e50d95545430efab484eebd5e7d47e..c5385931263135c5aee80901ef228a026e9f7fd1 100644 --- a/test/replication/status.test.py +++ b/test/replication/status.test.py @@ -13,7 +13,7 @@ replica.rpl_master = master replica.vardir = os.path.join(master.vardir, 'replica') replica.deploy() -replica.get_param("lsn") 
+replica.get_param('node') cycles = 0 status = replica.admin.execute_no_reconnect("box.info.status", True) diff --git a/test/replication/suite.ini b/test/replication/suite.ini index 792ebe9c8090f1d94405481d532cc288c0e989f5..ab9af45495f3cdfe00ddc9f0dd7055b2afa2e9b1 100644 --- a/test/replication/suite.ini +++ b/test/replication/suite.ini @@ -2,3 +2,4 @@ core = tarantool script = master.lua description = tarantool/box, replication +disabled = consistent.test.lua diff --git a/test/replication/swap.test.py b/test/replication/swap.test.py index fa4efc728d6217d0cb82cd8cdc27fa7011a55f87..396db71e8e085c79713d7eade5426a8eb19ab385 100644 --- a/test/replication/swap.test.py +++ b/test/replication/swap.test.py @@ -10,8 +10,7 @@ def insert_tuples(_server, begin, end, msg = "tuple"): for i in range(begin, end): _server.sql("insert into t0 values (%d, '%s %d')" % (i, msg, i)) -def select_tuples(_server, begin, end, lsn): - _server.wait_lsn(lsn) +def select_tuples(_server, begin, end): for i in range(begin, end): _server.sql("select * from t0 where k0 = %d" % i) @@ -28,6 +27,10 @@ master.admin("box.schema.user.grant('guest', 'read,write,execute', 'universe')") replica.admin("while box.space['_priv']:len() < 1 do box.fiber.sleep(0.01) end") master.admin("s = box.schema.create_space('tweedledum', {id = 0})") master.admin("s:create_index('primary', {type = 'hash'})") + +master_uuid = master.get_param('node') +replica_uuid = replica.get_param('node') + id = ID_BEGIN for i in range(REPEAT): print "test %d iteration" % i @@ -35,13 +38,15 @@ for i in range(REPEAT): # insert to master insert_tuples(master, id, id + ID_STEP) # select from replica - select_tuples(replica, id, id + ID_STEP, master.get_param("lsn")) + replica.wait_lsn(master_uuid, master.get_lsn(master_uuid)) + select_tuples(replica, id, id + ID_STEP) id += ID_STEP # insert to master insert_tuples(master, id, id + ID_STEP) # select from replica - select_tuples(replica, id, id + ID_STEP, master.get_param("lsn")) + 
replica.wait_lsn(master_uuid, master.get_lsn(master_uuid)) + select_tuples(replica, id, id + ID_STEP) id += ID_STEP print "swap servers" @@ -57,13 +62,15 @@ for i in range(REPEAT): # insert to replica insert_tuples(replica, id, id + ID_STEP) # select from master - select_tuples(master, id, id + ID_STEP, replica.get_param("lsn")) + master.wait_lsn(replica_uuid, replica.get_lsn(replica_uuid)) + select_tuples(master, id, id + ID_STEP) id += ID_STEP # insert to replica insert_tuples(replica, id, id + ID_STEP) # select from master - select_tuples(master, id, id + ID_STEP, replica.get_param("lsn")) + master.wait_lsn(replica_uuid, replica.get_lsn(replica_uuid)) + select_tuples(master, id, id + ID_STEP) id += ID_STEP print "rollback servers configuration" diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt index 2aef1a7d2332bddda80ea51ac6a1e15b0012a3db..e0ba1a678e46d02f543aaadc7fd8e5d4b7f599df 100644 --- a/test/unit/CMakeLists.txt +++ b/test/unit/CMakeLists.txt @@ -37,6 +37,11 @@ add_executable(arena_mt.test arena_mt.c) target_link_libraries(arena_mt.test small pthread) add_executable(pt_alloc.test pt_alloc.cc) target_link_libraries(pt_alloc.test small) +add_executable(log_dir.test log_dir.cc test.c) +target_link_libraries(log_dir.test ${LIBUUID_LIBRARIES} core small salad misc bitset msgpuck + ${LIBEV_LIBRARIES} + ${LIBEIO_LIBRARIES} + ${LIBCORO_LIBRARIES}) set(MSGPUCK_DIR ${PROJECT_SOURCE_DIR}/src/lib/msgpuck/) add_executable(msgpack.test diff --git a/test/unit/log_dir.cc b/test/unit/log_dir.cc new file mode 100644 index 0000000000000000000000000000000000000000..0dcf422585b099253e648e7490b796013a877053 --- /dev/null +++ b/test/unit/log_dir.cc @@ -0,0 +1,245 @@ +#include <sys/types.h> +#include <dirent.h> + +extern "C" { +#include "test.h" +} /* extern "C" */ +#include "log_io.h" +#include "fio.h" +#include "recovery.h" /* wal_write_setlsn() */ +#include "memory.h" +#include "fiber.h" +#include "crc32.h" + +#define header() note("*** %s ***", __func__) 
+#define footer() note("*** %s: done ***", __func__) + +tt_uuid node_uuid; + +static void +testset_create(struct log_dir *dir, int64_t *files, int files_n, int node_n) +{ + char tpl[] = "/tmp/fileXXXXXX"; + + struct fio_batch *batch = fio_batch_alloc(1024); + assert(log_dir_create(dir) == 0); + strcpy(dir->open_wflags, "wx"); + dir->filetype = "XLOG\n"; + dir->filename_ext = ".xlog"; + dir->dirname = strdup(mkdtemp(tpl)); + dir->mode = 0660; + + struct mh_cluster_t *cluster = mh_cluster_new(); + assert(cluster != NULL); + for (int f = 0; f < files_n; f++) { + int64_t lsnsum = 0; + for (uint32_t node_id = 0; node_id < node_n; node_id++) { + int64_t lsn = *(files + f * node_n + node_id); + if (lsn <= 0) + continue; + + /* Calculate LSNSUM */ + lsnsum += lsn; + + /* Update cluster hash */ + struct node *node = (struct node *) + calloc(1, sizeof(*node)); + assert(node != NULL); + node->id = node_id; + node->current_lsn = lsn; + uint32_t k = mh_cluster_put(cluster, + (const struct node **) &node, NULL, NULL); + assert(k != mh_end(cluster)); + } + + /* Write XLOG */ + struct log_io *l = log_io_open_for_write(dir, lsnsum, &node_uuid, + INPROGRESS); + int rc = wal_write_setlsn(l, batch, cluster); + assert(rc == 0); + (void) rc; + log_io_close(&l); + mh_cluster_clean(cluster); + } + + mh_cluster_delete(cluster); + free(batch); + + int rc = log_dir_scan(dir); + assert(rc == 0); + (void) rc; + +#if 0 + diag("dir->map dump:"); + diag("file => len(lsns)"); + struct log_meta *meta = log_dir_map_first(&dir->map); + while (meta != NULL) { + diag("%lld => %u", (long long) meta->lsnsum, meta->lsn_count); + meta = log_dir_map_next(&dir->map, meta); + } + + diag("dir->lsnmap dump:"); + diag("node_id,lsn => file"); + struct log_meta_lsn *meta_lsn = log_dir_lsnmap_first(&dir->lsnmap); + while (meta_lsn != NULL) { + diag("%u,%lld => %lld", meta_lsn->node_id, + (long long) meta_lsn->lsn, + (long long) meta_lsn->meta->lsnsum); + meta_lsn = log_dir_lsnmap_next(&dir->lsnmap, meta_lsn); + } 
+#endif +} + +static void +testset_destroy(struct log_dir *dir) +{ + DIR *dh = opendir(dir->dirname); + assert(dh != NULL); + struct dirent *dent; + char path[PATH_MAX]; + while ((dent = readdir(dh)) != NULL) { + snprintf(path, sizeof(path), "%s/%s", dir->dirname, dent->d_name); + unlink(path); + } + closedir(dh); + rmdir(dir->dirname); + log_dir_destroy(dir); +} + + +static void +test_next(int64_t *files, int files_n, int node_n, int64_t *queries, int query_n) +{ + struct log_dir dir; + testset_create(&dir, (int64_t *) files, files_n, node_n); + + struct mh_cluster_t *cluster = mh_cluster_new(); + assert(cluster != NULL); + + for (int q = 0; q < query_n; q++) { + int64_t *query = (int64_t *) queries + q * (node_n + 1); + + /* Update cluster hash */ + for (uint32_t node_id = 0; node_id < node_n; node_id++) { + int64_t lsn = *(query + node_id); + if (lsn <= 0) + continue; + + struct node *node = (struct node *) calloc(1, sizeof(*node)); + assert(node != NULL); + node->id = node_id; + node->current_lsn = lsn; + uint32_t k = mh_cluster_put(cluster, + (const struct node **) &node, NULL, NULL); + assert(k != mh_end(cluster)); + } + + int64_t check = *(query + node_n); + int64_t value = log_dir_next(&dir, cluster); + is(value, check, "query #%d", q + 1); + mh_cluster_clean(cluster); + } + + mh_cluster_delete(cluster); + testset_destroy(&dir); +} + +static int +test1() +{ + plan(36); + header(); + + enum { NODE_N = 4}; + int64_t files[][NODE_N] = { + { 10, 0, 0, 0}, /* =10.xlog */ + { 12, 2, 0, 0}, /* =14.xlog */ + { 14, 2, 0, 0}, /* =16.xlog */ + { 14, 2, 2, 0}, /* =18.xlog */ + { 14, 4, 2, 3}, /* =23.xlog */ + { 14, 4, 2, 5}, /* =25.xlog */ + }; + enum { FILE_N = sizeof(files) / (sizeof(files[0])) }; + + int64_t queries[][NODE_N + 1] = { + /* not found (lsns are too old) */ + { 0, 0, 0, 0, /* => */ INT64_MAX}, + { 1, 0, 0, 0, /* => */ INT64_MAX}, + { 5, 0, 0, 0, /* => */ INT64_MAX}, + + /* =10.xlog (left bound) */ + { 10, 0, 0, 0, /* => */ 10}, + { 10, 1, 0, 0, /* => */ 
10}, + { 10, 2, 0, 0, /* => */ 10}, + { 10, 3, 0, 0, /* => */ 10}, + { 10, 4, 0, 0, /* => */ 10}, + + /* =10.xlog (middle) */ + { 11, 0, 0, 0, /* => */ 10}, + { 11, 1, 0, 0, /* => */ 10}, + { 11, 2, 0, 0, /* => */ 10}, + { 11, 3, 0, 0, /* => */ 10}, + { 11, 4, 0, 0, /* => */ 10}, + { 11, 5, 3, 6, /* => */ 10}, + + /* =10.xlog (right bound) */ + { 12, 0, 0, 0, /* => */ 10}, + { 12, 1, 0, 0, /* => */ 10}, + { 12, 1, 1, 1, /* => */ 10}, + { 12, 1, 2, 5, /* => */ 10}, + + /* =14.xlog */ + { 12, 2, 0, 0, /* => */ 14}, + { 12, 3, 0, 0, /* => */ 14}, + { 12, 4, 0, 0, /* => */ 14}, + { 12, 5, 3, 6, /* => */ 14}, + + /* =16.xlog */ + { 14, 2, 0, 0, /* => */ 16}, + { 14, 2, 1, 0, /* => */ 16}, + { 14, 2, 0, 1, /* => */ 16}, + + /* =18.xlog */ + { 14, 2, 2, 0, /* => */ 18}, + { 14, 2, 4, 0, /* => */ 18}, + { 14, 2, 4, 3, /* => */ 18}, + { 14, 2, 4, 5, /* => */ 18}, + { 14, 4, 2, 0, /* => */ 18}, + { 14, 5, 2, 0, /* => */ 18}, + + /* =23.xlog */ + { 14, 4, 2, 3, /* => */ 23}, + { 14, 5, 2, 3, /* => */ 23}, + + /* =25.xlog */ + { 14, 4, 2, 5, /* => */ 25}, + { 14, 5, 2, 6, /* => */ 25}, + { 100, 9, 9, 9, /* => */ 25}, + }; + enum { QUERY_N = sizeof(queries) / (sizeof(queries[0])) }; + + test_next((int64_t *) files, FILE_N, NODE_N, (int64_t *) queries, QUERY_N); + + footer(); + return check_plan(); +} + +int +main(int argc, char *argv[]) +{ + (void) argc; + + say_init(argv[0]); + say_set_log_level(4); + memory_init(); + fiber_init(); + crc32_init(); + tt_uuid_create(&node_uuid); + + plan(1); + test1(); + + fiber_free(); + memory_free(); + return check_plan(); +} diff --git a/test/unit/log_dir.result b/test/unit/log_dir.result new file mode 100644 index 0000000000000000000000000000000000000000..76c26b37a7c1b6b648b821e01e2637439b5d3781 --- /dev/null +++ b/test/unit/log_dir.result @@ -0,0 +1,41 @@ +1..1 + 1..36 + # *** test1 *** + ok 1 - query #1 + ok 2 - query #2 + ok 3 - query #3 + ok 4 - query #4 + ok 5 - query #5 + ok 6 - query #6 + ok 7 - query #7 + ok 8 - query #8 + ok 9 - 
query #9 + ok 10 - query #10 + ok 11 - query #11 + ok 12 - query #12 + ok 13 - query #13 + ok 14 - query #14 + ok 15 - query #15 + ok 16 - query #16 + ok 17 - query #17 + ok 18 - query #18 + ok 19 - query #19 + ok 20 - query #20 + ok 21 - query #21 + ok 22 - query #22 + ok 23 - query #23 + ok 24 - query #24 + ok 25 - query #25 + ok 26 - query #26 + ok 27 - query #27 + ok 28 - query #28 + ok 29 - query #29 + ok 30 - query #30 + ok 31 - query #31 + ok 32 - query #32 + ok 33 - query #33 + ok 34 - query #34 + ok 35 - query #35 + ok 36 - query #36 + # *** test1: done *** +ok 1 - subtests diff --git a/test/unit/test.h b/test/unit/test.h index 06558c2354f8a64a4fb241bac822c36fe19c4ac2..55f0f17bd661b48a7840b6e45cb9b75fdd3dea23 100644 --- a/test/unit/test.h +++ b/test/unit/test.h @@ -27,7 +27,7 @@ int _ok(int condition, const char *fmt, ...); /* private function, use note(...) or diag(...) instead */ void _space(FILE *stream); -#define msg(stream, ...) ({ __space(stream); fprintf(stream, "# "); \ +#define msg(stream, ...) ({ _space(stream); fprintf(stream, "# "); \ fprintf(stream, __VA_ARGS__); fprintf(stream, "\n"); }) #define note(...) msg(stdout, __VA_ARGS__) diff --git a/test/wal/alter.result b/test/wal/alter.result index d417108da16d0b331d1a027e96a6ddc322039de8..4ff8a354729c9ff2a100242aff4efd4e0612facf 100644 --- a/test/wal/alter.result +++ b/test/wal/alter.result @@ -17,7 +17,7 @@ end; ... #spaces; --- -- 65523 +- 65522 ... -- cleanup for k, v in pairs(spaces) do