From 8d0ea5870f92fc1022355458f1e5457f853c239c Mon Sep 17 00:00:00 2001 From: Konstantin Osipov <kostja@tarantool.org> Date: Thu, 20 Jul 2017 21:31:04 +0300 Subject: [PATCH] vinyl: add comments to vy_write_iterator --- src/box/vinyl.c | 6 +- src/box/vy_write_iterator.c | 316 +++++++++++++++++----------------- src/box/vy_write_iterator.h | 175 +++++++++++-------- test/unit/vy_write_iterator.c | 2 +- 4 files changed, 263 insertions(+), 236 deletions(-) diff --git a/src/box/vinyl.c b/src/box/vinyl.c index 54e885467f..8349782d0b 100644 --- a/src/box/vinyl.c +++ b/src/box/vinyl.c @@ -953,7 +953,7 @@ vy_task_dump_new(struct vy_scheduler *scheduler, struct vy_index *index, rlist_foreach_entry(mem, &index->sealed, in_sealed) { if (mem->generation > scheduler->dump_generation) continue; - if (vy_write_iterator_add_mem(wi, mem) != 0) + if (vy_write_iterator_new_mem(wi, mem) != 0) goto err_wi_sub; } @@ -1210,7 +1210,7 @@ vy_task_compact_new(struct vy_scheduler *scheduler, struct vy_index *index, struct vy_slice *slice; int n = range->compact_priority; rlist_foreach_entry(slice, &range->slices, in_range) { - if (vy_write_iterator_add_slice(wi, slice, + if (vy_write_iterator_new_slice(wi, slice, &scheduler->env->run_env) != 0) goto err_wi_sub; @@ -4067,7 +4067,7 @@ vy_send_range(struct vy_join_ctx *ctx) struct vy_slice *slice; rlist_foreach_entry(slice, &ctx->slices, in_join) { - if (vy_write_iterator_add_slice(ctx->wi, slice, + if (vy_write_iterator_new_slice(ctx->wi, slice, &ctx->env->run_env) != 0) goto out_delete_wi; } diff --git a/src/box/vy_write_iterator.c b/src/box/vy_write_iterator.c index 7c64449c65..4a50041175 100644 --- a/src/box/vy_write_iterator.c +++ b/src/box/vy_write_iterator.c @@ -41,29 +41,32 @@ static bool heap_less(heap_t *heap, struct heap_node *n1, struct heap_node *n2); -#define HEAP_NAME src_heap +#define HEAP_NAME vy_source_heap #define HEAP_LESS heap_less #include "salad/heap.h" /** - * Merge source of write iterator. Represents a mem or a run. + * Merge source of a write iterator. Represents a mem or a run. */ struct vy_write_src { /* Link in vy_write_iterator::src_list */ struct rlist in_src_list; /* Node in vy_write_iterator::src_heap */ struct heap_node heap_node; - /* Current tuple in the source (lowest and with maximal lsn) */ + /* Current tuple in the source (with minimal key and maximal LSN) */ struct tuple *tuple; /** - * There are special rules of comparison for virtual sources - * that represent a delimiter between the current key and - * the next key. They must be after (greater than) the sources with - * equal key despite of LSN. The flag below mean that the source is - * such a virtual source and must be compared correspondingly. + * If this flag is set, this is a so called "virtual" + * source. A virtual source does not stand for any mem or + * run, but represents a delimiter between the current key + * and the next one. There is a special rule used by the + * write iterator heap when comparing with a virtual + * source. Such source is greater than any source with + * the same key and less than any source with a greater + * key, regardless of LSN. */ bool is_end_of_key; - /** Source iterator */ + /** An iterator over the source */ union { struct vy_slice_stream slice_stream; struct vy_mem_stream mem_stream; @@ -72,26 +75,28 @@ struct vy_write_src { }; /** - * Sequence of verions of a key, sorted by LSN in ascending order. + * A sequence of versions of a key, sorted by LSN in ascending order. * (history->tuple.lsn < history->next->tuple.lsn). */ struct vy_write_history { /** Next version with greater LSN. */ struct vy_write_history *next; - /** Key version. */ + /** Key. */ struct tuple *tuple; }; /** * Create a new vy_write_history object, save a statement into it - * and link with a newer version. + * and link with a newer version. This function effectively + * reverses key LSN order from newest first to oldest first, i.e. + * orders statements on the same key chronologically. * * @param region Allocator for the object. * @param tuple Key version. * @param next Next version of the key. * - * @return not NULL Created object. - * @return NULL Memory error. + * @retval not NULL Created object. + * @retval NULL Memory error. */ static inline struct vy_write_history * vy_write_history_new(struct region *region, struct tuple *tuple, @@ -110,8 +115,8 @@ vy_write_history_new(struct region *region, struct tuple *tuple, } /** - * Clear entire sequence of versions of a key. Free resources of - * each version. + * Clear an entire sequence of versions of a key. Free resources + * of each version. * @param history History to clear. */ static inline void @@ -128,11 +133,12 @@ vy_write_history_destroy(struct vy_write_history *history) struct vy_read_view_stmt { /** Read view LSN. */ int64_t vlsn; - /** Result key version, visible for the @vlsn. */ + /** Result key version, visible to this @vlsn. */ struct tuple *tuple; /** - * Sequence of key versions. It is merged at the end of - * the key building into @tuple. + * A history of changes building up to this read + * view. Once built, it is merged into a single + * @tuple. */ struct vy_write_history *history; }; @@ -159,53 +165,51 @@ struct vy_write_iterator { struct vy_stmt_stream base; /* List of all sources of the iterator */ struct rlist src_list; - /* Heap with sources with the lowest source in head */ + /* A heap to order the sources, newest LSN at heap top. */ heap_t src_heap; - /** Index key definition used for storing statements on disk. */ + /** Index key definition used to store statements on disk. */ const struct key_def *key_def; /** Format to allocate new REPLACE and DELETE tuples from vy_run */ struct tuple_format *format; /** Same as format, but for UPSERT tuples. */ struct tuple_format *upsert_format; - /* There are is no level older than the one we're writing to. */ + /* There is no LSM tree level older than the one we're writing to. */ bool is_last_level; - /** Set if this iterator is for a primary index. */ + /** + * Set if this iterator is for a primary index. + * Not all implementation are applicable to the primary + * key and its tuple format is different. + */ bool is_primary; /** Length of the @read_views. */ int rv_count; - /** Count of not empty read views. */ + /** + * If there are no changes between two read views, the + * newer read view is left empty. This is a count of + * non-empty read views. It's used to speed up squashing. + */ int rv_used_count; /** - * Current read view statement in @read_views. It is used - * to return key versions one by one from - * vy_write_iterator_next. + * Current read view in @read_views. It is used to return + * key versions one by one from vy_write_iterator_next. */ int stmt_i; /** - * Read views of the same key, sorted by lsn in - * descending order and started from the INT64_MAX. Each - * is referenced if needed. Example: - * stmt_count = 3 - * rv_count = 6 - * 0 1 2 3 4 5 - * [lsn=6, lsn=5, lsn=4, -, -, -] + * Read views of the same key sorted by LSN in descending + * order, starting from INT64_MAX. * - * @Read_views can have gaps, if there are read views with - * the same key versions. Example: - * - * LSN: 20 - - - 10 9 8 - * Read views: * * * * * - * @read_views array: [lsn=20, -, -, -, lsn=10, -, -] - * \___________________/ - * Same versions of the key. + * Some read views in @read_views can be empty, + * - if there are no changes since the previous read view + * - if there are no changes up until this read view since + * the beginning of time. */ struct vy_read_view_stmt read_views[0]; }; /** - * Comparator of the heap. Generally compares two sources and finds out - * whether one source is less than another. + * Comparator of the heap. Put newer LSNs first, unless + * it's a virtual source (is_end_of_key). */ static bool heap_less(heap_t *heap, struct heap_node *node1, struct heap_node *node2) @@ -222,24 +226,26 @@ heap_less(heap_t *heap, struct heap_node *node1, struct heap_node *node2) return cmp < 0; /** - * Keys are equal, order by lsn, descending. - * Virtual sources that represents end-of-key just use 0 as LSN, - * so they are after all equal keys automatically. + * Keys are equal, order by LSN, descending. + * Virtual sources use 0 for LSN, so they are ordered + * last automatically. */ int64_t lsn1 = src1->is_end_of_key ? 0 : vy_stmt_lsn(src1->tuple); int64_t lsn2 = src2->is_end_of_key ? 0 : vy_stmt_lsn(src2->tuple); if (lsn1 != lsn2) return lsn1 > lsn2; - /** LSNs are equal, prioritize terminal (non-upsert) statements */ + /** + * LSNs are equal. This may happen only during forced recovery. + * Prioritize terminal (non-UPSERT) statements + */ return (vy_stmt_type(src1->tuple) == IPROTO_UPSERT ? 1 : 0) < (vy_stmt_type(src2->tuple) == IPROTO_UPSERT ? 1 : 0); } /** - * Allocate a source and put it to the list. - * The underlying stream (src->stream) must be opened immediately. + * Allocate a source and add it to a write iterator. * @param stream - the write iterator. * @return the source or NULL on memory error. */ @@ -249,7 +255,7 @@ vy_write_iterator_new_src(struct vy_write_iterator *stream) struct vy_write_src *res = (struct vy_write_src *) malloc(sizeof(*res)); if (res == NULL) { diag_set(OutOfMemory, sizeof(*res), - "malloc", "write stream src"); + "malloc", "vinyl write stream"); return NULL; } res->is_end_of_key = false; @@ -258,9 +264,7 @@ vy_write_iterator_new_src(struct vy_write_iterator *stream) } -/** - * Close the stream of the source, remove from list and delete. - */ +/** Close a stream, remove it from the write iterator and delete. */ static void vy_write_iterator_delete_src(struct vy_write_iterator *stream, struct vy_write_src *src) @@ -276,7 +280,9 @@ vy_write_iterator_delete_src(struct vy_write_iterator *stream, } /** - * Put the source to the heap. Source's stream must be opened. + * Add a source to the write iterator heap. The added source + * must be open. + * * @return 0 - success, not 0 - error. */ static NODISCARD int @@ -295,10 +301,10 @@ vy_write_iterator_add_src(struct vy_write_iterator *stream, vy_write_iterator_delete_src(stream, src); return rc; } - rc = src_heap_insert(&stream->src_heap, &src->heap_node); + rc = vy_source_heap_insert(&stream->src_heap, &src->heap_node); if (rc != 0) { diag_set(OutOfMemory, sizeof(void *), - "malloc", "write stream heap"); + "malloc", "vinyl write stream heap"); vy_write_iterator_delete_src(stream, src); return rc; } @@ -306,13 +312,13 @@ vy_write_iterator_add_src(struct vy_write_iterator *stream, } /** - * Remove the source from the heap, destroy and free it. + * Remove a source from the heap, destroy and free it. */ static void vy_write_iterator_remove_src(struct vy_write_iterator *stream, struct vy_write_src *src) { - src_heap_delete(&stream->src_heap, &src->heap_node); + vy_source_heap_delete(&stream->src_heap, &src->heap_node); vy_write_iterator_delete_src(stream, src); } @@ -329,7 +335,7 @@ vy_write_iterator_new(const struct key_def *key_def, struct tuple_format *format bool is_last_level, struct rlist *read_views) { /* - * One is reserved for the INT64_MAX - maximal read view. + * One is reserved for INT64_MAX - maximal read view. */ int count = 1; struct rlist *unused; @@ -354,7 +360,7 @@ vy_write_iterator_new(const struct key_def *key_def, struct tuple_format *format assert(count == 0); stream->base.iface = &vy_slice_stream_iface; - src_heap_create(&stream->src_heap); + vy_source_heap_create(&stream->src_heap); rlist_create(&stream->src_list); stream->key_def = key_def; stream->format = format; @@ -367,7 +373,7 @@ vy_write_iterator_new(const struct key_def *key_def, struct tuple_format *format } /** - * Start the search. Must be called after *add* methods and + * Start the search. Must be called after *new* methods and * before *next* method. * @return 0 on success or not 0 on error (diag is set). */ @@ -415,10 +421,10 @@ vy_write_iterator_close(struct vy_stmt_stream *vstream) /** * Add a mem as a source of iterator. - * @return 0 on success or not 0 on error (diag is set). + * @return 0 on success or -1 on error (diag is set). */ NODISCARD int -vy_write_iterator_add_mem(struct vy_stmt_stream *vstream, struct vy_mem *mem) +vy_write_iterator_new_mem(struct vy_stmt_stream *vstream, struct vy_mem *mem) { struct vy_write_iterator *stream = (struct vy_write_iterator *)vstream; struct vy_write_src *src = vy_write_iterator_new_src(stream); @@ -430,10 +436,10 @@ vy_write_iterator_add_mem(struct vy_stmt_stream *vstream, struct vy_mem *mem) /** * Add a run slice as a source of iterator. - * @return 0 on success or not 0 on error (diag is set). + * @return 0 on success or -1 on error (diag is set). */ NODISCARD int -vy_write_iterator_add_slice(struct vy_stmt_stream *vstream, +vy_write_iterator_new_slice(struct vy_stmt_stream *vstream, struct vy_slice *slice, struct vy_run_env *run_env) { struct vy_write_iterator *stream = (struct vy_write_iterator *)vstream; @@ -447,13 +453,13 @@ vy_write_iterator_add_slice(struct vy_stmt_stream *vstream, } /** - * Go to next tuple in terms of sorted (merged) input steams. + * Go to the next tuple in terms of sorted (merged) input steams. * @return 0 on success or not 0 on error (diag is set). */ static NODISCARD int vy_write_iterator_merge_step(struct vy_write_iterator *stream) { - struct heap_node *node = src_heap_top(&stream->src_heap); + struct heap_node *node = vy_source_heap_top(&stream->src_heap); assert(node != NULL); struct vy_write_src *src = container_of(node, struct vy_write_src, heap_node); @@ -461,7 +467,7 @@ vy_write_iterator_merge_step(struct vy_write_iterator *stream) if (rc != 0) return rc; if (src->tuple != NULL) - src_heap_update(&stream->src_heap, node); + vy_source_heap_update(&stream->src_heap, node); else vy_write_iterator_remove_src(stream, src); return 0; @@ -500,13 +506,13 @@ vy_write_iterator_get_vlsn(struct vy_write_iterator *stream, int rv_i) static inline int vy_write_iterator_push_rv(struct region *region, struct vy_write_iterator *stream, - struct vy_write_src *src, int current_rv_i) + struct tuple *tuple, int current_rv_i) { assert(current_rv_i < stream->rv_count); struct vy_read_view_stmt *rv = &stream->read_views[current_rv_i]; - assert(rv->vlsn >= vy_stmt_lsn(src->tuple)); + assert(rv->vlsn >= vy_stmt_lsn(tuple)); struct vy_write_history *h = - vy_write_history_new(region, src->tuple, rv->history); + vy_write_history_new(region, tuple, rv->history); if (h == NULL) return -1; rv->history = h; @@ -518,8 +524,8 @@ vy_write_iterator_push_rv(struct region *region, * statements sequence. Unref the previous statement, if needed. * We can't unref the statement right before returning it to the * caller, because reference in the read_views array can be - * single reference of this statement, and unref could delete it - * before returning. + * the only one to this statement, e.g. if the statement is + * read from a disk page. * * @param stream Write iterator. * @retval not NULL Next statement of the current key. @@ -530,14 +536,14 @@ vy_write_iterator_pop_read_view_stmt(struct vy_write_iterator *stream) { struct vy_read_view_stmt *rv; if (stream->stmt_i >= 0) { - /* Destroy the current before getting the next. */ + /* Destroy the current before getting to the next. */ rv = &stream->read_views[stream->stmt_i]; assert(rv->history == NULL); vy_read_view_stmt_destroy(rv); } if (stream->rv_used_count == 0) return NULL; - /* Find a next not empty history element. */ + /* Find a next non-empty history element. */ do { assert(stream->stmt_i + 1 < stream->rv_count); stream->stmt_i++; @@ -550,16 +556,18 @@ vy_write_iterator_pop_read_view_stmt(struct vy_write_iterator *stream) } /** - * Build the history of the current key. During the history - * building already some optimizations can be applied - - * @sa optimizations 1, 2 and 3 in vy_write_iterator.h. - * During building of a key history, some statements can be - * skipped, but no one can be merged. - * UPSERTs merge is executed on a special 'merge' phase. + * Build the history of the current key. + * Apply optimizations 1, 2 and 3 (@sa vy_write_iterator.h). + * When building a history, some statements can be + * skipped (e.g. multiple REPLACE statements on the same key), + * but nothing can be merged yet, since we don't know the first + * statement in the history. + * This is why there is a special "merge" step which applies + * UPSERTs and builds a tuple for each read view. * * @param region History objects allocator. * @param stream Write iterator. - * @param[out] count Count of statements, saved in a history. + * @param[out] count Count of statements saved in the history. * * @retval 0 Success. * @retval -1 Memory error. @@ -570,34 +578,35 @@ vy_write_iterator_build_history(struct region *region, { *count = 0; assert(stream->stmt_i == -1); - struct heap_node *node = src_heap_top(&stream->src_heap); + struct heap_node *node = vy_source_heap_top(&stream->src_heap); if (node == NULL) return 0; /* no more data */ struct vy_write_src *src = container_of(node, struct vy_write_src, heap_node); - /* Search must be started in the task. */ + /* Search must have been started already. */ assert(src->tuple != NULL); /* - * A virtual source instance that represents the end on current key in - * source heap. Due to a special branch in heap's comparator the - * source will come into the heap head after all equal to the current - * key statement but before any greater statement. Having inserted - * the source to the heap, the moment we get this source from the heap - * signals that there are no statements that are equal to the current. + * A virtual source instance which represents the end on + * the current key in the source heap. It is greater + * than any statement on the current key and less than + * any statement on the next key. + * The moment we get this source from the heap we know + * that there are no statements that there are no more + * statements for the current key. */ struct vy_write_src end_of_key_src; end_of_key_src.is_end_of_key = true; end_of_key_src.tuple = src->tuple; - int rc = src_heap_insert(&stream->src_heap, &end_of_key_src.heap_node); + int rc = vy_source_heap_insert(&stream->src_heap, &end_of_key_src.heap_node); if (rc) { diag_set(OutOfMemory, sizeof(void *), - "malloc", "write stream heap"); + "malloc", "vinyl write stream heap"); return rc; } vy_stmt_ref_if_possible(src->tuple); /* * For each pair (merge_until_lsn, current_rv_lsn] build - * history of a corresponding read view. + * a history in the corresponding read view. * current_rv_i - index of the current read view. */ int current_rv_i = 0; @@ -606,28 +615,26 @@ vy_write_iterator_build_history(struct region *region, uint64_t key_mask = stream->key_def->column_mask; while (true) { - /* - * Skip statements, unvisible by the current read - * view and unused by the previous read view. - */ - if (vy_stmt_lsn(src->tuple) > current_rv_lsn) - goto next_step; - /* - * The iterator reached the border of two read - * views. - */ + if (vy_stmt_lsn(src->tuple) > current_rv_lsn) { + /* + * Skip statements invisible to the current read + * view but older than the previous read view, + * which is already fully built. + */ + goto next_lsn; + } while (vy_stmt_lsn(src->tuple) <= merge_until_lsn) { /* - * Skip read views, which have the same - * versions of the key. - * The src->tuple must be between - * merge_until_lsn and current_rv_lsn. + * Skip read views which see the same + * version of the key, until src->tuple is + * between merge_until_lsn and + * current_rv_lsn. */ current_rv_i++; current_rv_lsn = merge_until_lsn; - int n = current_rv_i + 1; merge_until_lsn = - vy_write_iterator_get_vlsn(stream, n); + vy_write_iterator_get_vlsn(stream, + current_rv_i + 1); } /* @@ -637,25 +644,27 @@ vy_write_iterator_build_history(struct region *region, */ if (vy_stmt_type(src->tuple) == IPROTO_DELETE && stream->is_last_level && merge_until_lsn == 0) { - current_rv_lsn = 0; - goto next_step; + current_rv_lsn = 0; /* Force skip */ + goto next_lsn; } /* - * Optimization 2: skip after REPLACE/DELETE. + * Optimization 2: skip statements overwritten + * by a REPLACE or DELETE. */ if (vy_stmt_type(src->tuple) == IPROTO_REPLACE || vy_stmt_type(src->tuple) == IPROTO_DELETE) { uint64_t stmt_mask = vy_stmt_column_mask(src->tuple); /* - * Optimization 3: skip statements, which - * do not update the secondary key. + * Optimization 3: skip statements which + * do not change this secondary key. */ if (!stream->is_primary && key_update_can_be_skipped(key_mask, stmt_mask)) - goto next_step; + goto next_lsn; - rc = vy_write_iterator_push_rv(region, stream, src, + rc = vy_write_iterator_push_rv(region, stream, + src->tuple, current_rv_i); if (rc != 0) break; @@ -665,20 +674,20 @@ vy_write_iterator_build_history(struct region *region, merge_until_lsn = vy_write_iterator_get_vlsn(stream, current_rv_i + 1); - goto next_step; + goto next_lsn; } assert(vy_stmt_type(src->tuple) == IPROTO_UPSERT); - rc = vy_write_iterator_push_rv(region, stream, src, + rc = vy_write_iterator_push_rv(region, stream, src->tuple, current_rv_i); if (rc != 0) break; ++*count; -next_step: +next_lsn: rc = vy_write_iterator_merge_step(stream); if (rc != 0) break; - node = src_heap_top(&stream->src_heap); + node = vy_source_heap_top(&stream->src_heap); assert(node != NULL); src = container_of(node, struct vy_write_src, heap_node); assert(src->tuple != NULL); @@ -686,7 +695,7 @@ vy_write_iterator_build_history(struct region *region, break; } - src_heap_delete(&stream->src_heap, &end_of_key_src.heap_node); + vy_source_heap_delete(&stream->src_heap, &end_of_key_src.heap_node); vy_stmt_unref_if_possible(end_of_key_src.tuple); return rc; } @@ -694,18 +703,18 @@ vy_write_iterator_build_history(struct region *region, /** * Apply accumulated UPSERTs in the read view with a hint from * a previous read view. After merge, the read view must contain - * single statement. + * one statement. * * @param stream Write iterator. - * @param previous_version Hint from a previous read view. + * @param hint The tuple from a previous read view (can be NULL). * @param rv Read view to merge. * * @retval 0 Success. * @retval -1 Memory error. */ static NODISCARD int -vy_read_view_merge(struct vy_write_iterator *stream, - struct tuple *previous_version, struct vy_read_view_stmt *rv) +vy_read_view_merge(struct vy_write_iterator *stream, struct tuple *hint, + struct vy_read_view_stmt *rv) { assert(rv != NULL); assert(rv->tuple == NULL); @@ -713,20 +722,20 @@ vy_read_view_merge(struct vy_write_iterator *stream, struct vy_write_history *h = rv->history; /* * Two possible hints to remove the current UPSERT. - * 1. If the previous read view contains DELETE or + * 1. If the stream is working on the last level, we + * know that this UPSERT is the oldest version of + * the key and can convert it into REPLACE. + * 2. If the previous read view contains DELETE or * REPLACE, then the current UPSERT can be applied to - * it, whether is_last_level true or not. - * 2. If the stream is working on the last level. Then we - * are sure the UPSERT to be oldest version of a key - * and it can be turned into REPLACE. + * it, whether is_last_level is true or not. */ if (vy_stmt_type(h->tuple) == IPROTO_UPSERT && - (stream->is_last_level || (previous_version != NULL && - vy_stmt_type(previous_version) != IPROTO_UPSERT))) { - assert(!stream->is_last_level || previous_version == NULL || - vy_stmt_type(previous_version) != IPROTO_UPSERT); + (stream->is_last_level || (hint != NULL && + vy_stmt_type(hint) != IPROTO_UPSERT))) { + assert(!stream->is_last_level || hint == NULL || + vy_stmt_type(hint) != IPROTO_UPSERT); struct tuple *applied = - vy_apply_upsert(h->tuple, previous_version, + vy_apply_upsert(h->tuple, hint, stream->key_def, stream->format, stream->upsert_format, false); if (applied == NULL) @@ -734,7 +743,7 @@ vy_read_view_merge(struct vy_write_iterator *stream, vy_stmt_unref_if_possible(h->tuple); h->tuple = applied; } - /* Squash rest of UPSERTs. */ + /* Squash the rest of UPSERTs. */ struct vy_write_history *result = h; h = h->next; while (h != NULL) { @@ -749,16 +758,11 @@ vy_read_view_merge(struct vy_write_iterator *stream, return -1; vy_stmt_unref_if_possible(result->tuple); result->tuple = applied; + vy_stmt_unref_if_possible(h->tuple); /* - * Before: - * result -> h -> next - * - * Will be truncated by region. - * After: / - * result -. h .-> next - * \_ _ _ _ / + * Don't bother freeing 'h' since it's + * allocated on a region. */ - vy_stmt_unref_if_possible(h->tuple); h = h->next; result->next = h; } @@ -770,9 +774,9 @@ vy_read_view_merge(struct vy_write_iterator *stream, } /** - * Split the current key into the sequence of the read view + * Split the current key into a sequence of read view * statements. @sa struct vy_write_iterator comment for details - * about algorithm and optimizations. + * about the algorithm and optimizations. * * @param stream Write iterator. * @param[out] count Length of the result key versions sequence. @@ -795,7 +799,7 @@ vy_write_iterator_build_read_views(struct vy_write_iterator *stream, int *count) region_truncate(region, used); return 0; } - /* Find a first not empty read view. */ + /* Find the first non-empty read view. */ struct vy_read_view_stmt *rv = &stream->read_views[stream->rv_count - 1]; while (rv > &stream->read_views[0] && rv->history == NULL) @@ -805,15 +809,15 @@ vy_write_iterator_build_read_views(struct vy_write_iterator *stream, int *count) * here > 0. */ assert(rv >= &stream->read_views[0] && rv->history != NULL); - struct tuple *previous_version = NULL; + struct tuple *hint = NULL; for (; rv >= &stream->read_views[0]; --rv) { if (rv->history == NULL) continue; - if (vy_read_view_merge(stream, previous_version, rv) != 0) + if (vy_read_view_merge(stream, hint, rv) != 0) goto error; stream->rv_used_count++; ++*count; - previous_version = rv->tuple; + hint = rv->tuple; assert(rv->history == NULL); } region_truncate(region, used); @@ -851,21 +855,19 @@ vy_write_iterator_next(struct vy_stmt_stream *vstream, int count = 0; while (true) { - /* Squash upserts and/or go to the next key */ + /* Squash UPSERTs and/or go to the next key */ if (vy_write_iterator_build_read_views(stream, &count) != 0) return -1; /* - * Next_key() routine could skip the next key, for + * next_key() routine could skip the next key, for * example, if it was truncated by last level * DELETE or it consisted only from optimized - * updates. Then try get the next key. + * updates. Then try to get the next key. */ if (count != 0 || stream->src_heap.size == 0) break; } - /* - * Again try to get the statement, after calling next_key. - */ + /* Again try to get the statement, after calling next_key(). */ *ret = vy_write_iterator_pop_read_view_stmt(stream); return 0; } diff --git a/src/box/vy_write_iterator.h b/src/box/vy_write_iterator.h index 7d5d347fae..07c8bb5725 100644 --- a/src/box/vy_write_iterator.h +++ b/src/box/vy_write_iterator.h @@ -42,99 +42,124 @@ * * Background * ---------- - * With no loss of generality, lets consider the write_iterator to - * have a single statements source (struct vy_write_src). Each key - * consists of an LSNs sequence. A range of control points also - * exists, named read views, each of which is characterized by - * their VLSN. By default, for the biggest VLSN INT64_MAX is used, - * and for the smallest one 0 is used: + * The write iterator merges multiple data sources into one, + * ordering statements by key and then by LSN and purging + * unnecessary changes. + * + * The sources supply statements in ascending order of the + * key and descending order of LSN (newest changes first). + * A heap is used to preserve descending order of LSNs + * in the output. + * + * There may be many statements for the same key, forming + * a history. + * + * The iterator needs to preserve only the statements + * which are visible to the active read views, each represented + * by a view LSN (VLSN) and purge the rest. + * + * The list of read views always contains at least the "current" + * read view, represented by INT64_MAX. 0 stands for the oldest + * possible LSN: * * [0, vlsn1, vlsn2, vlsn3, ... INT64_MAX]. * - * The purpose of the write_iterator is to split LSNs sequence of - * one key into subsequences, bordered with VLSNs, and then merge - * each subsequence into a one statement. + * The iterator splits a sequence of LSNs for the same key into + * a series of histories, one for each read view, and then merges + * each history into a single statement: + * * -------- - * ONE KEY: + * SAME KEY * -------- - * 0 VLSN1 VLSN2 ... INT64_MAX - * | | | | - * | LSN1 ... LSN(i)|LSN(i+1) ... LSN(j)|LSN(j+1) ... LSN(N)| - * \_______________/\__________________/\___________________/ - * merge merge merge + * 0 VLSN1 VLSN2 ... INT64_MAX + * | | | | + * | LSN1 ... LSN(i) | LSN(i+1) ... LSN(j) | LSN(j+1) ... LSN(N) | + * \________________/ \___________________/ \____________________/ + * merge merge merge * - * A range of optimizations is possible, which allow decrease - * count of source statements and count of merged subsequences. + * The following optimizations are applicable, all aiming at + * purging unnecessary statements from the output. The + * optimizations are applied while reading the statements from + * the heap, from newest LSN to oldest. * * --------------------------------------------------------------- - * Optimization 1: skip DELETE from the last level of the oldest - * read view. + * Optimization #1: when merging the last level of the LSM tree, + * e.g. when doing a major compaction, skip DELETEs from the + * output as long as they are older than the oldest read view: + * * --------------------------- - * ONE KEY, LAST LEVEL SOURCE: + * SAME KEY, MAJOR COMPACTION * --------------------------- - * LSN1 LSN2 ... DELETE LSNi LSNi+1 ... LSN_N - * \___________________________/\___________________________/ + * + * 0 VLSN1 ... INT64_MAX + * | | | + * | LSN1 LSN2 ... DELETE | LSNi LSNi+1 ... LSN_N | + * \___________________________/ \___________________________/ * skip merge * - * Details: if the source is the oldest source of all, then it is - * not necessary to write any DELETE to the disk, including all - * LSNs of the same key, which are older than this DELETE. However - * such a skip is possible only if there is no read views to the - * same key, older than this DELETE, because read views can't be - * skipped. + * Indeed, we don't have to store absent data on disk, including + * the statements even older than the pruned delete. + * As for all other read views, if a DELETE is visible to a read + * view, it has to be preserved. * * --------------------------------------------------------------- - * Optimization 2: on REPLACE/DELETE skip rest of statements, - * until the next read view. + * Optimization #2: once we found a REPLACE or DELETE, we can skip + * the rest of the stream until the next read view: + * * -------- - * ONE KEY: + * SAME KEY * -------- - * ... LSN(k) ... LSN(i-1) LSN(i) REPLACE/DELETE - * Read - * views: * * - * _____________/\_____________________________/\__________/ - * merge skip return - * - * Is is possible, since the REPLACE and DELETE discard all older - * key versions. + * VLSN1 VLSN2 INT64_MAX + * | | | + * | LSN1 LSN2 ... REPLACE | LSNi ... DELETE ... LSN_N | + * \______________/\_______/ \_______/\_________________/ + * skip keep skip merge * * --------------------------------------------------------------- - * Optimization 3: skip statements, which do not update the - * secondary key. + * Optimization #3: when compacting runs of a secondary key, skip + * statements, which do not update this key. * - * Masks intersection: not 0 0 0 not 0 not 0 - * KEY: LSN1 DELETE REPLACE LSN5 ... REPLACE - * \____/\_______________/\__________________/ - * merge skip merge - * - * Details: if UPDATE is executed, it is transformed into - * DELETE + REPLACE or single REPLACE. But in the secondary index - * only keys are stored and if such UPDATE didn't change this key, - * then there is no necessity to write this UPDATE. Actually, - * existance of such UPDATEs is simply ignored. + * -------- + * SAME KEY + * -------- + * VLSN(i) VLSN(i+1) + * Masks | | + * intersection:| not 0 0 0 not 0 not 0 | + * | ANY DELETE REPLACE ANY ... REPLACE | + * \______/\_______________/\___________________/ + * merge skip merge + * + * Details: when UPDATE is executed by Tarantool, it is + * transformed into DELETE + REPLACE or a single REPLACE. But it + * is only necessary to write anything into the secondary key if + * such UPDATE changes any field, which is part of the key. + * All other UPDATEs can be simply skipped. * * --------------------------------------------------------------- - * Optimization 4: use older REPLACE/DELETE as a hints to apply - * newer UPSERTs. + * Optimization #4: use older REPLACE/DELETE to apply UPSERTs and + * convert them into a single REPLACE. When compaction includes + * the last level, absence of REPLACE or DELETE is equivalent + * to a DELETE, and UPSERT can be converted to REPLACE as well. + * If REPLACE or DELETE is found in an older read view, it can + * be used as well. * - * After building history of a key, UPSERT sequences can be - * accumulated in some read views. If the older read views have - * REPLACE/DELETE, they can be used to turn newer UPSERTs into - * REPLACEs. * -------- - * ONE KEY: + * SAME KEY * -------- - * LSN1 LSN2 LSN3 LSN4 - * REPLACE UPSERT UPSERT UPSERT - * Read - * views: * * * * - * ^ \________________________/ - * +- - - - - - - -< apply + * 0 VLSN1 VLSN2 VLSN3 VLSN4 VLSN5 INT64_MAX + * | | | | | | | + * | | REPLACE | UPSERT | UPSERT | UPSERT | ... | + * \_____|___^_____|_________|_________|_________|________/ + * ^ < < apply + * ^ < < apply + * ^ < < apply + * * Result: - * LSN1 LSN2 LSN3 LSN4 - * REPLACE REPLACE REPLACE REPLACE - * Read - * views: * * * * + * + * 0 VLSN1 VLSN2 VLSN3 VLSN4 VLSN5 INT64_MAX + * | | | | | | | + * | | REPLACE | REPLACE | REPLACE | REPLACE | ... | + * \_____|_________|_________|_________|_________|________/ * * See implementation details in * vy_write_iterator_build_read_views. @@ -154,7 +179,7 @@ struct vy_run_env; * @param key_def - key definition for tuple compare. * @param format - dormat to allocate new REPLACE and DELETE tuples from vy_run. * @param upsert_format - same as format, but for UPSERT tuples. - * @param is_primary - set if this iterator is for a primary index. + * @param LSM tree is_primary - set if this iterator is for a primary index. * @param is_last_level - there is no older level than the one we're writing to. * @param read_views - Opened read views. * @return the iterator or NULL on error (diag is set). @@ -165,18 +190,18 @@ vy_write_iterator_new(const struct key_def *key_def, struct tuple_format *format bool is_last_level, struct rlist *read_views); /** - * Add a mem as a source of iterator. - * @return 0 on success or not 0 on error (diag is set). + * Add a mem as a source to the iterator. + * @return 0 on success, -1 on error (diag is set). */ NODISCARD int -vy_write_iterator_add_mem(struct vy_stmt_stream *stream, struct vy_mem *mem); +vy_write_iterator_new_mem(struct vy_stmt_stream *stream, struct vy_mem *mem); /** - * Add a run slice as a source of iterator. - * @return 0 on success or not 0 on error (diag is set). + * Add a run slice as a source to the iterator. + * @return 0 on success, -1 on error (diag is set). */ NODISCARD int -vy_write_iterator_add_slice(struct vy_stmt_stream *stream, +vy_write_iterator_new_slice(struct vy_stmt_stream *stream, struct vy_slice *slice, struct vy_run_env *run_env); #endif /* INCLUDES_TARANTOOL_BOX_VY_WRITE_STREAM_H */ diff --git a/test/unit/vy_write_iterator.c b/test/unit/vy_write_iterator.c index 94c0087260..60d6dbc299 100644 --- a/test/unit/vy_write_iterator.c +++ b/test/unit/vy_write_iterator.c @@ -45,7 +45,7 @@ compare_write_iterator_results(struct key_def *key_def, vy_write_iterator_new(key_def, mem->format, mem->upsert_format, is_primary, is_last_level, &rv_list); fail_if(wi == NULL); - fail_if(vy_write_iterator_add_mem(wi, mem) != 0); + fail_if(vy_write_iterator_new_mem(wi, mem) != 0); struct tuple *ret; fail_if(wi->iface->start(wi) != 0); -- GitLab