diff --git a/src/box/vinyl.c b/src/box/vinyl.c index bad56ea69af5edb41229031544fc433d45b94e16..1997eaa65dcecc2f151775a66c6d5253883d6505 100644 --- a/src/box/vinyl.c +++ b/src/box/vinyl.c @@ -717,6 +717,17 @@ struct PACKED vy_tuple_info { uint8_t reserved[3]; }; +struct PACKED vy_range_info { + /** Offset and size of range->begin. */ + uint32_t begin_key_offset; + uint32_t begin_key_size; + /** Offset and size of range->end. */ + uint32_t end_key_offset; + uint32_t end_key_size; + /** Offset of the first run in the range */ + uint32_t first_run_offset; +}; + struct vy_run { struct vy_run_info info; /* buffer with struct vy_page_info */ @@ -728,10 +739,10 @@ struct vy_run { struct vy_range { int64_t id; - /** - * Minimal key of the range - */ - struct vy_tuple *min_key; + /** Range lower bound. NULL if range is leftmost. */ + struct vy_tuple *begin; + /** Range upper bound. NULL if range is rightmost. */ + struct vy_tuple *end; struct vy_index *index; uint64_t update_time; uint32_t used; /* sum of mem->used */ @@ -833,8 +844,6 @@ struct vy_index { * For new ranges, use this id as a sequence. */ int64_t range_id_max; - /* The newest range id that was dumped to disk. */ - int64_t last_dump_range_id; uint32_t range_index_version; }; @@ -1272,9 +1281,6 @@ vy_run_size(struct vy_run *run) run->info.min_size; } -static int -vy_index_dump_range_index(struct vy_index *index); - static struct vy_run * vy_run_new() { @@ -1333,8 +1339,11 @@ vy_range_mem_min_lsn(struct vy_range *range) static int vy_range_cmp(struct vy_range *range, void *key, struct key_def *key_def) { - assert(range->min_key != NULL); - return vy_tuple_compare(range->min_key->data, key, key_def); + /* Any key > -inf. */ + if (range->begin == NULL) + return -1; + + return vy_tuple_compare(range->begin->data, key, key_def); } static int @@ -1342,8 +1351,14 @@ vy_range_cmpnode(struct vy_range *n1, struct vy_range *n2, struct key_def *key_d { if (n1 == n2) return 0; - assert(n1->min_key != NULL && n2->min_key != NULL); - return vy_tuple_compare(n1->min_key->data, n2->min_key->data, key_def); + + /* Any key > -inf. */ + if (n1->begin == NULL) + return -1; + if (n2->begin == NULL) + return 1; + + return vy_tuple_compare(n1->begin->data, n2->begin->data, key_def); } static uint64_t @@ -1504,42 +1519,9 @@ vy_range_iterator_next(struct vy_range_iterator *ii) } } -static int -vy_range_init_min_key(struct vy_range *range) -{ - struct vy_index *index = range->index; - const char *min_key = NULL; - - assert(range->min_key == NULL); - assert(range->used == 0); - - /* Find the minimal key if any. */ - for (struct vy_run *run = range->run; run != NULL; run = run->next) { - struct vy_page_info *p = vy_run_page(run, 0); - const char *key = vy_run_min_key(run, p); - if (min_key == NULL || - vy_tuple_compare(key, min_key, index->key_def) < 0) { - min_key = key; - } - } - - /* Create a tuple with the minimal key. */ - if (min_key == NULL) { - range->min_key = vy_tuple_from_key(index, NULL, 0); - } else { - range->min_key = vy_tuple_extract_key_raw(index, min_key); - } - - if (range->min_key == NULL) - return -1; - - return 0; -} - static void vy_index_add_range(struct vy_index *index, struct vy_range *range) { - assert(range->min_key != NULL); vy_range_tree_insert(&index->tree, range); index->range_index_version++; index->range_count++; @@ -1553,6 +1535,239 @@ vy_index_remove_range(struct vy_index *index, struct vy_range *range) index->range_count--; } +/* + * Check if a is left-adjacent to b, i.e. a->end == b->begin. + */ +static bool +vy_range_is_adjacent(struct vy_range *a, struct vy_range *b, + struct key_def *key_def) +{ + if (a->end == NULL || b->begin == NULL) + return false; + return vy_tuple_compare(a->end->data, b->begin->data, key_def) == 0; +} + +/* + * Check if a precedes b, i.e. a->end <= b->begin. + */ +static bool +vy_range_precedes(struct vy_range *a, struct vy_range *b, + struct key_def *key_def) +{ + if (a->end == NULL || b->begin == NULL) + return false; + return vy_tuple_compare(a->end->data, b->begin->data, key_def) <= 0; +} + +/* + * Check if a ends before b, i.e. a->end < b->end. + */ +static bool +vy_range_ends_before(struct vy_range *a, struct vy_range *b, + struct key_def *key_def) +{ + if (b->end == NULL) + return a->end != NULL; + if (a->end == NULL) + return false; + return vy_tuple_compare(a->end->data, b->end->data, key_def) < 0; +} + +/* + * Check if ranges present in an index span a range w/o holes. If they + * do, delete the range, otherwise remove all ranges of the index + * intersecting the range (if any) and insert the range instead. + */ +static void +vy_index_recover_range(struct vy_index *index, struct vy_range *range) +{ + /* + * The algorithm can be briefly outlined by the following steps: + * + * 1. Find the first range in the index having an intersection + * with the given one. If there is no such range, go to step + * 4, otherwise check if the found range can be the first + * spanning range, i.e. if it starts before or at the same + * point as the given range. If it does, proceed to step 2, + * otherwise go to step 3. + * + * 2. Check if there are holes in the span. To do it, iterate + * over all intersecting ranges and check that for each two + * neighbouring ranges the end of the first one equals the + * beginning of the second one. If there is a hole, proceed + * to step 3, otherwise delete the given range and return. + * + * 3. Remove all ranges intersecting the given range from the + * index. + * + * 4. Insert the given range to the index. + */ + struct vy_range *first, *prev, *n; + + first = vy_range_tree_first(&index->tree); + if (first == NULL) { + /* Trivial case - the index tree is empty. */ + goto insert; + } + + /* + * 1. Find the first intersecting range. + */ + if (range->begin == NULL) { + /* + * The given range starts with -inf. + * Check the leftmost. + */ + if (vy_range_precedes(range, first, index->key_def)) { + /* + * The given range precedes the leftmost, + * hence no intersection is possible. + * + * ----range----| |----first----| + */ + goto insert; + } + if (first->begin != NULL) { + /* + * The leftmost range does not span -inf, + * so there cannot be a span, but there is + * an intersection. + * + * -----range-----| + * |------first------| + */ + goto replace; + } + /* + * Possible span. Check for holes. + * + * --------range--------| + * ----first----| + */ + goto check; + } + /* + * The given range starts with a finite key (> -inf). + * Check the predecessor. + */ + struct vy_range_tree_key tree_key = { + .data = range->begin->data, + }; + prev = vy_range_tree_psearch(&index->tree, &tree_key); + if (prev == NULL) { + /* + * There is no predecessor, i.e. no range with + * begin <= range->begin. Check if the first range + * in the index intersects the given range. + */ + if (vy_range_precedes(range, first, index->key_def)) { + /* + * No intersections. The given range is + * going to be leftmost in the index. + * + * |----range----| |---first---| + */ + goto insert; + } + /* + * Neither strict succession nor strict precedence: + * the first range intersects the given one. + * + * |------range------| + * |------first------| + */ + goto replace; + } + /* + * There is a predecessor. Check whether it intersects + * the given range. + */ + if (vy_range_precedes(prev, range, index->key_def)) { + /* + * The predecessor does not intersect the given + * range, hence there is no span. Check if there + * is an intersection with the successor (if any). + */ + n = vy_range_tree_next(&index->tree, prev); + if (n == NULL || vy_range_precedes(range, n, index->key_def)) { + /* + * No successor or the given range + * precedes it, hence no intersections. + * + * |--prev--| |--range--| |--next--| + */ + goto insert; + } else { + /* + * There is an overlap with the successor. + * + * |--prev--| |--range--| + * |-----next-----| + */ + first = n; + goto replace; + } + } else { + /* + * There is an intersection between the given range + * and the predecessor, so there might be a span. + * Check for holes. + * + * |-------prev-------| + * |-------range-------| + */ + first = prev; + } +check: + /* + * 2. Check for holes in the spanning range. + */ + n = first; + prev = NULL; + do { + if (prev != NULL && + !vy_range_is_adjacent(prev, n, index->key_def)) { + /* + * There is a hole in the spanning range. + * + * |---prev---| |---next---| + * |----------range----------| + */ + break; + } + if (!vy_range_ends_before(n, range, index->key_def)) { + /* + * Spanned the whole range w/o holes. + * + * |---next---| + * |----------range----------| + */ + vy_range_delete(range); + return; + } + prev = n; + n = vy_range_tree_next(&index->tree, n); + } while (n != NULL); + /* Fall through. */ +replace: + /* + * 3. Remove intersecting ranges. + */ + n = first; + do { + struct vy_range *next = vy_range_tree_next(&index->tree, n); + vy_index_remove_range(index, n); + vy_range_delete(n); + n = next; + } while (n != NULL && !vy_range_precedes(range, n, index->key_def)); + /* Fall through. */ +insert: + /* + * 4. Insert the given range to the index. + */ + vy_index_add_range(index, range); +} + /* dump tuple to the run page buffers (tuple header and data) */ static int vy_run_dump_tuple(struct vy_tuple *value, struct vy_buf *info_buf, @@ -1839,6 +2054,17 @@ vy_run_write(int fd, struct vy_write_iterator *wi, return -1; } +static int +vy_parse_range_name(const char *name, int64_t *index_lsn, int64_t *range_id) +{ + int n = 0; + + sscanf(name, "%"SCNx64".%"SCNx64".range%n", index_lsn, range_id, &n); + if (name[n] != '\0') + return -1; + return 0; +} + static struct vy_range * vy_range_new(struct vy_index *index, int64_t id) { @@ -1859,6 +2085,8 @@ vy_range_new(struct vy_index *index, int64_t id) } else { range->id = ++index->range_id_max; } + snprintf(range->path, PATH_MAX, "%s/%016"PRIx64".%016"PRIx64".range", + index->path, index->key_def->opts.lsn, range->id); range->mem_count = 1; range->fd = -1; range->index = index; @@ -1895,13 +2123,61 @@ vy_range_read(struct vy_range *range, void *buf, size_t size, off_t offset) return n; } +static int +vy_load_tuple(struct vy_range *range, size_t size, off_t offset, + struct vy_tuple **tuple) +{ + int rc = 0; + char *buf; + + buf = malloc(size); + if (buf == NULL) { + diag_set(OutOfMemory, size, "malloc", "buf"); + return -1; + } + + if (vy_range_read(range, buf, size, offset) < 0 || + (*tuple = vy_tuple_extract_key_raw(range->index, buf)) == NULL) + rc = -1; + + free(buf); + return rc; +} + +static int +vy_load_range_header(struct vy_range *range, off_t *offset) +{ + struct vy_range_info info; + + if (vy_range_read(range, &info, sizeof(info), 0) < 0) + return -1; + + assert(range->begin == NULL); + if (info.begin_key_size != 0 && + vy_load_tuple(range, info.begin_key_size, + info.begin_key_offset, &range->begin) == -1) + return -1; + + assert(range->end == NULL); + if (info.end_key_size != 0 && + vy_load_tuple(range, info.end_key_size, + info.end_key_offset, &range->end) == -1) + return -1; + + *offset = info.first_run_offset; + return 0; +} + static int vy_range_recover(struct vy_range *range) { struct vy_run *run = NULL; - off_t offset = 0; + off_t offset; int rc = -1; + if (vy_load_range_header(range, &offset) == -1) + goto out; + while (true) { struct vy_run_info *h; ssize_t n; @@ -1951,9 +2227,6 @@ vy_range_recover(struct vy_range *range) goto out; } - if (vy_range_init_min_key(range) == -1) - goto out; - rc = 0; /* success */ out: if (run) @@ -1961,25 +2234,18 @@ vy_range_recover(struct vy_range *range) return rc; } -int -vy_range_open(struct vy_index *index, struct vy_range *range, char *path) +static int +vy_range_open(struct vy_range *range) { - snprintf(range->path, PATH_MAX, "%s", path); - int rc = range->fd = open(path, O_RDWR); + int rc = range->fd = open(range->path, O_RDWR); if (unlikely(rc == -1)) { vy_error("index file '%s' open error: %s ", - path, strerror(errno)); + range->path, strerror(errno)); return -1; } rc = vy_range_recover(range); if (unlikely(rc == -1)) return -1; - - /* Attach range to the index and update statistics. */ - vy_index_add_range(index, range); - index->size += vy_range_size(range); - /* schedule range */ - vy_scheduler_add_range(index->env->scheduler, range); return 0; } @@ -2015,8 +2281,10 @@ vy_range_delete(struct vy_range *range) int rcret = 0; - if (range->min_key) - vy_tuple_unref(range->min_key); + if (range->begin) + vy_tuple_unref(range->begin); + if (range->end) + vy_tuple_unref(range->end); vy_range_delete_runs(range); vy_range_delete_mems(range); @@ -2028,6 +2296,42 @@ vy_range_delete(struct vy_range *range) return rcret; } +static int +vy_write_range_header(int fd, struct vy_range *range) +{ + struct vy_range_info info; + off_t offset = sizeof(info); + + memset(&info, 0, sizeof(info)); + + if (range->begin) { + if (vy_pwrite_file(fd, range->begin->data, + range->begin->size, offset) < 0) + return -1; + info.begin_key_offset = offset; + info.begin_key_size = range->begin->size; + offset += range->begin->size; + } + if (range->end) { + if (vy_pwrite_file(fd, range->end->data, + range->end->size, offset) < 0) + return -1; + info.end_key_offset = offset; + info.end_key_size = range->end->size; + offset += range->end->size; + } + + info.first_run_offset = offset; + + if (vy_pwrite_file(fd, &info, sizeof(info), 0) < 0) + return -1; + + if (lseek(fd, offset, SEEK_SET) == -1) + return -1; + + return 0; +} + /* * Append tuples returned by a write iterator to a range file until * split_key is encountered. p_fd is supposed to point to the range file @@ -2059,11 +2363,14 @@ vy_range_write_run(struct vy_range *range, struct vy_write_iterator *wi, fd = mkstemp(path); if (fd < 0) { create_failed: - vy_error("Failed to create temp file '%s': %s", - path, strerror(errno)); + vy_error("Failed to create temp file: %s", + strerror(errno)); goto fail; } created = true; + + if (vy_write_range_header(fd, range) != 0) + goto fail; } /* Append tuples to the range file. */ @@ -2076,8 +2383,6 @@ vy_range_write_run(struct vy_range *range, struct vy_write_iterator *wi, * We've successfully written a run to a new range file. * Commit the range by linking the file to a proper name. */ - snprintf(range->path, PATH_MAX, "%s/%016"PRIx64".range", - index->path, range->id); if (rename(path, range->path) != 0) { vy_error("Failed to link range file '%s': %s", range->path, strerror(errno)); @@ -2109,7 +2414,7 @@ vy_range_write_run(struct vy_range *range, struct vy_write_iterator *wi, * 4/3 * range_size. */ static bool -vy_range_need_split(struct vy_range *range, const char **split_key) +vy_range_need_split(struct vy_range *range, const char **p_split_key) { struct key_def *key_def = range->index->key_def; struct vy_run *run = NULL; @@ -2127,14 +2432,17 @@ vy_range_need_split(struct vy_range *range, const char **split_key) return false; /* Find the median key in the oldest run (approximately). */ - struct vy_page_info *p = vy_run_page(run, run->info.count / 2); - const char *k = vy_run_min_key(run, p); + struct vy_page_info *mid_page = vy_run_page(run, run->info.count / 2); + const char *split_key = vy_run_min_key(run, mid_page); + + struct vy_page_info *first_page = vy_run_page(run, 0); + const char *min_key = vy_run_min_key(run, first_page); /* No point in splitting if a new range is going to be empty. */ - if (vy_tuple_compare(range->min_key->data, k, key_def) == 0) + if (vy_tuple_compare(min_key, split_key, key_def) == 0) return false; - *split_key = k; + *p_split_key = split_key; return true; } @@ -2171,11 +2479,19 @@ vy_range_compact_prepare(struct vy_range *range, parts[i].fd = -1; } - /* Set min keys for the new ranges. */ - vy_tuple_ref(range->min_key); - parts[0].range->min_key = range->min_key; - if (split_key != NULL) - parts[1].range->min_key = split_key; + /* Set begin and end keys for the new ranges. */ + if (range->begin) + vy_tuple_ref(range->begin); + if (range->end) + vy_tuple_ref(range->end); + parts[0].range->begin = range->begin; + if (split_key != NULL) { + vy_tuple_ref(split_key); + parts[0].range->end = split_key; + parts[1].range->begin = split_key; + parts[1].range->end = range->end; + } else + parts[0].range->end = range->end; /* Replace the old range with the new ones. */ vy_index_remove_range(index, range); @@ -2243,6 +2559,7 @@ vy_range_compact_commit(struct vy_range *range, int n_parts, vy_scheduler_add_range(range->index->env->scheduler, r); } index->range_index_version++; + vy_quota_release(index->env->quota, range->used); vy_range_delete(range); } @@ -2426,21 +2743,29 @@ vy_index_create(struct vy_index *index) } index->range_id_max = 0; - index->last_dump_range_id = 0; /* create initial range */ struct vy_range *range = vy_range_new(index, 0); if (unlikely(range == NULL)) return -1; - if (vy_range_init_min_key(range) != 0) { - vy_range_delete(range); - return -1; - } vy_index_add_range(index, range); vy_scheduler_add_range(index->env->scheduler, range); index->size = vy_range_size(range); return 0; } +static int +vy_range_id_cmp(const void *p1, const void *p2) +{ + struct vy_range *r1 = *(struct vy_range **)p1; + struct vy_range *r2 = *(struct vy_range **)p2; + + if (r1->id > r2->id) + return 1; + if (r1->id < r2->id) + return -1; + return 0; +} + /** * A quick intro into Vinyl cosmology and file format * -------------------------------------------------- @@ -2461,7 +2786,7 @@ vy_index_create(struct vy_index *index) * run. The page index of an active run is fully cached in RAM. * * All files of an index have the following name pattern: - * <lsn>.<range_id>.index + * <lsn>.<range_id>.range * and are stored together in the index directory. * * The <lsn> component represents LSN of index creation: it is used @@ -2471,87 +2796,72 @@ vy_index_create(struct vy_index *index) * * <range_id> component represents the id of the range in an * index. The id is a monotonically growing integer, and is - * assigned to a range when it's created. The header file of each - * range contains a full list of range ids of all ranges known to - * the index when this last range file was created. Thus by - * navigating to the latest range and reading its range directory, - * we can find out ids of all remaining ranges of the index and - * open them. + * assigned to a range when it's created. Thus newer ranges will + * have greater ids, and hence by recovering ranges with greater + * ids first and ignoring ranges which are already fully spanned, + * we can restore the whole index to its latest state. */ static int vy_index_open_ex(struct vy_index *index) { - /* - * The main index file name has format <lsn>.<range_id>.index. - * Load the index given its LSN and choose the maximal range_id - * among ranges within the same LSN. - */ - int64_t last_dump_range_id = 0; + struct vy_range **range_array = NULL; + int range_array_capacity = 0, n_ranges = 0; + int rc = -1; + DIR *index_dir; index_dir = opendir(index->path); if (!index_dir) { vy_error("Can't open dir %s", index->path); - return -1; + goto out; } struct dirent *dirent; while ((dirent = readdir(index_dir))) { - if (!strstr(dirent->d_name, ".index")) - continue; int64_t index_lsn; int64_t range_id; - if (sscanf(dirent->d_name, "%"SCNu64".%"SCNx64, - &index_lsn, &range_id) != 2) - continue; - /* - * Find the newest range in the last incarnation - * of this index. - */ - if (index_lsn != index->key_def->opts.lsn) - continue; - if (last_dump_range_id < range_id) - last_dump_range_id = range_id; - } - closedir(index_dir); - - if (last_dump_range_id == 0) - goto out; /* empty index */ - char path[PATH_MAX]; - snprintf(path, PATH_MAX, "%s/%016"PRIu64".%016"PRIx64".index", - index->path, index->key_def->opts.lsn, last_dump_range_id); - int fd = open(path, O_RDWR); - if (fd == -1) { - vy_error("Can't open index file %s: %s", - path, strerror(errno)); - return -1; - } + if (vy_parse_range_name(dirent->d_name, + &index_lsn, &range_id) != 0) + continue; /* unknown file */ + if (index_lsn != index->key_def->opts.lsn) + continue; /* different incarnation */ - int64_t range_id; - int size; - while ((size = read(fd, &range_id, sizeof(range_id))) == - sizeof(range_id)) { struct vy_range *range = vy_range_new(index, range_id); - if (!range) { - vy_error("%s", "Can't alloc range"); + if (!range) + goto out; + if (vy_range_open(range) != 0) { vy_range_delete(range); - return -1; + goto out; } - char range_path[PATH_MAX]; - snprintf(range_path, PATH_MAX, "%s/%016"PRIx64".range", - index->path, range_id); - if (vy_range_open(index, range, range_path)) { - vy_range_delete(range); - return -1; + + if (n_ranges == range_array_capacity) { + int n = range_array_capacity > 0 ? + range_array_capacity * 2 : 1; + void *p = realloc(range_array, n * sizeof(*range_array)); + if (p == NULL) { + diag_set(OutOfMemory, n * sizeof(*range_array), + "realloc", "range_array"); + vy_range_delete(range); + goto out; + } + range_array = p; + range_array_capacity = n; } + range_array[n_ranges++] = range; } + closedir(index_dir); + index_dir = NULL; + + /* + * Always prefer newer ranges (i.e. those that have greater ids) + * over older ones. Only fall back on an older range, if it has + * not been spanned by the time we get to it. The latter can + * only happen if there was an incomplete range split. + */ + qsort(range_array, n_ranges, sizeof(*range_array), vy_range_id_cmp); + for (int i = n_ranges - 1; i >= 0; i--) + vy_index_recover_range(index, range_array[i]); + n_ranges = 0; - close(fd); - if (size != 0) { - vy_error("Corrupted index file %s", path); - return -1; - } - index->last_dump_range_id = last_dump_range_id; -out: if (!index->range_count) { /* * Special case: index has no ranges @@ -2559,17 +2869,26 @@ vy_index_open_ex(struct vy_index *index) */ /* create initial range */ struct vy_range *range = vy_range_new(index, 0); - if (unlikely(range == NULL)) - return -1; - if (vy_range_init_min_key(range) != 0) { - vy_range_delete(range); - return -1; - } + if (range == NULL) + goto out; vy_index_add_range(index, range); - index->size = vy_range_size(range); } - return 0; + /* Update index size and make ranges visible to the scheduler. */ + for (struct vy_range *range = vy_range_tree_first(&index->tree); + range != NULL; range = vy_range_tree_next(&index->tree, range)) { + index->size += vy_range_size(range); + vy_scheduler_add_range(index->env->scheduler, range); + } + + rc = 0; /* success */ +out: + if (index_dir) + closedir(index_dir); + for (int i = 0; i < n_ranges; i++) + vy_range_delete(range_array[i]); + free(range_array); + return rc; } /** @@ -2787,15 +3106,6 @@ vy_task_dump_complete(struct vy_task *task) vy_mem_delete(mem); mem = next; } - - if (range->run_count == 1) { - /* - * The range file was created successfully, - * update the range index on disk. - */ - vy_index_dump_range_index(index); - } - out: vy_scheduler_add_range(index->env->scheduler, range); return 0; @@ -2871,8 +3181,14 @@ vy_task_compact_execute(struct vy_task *task) goto out; for (int i = 0; i < n_parts; i++) { struct vy_range_compact_part *p = &parts[i]; - struct vy_tuple *split_key = i < n_parts - 1 ? - parts[i + 1].range->min_key : NULL; + struct vy_tuple *split_key = parts[i].range->end; + + if (i > 0) + ERROR_INJECT(ERRINJ_VY_RANGE_SPLIT, + {vy_error("Failed to split range %s", + p->range->path); + rc = -1; goto out;}); + rc = vy_range_write_run(p->range, wi, split_key, &parts[i].fd, &p->run, &curr_tuple); if (rc != 0) @@ -2890,26 +3206,14 @@ vy_task_compact_execute(struct vy_task *task) static int vy_task_compact_complete(struct vy_task *task) { - struct vy_index *index = task->index; struct vy_range *range = task->compact.range; struct vy_range_compact_part *parts = task->compact.parts; int n_parts = task->compact.n_parts; - if (task->status != 0) { + if (task->status != 0) vy_range_compact_abort(range, n_parts, parts); - return 0; - } - - vy_range_compact_commit(range, n_parts, parts); - - if (vy_index_dump_range_index(index)) { - /* - * TODO: we should roll back the failed dump first, but it - * requires a redesign of the index change function. - */ - return -1; - } - + else + vy_range_compact_commit(range, n_parts, parts); return 0; } @@ -3533,8 +3837,10 @@ vy_index_gc(struct vy_index *index) { struct mh_i32ptr_t *ranges = NULL; DIR *dir = NULL; - ranges = mh_i32ptr_new(); + ERROR_INJECT(ERRINJ_VY_GC, {errno = EIO; goto error;}); + + ranges = mh_i32ptr_new(); if (ranges == NULL) goto error; /* @@ -3563,6 +3869,9 @@ vy_index_gc(struct vy_index *index) * identified as old, not all files. */ while ((dirent = readdir(dir))) { + int64_t index_lsn; + int64_t range_id; + if (!(strcmp(".", dirent->d_name))) continue; if (!(strcmp("..", dirent->d_name))) @@ -3574,19 +3883,12 @@ vy_index_gc(struct vy_index *index) is_vinyl_file = true; } */ - if (strstr(dirent->d_name, ".index")) { - is_vinyl_file = true; - int64_t lsn = 0; - sscanf(dirent->d_name, "%"SCNx64, &lsn); - if (lsn >= index->key_def->opts.lsn) - continue; - } - if (strstr(dirent->d_name, ".range")) { + if (vy_parse_range_name(dirent->d_name, + &index_lsn, &range_id) == 0) { is_vinyl_file = true; - uint64_t range_id = 0; - sscanf(dirent->d_name, "%"SCNx64, &range_id); mh_int_t range = mh_i32ptr_find(ranges, range_id, NULL); - if (range != mh_end(ranges)) + if (index_lsn == index->key_def->opts.lsn && + range != mh_end(ranges)) continue; } if (!is_vinyl_file) @@ -3600,8 +3902,10 @@ vy_index_gc(struct vy_index *index) error: say_syserror("failed to cleanup index directory %s", index->path); end: - closedir(dir); - mh_i32ptr_delete(ranges); + if (dir != NULL) + closedir(dir); + if (ranges != NULL) + mh_i32ptr_delete(ranges); } void @@ -3942,72 +4246,6 @@ vy_index_conf_create(struct vy_index *conf, struct key_def *key_def) return 0; } -static int -vy_index_dump_range_index(struct vy_index *index) -{ - if (index->range_id_max == index->last_dump_range_id) - return 0; - long int ranges_size = index->range_count * sizeof(int64_t); - int64_t *ranges = (int64_t *)malloc(ranges_size); - if (!ranges) { - vy_error("Can't alloc %li bytes", (long int)ranges_size); - return -1; - } - int range_no = 0; - struct vy_range *range = vy_range_tree_first(&index->tree); - do { - if (!range->run_count) { - continue; /* Skip empty ranges */ - } - ranges[range_no] = range->id; - ++range_no; - } while ((range = vy_range_tree_next(&index->tree, range))); - - if (!range_no) { - /* - * This index is entirely empty, we won't create - * any files on disk. - */ - free(ranges); - return 0; - } - - char path[PATH_MAX]; - snprintf(path, PATH_MAX, "%s/.tmpXXXXXX", index->path); - int fd = mkstemp(path); - if (fd == -1) { - vy_error("Can't create temporary file in %s: %s", - index->path, strerror(errno)); - free(ranges); - return -1; - } - int write_size = sizeof(uint64_t) * range_no; - if (write(fd, ranges, write_size) != write_size) { - free(ranges); - close(fd); - unlink(path); - vy_error("Can't write index file: %s", strerror(errno)); - return -1; - } - free(ranges); - fsync(fd); - close(fd); - - char new_path[PATH_MAX]; - snprintf(new_path, PATH_MAX, "%s/%016"PRIu64".%016"PRIx64".index", - index->path, index->key_def->opts.lsn, - index->range_id_max); - if (link(path, new_path)) { - vy_error("Can't dump index range dict %s: %s", - new_path, strerror(errno)); - unlink(path); - return -1; - } - index->last_dump_range_id = index->range_id_max; - unlink(path); - return 0; -} - /** * Check whether or not an index was created at the * given LSN. @@ -4030,18 +4268,17 @@ vy_index_exists(struct vy_index *index, int64_t lsn) return false; } /* - * Try to find an index file with a number in name + * Try to find a range file with a number in name * equal to the given LSN. */ - char target_name[PATH_MAX]; - snprintf(target_name, PATH_MAX, "%016"PRIu64, lsn); - size_t len = strlen(target_name); struct dirent *dirent; while ((dirent = readdir(dir))) { - if (strstr(dirent->d_name, ".index") && - strncmp(dirent->d_name, target_name, len) == 0) { + int64_t index_lsn; + int64_t range_id; + if (vy_parse_range_name(dirent->d_name, + &index_lsn, &range_id) == 0 && + index_lsn == lsn) break; - } } closedir(dir); return dirent != NULL; diff --git a/src/errinj.h b/src/errinj.h index 4c3619e9e82fd42cc158fcf30b074c5fb7e612eb..149d68a7db10dda30bdab7b345f65d1205bd2fd9 100644 --- a/src/errinj.h +++ b/src/errinj.h @@ -54,6 +54,8 @@ struct errinj { _(ERRINJ_TUPLE_ALLOC, false) \ _(ERRINJ_TUPLE_FIELD, false) \ _(ERRINJ_VY_RANGE_CREATE, false) \ + _(ERRINJ_VY_RANGE_SPLIT, false) \ + _(ERRINJ_VY_GC, false) \ _(ERRINJ_RELAY, false) ENUM0(errinj_enum, ERRINJ_LIST); diff --git a/test/box/errinj.result b/test/box/errinj.result index cbb4c9e264dc831251dfd9a36d204b196d2d5888..691d977f5230498c8dd369755b7525dd34b9ec45 100644 --- a/test/box/errinj.result +++ b/test/box/errinj.result @@ -16,6 +16,10 @@ errinj.info() state: false ERRINJ_RELAY: state: false + ERRINJ_VY_GC: + state: false + ERRINJ_VY_RANGE_SPLIT: + state: false ERRINJ_WAL_WRITE_DISK: state: false ERRINJ_VY_RANGE_CREATE: diff --git a/test/vinyl/gh.result b/test/vinyl/gh.result index 5afc08c982dcc97da9928fe89c4db20dda255876..a799da4bcb2a0264005aa4baab3f55c2d3962280 100644 --- a/test/vinyl/gh.result +++ b/test/vinyl/gh.result @@ -535,3 +535,48 @@ s:select{} s:drop() --- ... +-- +-- gh-1725: vinyl: merge iterator can't merge more than two runs +-- +s0 = box.schema.space.create('tweedledum', {engine = 'vinyl'}) +--- +... +i0 = s0:create_index('primary', { type = 'tree', parts = {1, 'unsigned'}}) +--- +... +-- integer keys +s0:replace{1, 'tuple'} +--- +- [1, 'tuple'] +... +box.snapshot() +--- +- ok +... +s0:replace{2, 'tuple 2'} +--- +- [2, 'tuple 2'] +... +box.snapshot() +--- +- ok +... +s0:insert{3, 'tuple 3'} +--- +- [3, 'tuple 3'] +... +s0.index['primary']:get{1} +--- +- [1, 'tuple'] +... +s0.index['primary']:get{2} +--- +- [2, 'tuple 2'] +... +s0.index['primary']:get{3} +--- +- [3, 'tuple 3'] +... +s0:drop() +--- +... diff --git a/test/vinyl/gh.test.lua b/test/vinyl/gh.test.lua index 284b6757771d24ed81caf1590a9cfa30f158e2fd..577d623b69fc498e01faa3378063fa029053f06b 100644 --- a/test/vinyl/gh.test.lua +++ b/test/vinyl/gh.test.lua @@ -212,3 +212,24 @@ s:select{} s:truncate() s:select{} s:drop() + +-- +-- gh-1725: vinyl: merge iterator can't merge more than two runs +-- + +s0 = box.schema.space.create('tweedledum', {engine = 'vinyl'}) +i0 = s0:create_index('primary', { type = 'tree', parts = {1, 'unsigned'}}) + +-- integer keys +s0:replace{1, 'tuple'} +box.snapshot() +s0:replace{2, 'tuple 2'} +box.snapshot() + +s0:insert{3, 'tuple 3'} + +s0.index['primary']:get{1} +s0.index['primary']:get{2} +s0.index['primary']:get{3} + +s0:drop() diff --git a/test/vinyl/recover.result b/test/vinyl/recover.result new file mode 100644 index 0000000000000000000000000000000000000000..8214143a17c8db91d2f134117dc2f2ed8ceb814e --- /dev/null +++ b/test/vinyl/recover.result @@ -0,0 +1,111 @@ +-- +-- Check that ranges left from old dumps and incomplete splits +-- are ignored during initial recovery +-- The idea behind the test is simple - create several invalid range files, +-- i.e. those left from previous dumps and incomplete splits, then restart +-- the server and check that the content of the space was not corrupted. +-- +-- To make it possible, we need to (1) prevent the garbage collector from +-- removing unused range files and (2) make the split procedure fail after +-- successfully writing the first range. We use error injection to achieve +-- that. +-- +-- The test runs as follows: +-- +-- 1. Disable garbage collection with the aid of error injection. +-- +-- 2. Add a number of tuples to the test space that would make it split. +-- Rewrite them several times with different values so that different +-- generations of ranges on disk would have different contents. +-- +-- 3. Inject error to the split procedure. +-- +-- 4. Rewrite the tuples another couple of rounds. This should trigger +-- split which is going to fail leaving invalid range files with newer +-- ids on the disk. +-- +-- 5. Restart the server and check that the test space content was not +-- corrupted. +-- +-- +test_run = require('test_run').new() +--- +... +errinj = box.error.injection +--- +... +s = box.schema.space.create('test', {engine='vinyl'}) +--- +... +_ = s:create_index('primary') +--- +... +test_run:cmd("setopt delimiter ';'") +--- +- true +... +function gen(i) + local pad_size = 256 + local range_count = 5 + local pad = string.rep('x', pad_size + i) + local n = (range_count + i) * math.floor(box.cfg.vinyl.range_size / pad_size) + for k = 1,n do + s:replace{k, i + k, pad} + end + box.snapshot() +end; +--- +... +test_run:cmd("setopt delimiter ''"); +--- +- true +... +errinj.set("ERRINJ_VY_GC", true) +--- +- ok +... +for i=1,4 do gen(i) end +--- +... +errinj.set("ERRINJ_VY_RANGE_SPLIT", true) +--- +- ok +... +for i=5,6 do gen(i) end +--- +... +test_run:cmd('restart server default') +s = box.space.test +--- +... +test_run:cmd("setopt delimiter ';'") +--- +- true +... +function check(i) + local pad_size = 256 + local range_count = 5 + local n = (range_count + i) * math.floor(box.cfg.vinyl.range_size / pad_size) + local n_corrupted = 0 + for k=1,n do + local v = s:get(k) + if not v or v[2] ~= i + k then + n_corrupted = n_corrupted + 1 + end + end + return n - s:count(), n_corrupted +end; +--- +... +test_run:cmd("setopt delimiter ''"); +--- +- true +... +check(6) +--- +- 0 +- 0 +... +s:drop() +--- +... diff --git a/test/vinyl/recover.test.lua b/test/vinyl/recover.test.lua new file mode 100644 index 0000000000000000000000000000000000000000..b5fd9e721f5a8f710584aee9a37a0ce5b52c2b58 --- /dev/null +++ b/test/vinyl/recover.test.lua @@ -0,0 +1,77 @@ +-- +-- Check that ranges left from old dumps and incomplete splits +-- are ignored during initial recovery +-- The idea behind the test is simple - create several invalid range files, +-- i.e. those left from previous dumps and incomplete splits, then restart +-- the server and check that the content of the space was not corrupted. +-- +-- To make it possible, we need to (1) prevent the garbage collector from +-- removing unused range files and (2) make the split procedure fail after +-- successfully writing the first range. We use error injection to achieve +-- that. +-- +-- The test runs as follows: +-- +-- 1. Disable garbage collection with the aid of error injection. +-- +-- 2. Add a number of tuples to the test space that would make it split. +-- Rewrite them several times with different values so that different +-- generations of ranges on disk would have different contents. +-- +-- 3. Inject error to the split procedure. +-- +-- 4. Rewrite the tuples another couple of rounds. This should trigger +-- split which is going to fail leaving invalid range files with newer +-- ids on the disk. +-- +-- 5. Restart the server and check that the test space content was not +-- corrupted. +-- +-- +test_run = require('test_run').new() +errinj = box.error.injection + +s = box.schema.space.create('test', {engine='vinyl'}) +_ = s:create_index('primary') + +test_run:cmd("setopt delimiter ';'") +function gen(i) + local pad_size = 256 + local range_count = 5 + local pad = string.rep('x', pad_size + i) + local n = (range_count + i) * math.floor(box.cfg.vinyl.range_size / pad_size) + for k = 1,n do + s:replace{k, i + k, pad} + end + box.snapshot() +end; +test_run:cmd("setopt delimiter ''"); + +errinj.set("ERRINJ_VY_GC", true) +for i=1,4 do gen(i) end +errinj.set("ERRINJ_VY_RANGE_SPLIT", true) +for i=5,6 do gen(i) end + +test_run:cmd('restart server default') + +s = box.space.test + +test_run:cmd("setopt delimiter ';'") +function check(i) + local pad_size = 256 + local range_count = 5 + local n = (range_count + i) * math.floor(box.cfg.vinyl.range_size / pad_size) + local n_corrupted = 0 + for k=1,n do + local v = s:get(k) + if not v or v[2] ~= i + k then + n_corrupted = n_corrupted + 1 + end + end + return n - s:count(), n_corrupted +end; +test_run:cmd("setopt delimiter ''"); + +check(6) + +s:drop() diff --git a/test/vinyl/suite.ini b/test/vinyl/suite.ini index 8e88de34f9ad73596df46957c34df26f4afc043a..7d474163883f55ad56a322e19656f2a268bff043 100644 --- a/test/vinyl/suite.ini +++ b/test/vinyl/suite.ini @@ -4,7 +4,7 @@ description = vinyl integration tests script = vinyl.lua disabled = split.test.lua write_iterator_rand.test.lua valgrind_disabled = -release_disabled = errinj.test.lua +release_disabled = errinj.test.lua recover.test.lua config = suite.cfg lua_libs = suite.lua stress.lua large.lua txn_proxy.lua ../box/lua/utils.lua use_unix_sockets = True