diff --git a/doc/user/language-reference.xml b/doc/user/language-reference.xml index a85b8f15bf24998d43219ecb5c8ecd0ed4f2dc5e..ba494144849e554dd7ab4dc7e2e9b768097f2105 100644 --- a/doc/user/language-reference.xml +++ b/doc/user/language-reference.xml @@ -183,20 +183,23 @@ Take a snapshot of all data and store it in <filename><olink targetptr="snap_dir"/>/<latest-lsn>.snap</filename>. - To take a snapshot, Tarantool forks and quickly - <function>munmap(2)</function>s all memory except the area - where tuples are stored. Since all modern operating systems support - virtual memory copy-on-write, this effectively creates a - consistent snapshot of all tuples in the child process, - which is then written to disk tuple by tuple. Since a - snapshot is written sequentially, you can expect a very + To take a snapshot, Tarantool first enters the delayed + garbage collection mode for all data. In this mode, + tuples which were allocated before the snapshot has + started are not freed until the snapshot has finished. + To preserve consistency of the primary key, used to + iterate over tuples, a copy-on-write technique is employed. + If the master process changes part of a primary key, + the corresponding process page is split, and the snapshot + process obtains an old copy of the page. Since a + snapshot is written sequentially, one can expect a very high write performance (averaging to 80MB/second on modern disks), which means an average database instance gets saved in a matter of minutes. Note, that as long as there - are any changes to the parent memory through concurrent + are any changes to the parent index memory through concurrent updates, there are going to be page splits, and therefore - you need to have some extra free memory to run this - command. 15%-30% of <olink targetptr="slab_alloc_arena"/> + one needs to have some extra free memory to run this + command. 10% of <olink targetptr="slab_alloc_arena"/> is, on average, sufficient. 
This statement waits until a snapshot is taken and returns operation result. For example: diff --git a/doc/user/preface.xml b/doc/user/preface.xml index d1878e68b88a4f3eb8a043a1bd91df3e83c27053..b208307440759e28c99f69e432cee1e9345356c3 100644 --- a/doc/user/preface.xml +++ b/doc/user/preface.xml @@ -39,11 +39,11 @@ A simple solution is employed: the server <emphasis role="strong">can be requested to save a concise snapshot</emphasis> of - its current data. The underlying operating system's - <quote>copy-on-write</quote> feature is employed to take the - snapshot in a quick, resource-savvy and non-blocking manner. - The <quote>copy-on-write</quote> technique guarantees that - snapshotting has minimal impact on server performance. + its current data. A combination of delayed garbage collection + for data pages and <quote>copy-on-write</quote> technique for + index pages is employed to provide the snapshot process + with a consistent read view, so that the snapshot is taken + in a quick, resource-savvy and non-blocking manner. </para> <para> diff --git a/include/qbuf.h b/include/qbuf.h new file mode 100644 index 0000000000000000000000000000000000000000..3feb743171dad86433c961097fa556b5d036cadd --- /dev/null +++ b/include/qbuf.h @@ -0,0 +1,102 @@ +#ifndef TARANTOOL_QBUF_H_INCLUDED +#define TARANTOOL_QBUF_H_INCLUDED +/* + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * 1. Redistributions of source code must retain the above + * copyright notice, this list of conditions and the + * following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY <COPYRIGHT HOLDER> ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * <COPYRIGHT HOLDER> OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF + * THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <stdlib.h> +#include <string.h> /* memmove(), memcpy() */ +#define QBUF_WATERMARK (512 * sizeof(void*)) + +struct qbuf { + char *buf; + size_t bottom; /* advanced by batch free */ + size_t top; + size_t size; /* total buffer size */ +}; + +static inline int +qbuf_init(struct qbuf *q, size_t size) { + q->size = size; + q->bottom = 0; + q->top = 0; + q->buf = (char*)malloc(size); + return (q->buf == NULL ? -1 : 0); +} + +static inline void +qbuf_free(struct qbuf *q) { + if (q->buf) { + free(q->buf); + q->buf = NULL; + } +} + +static inline int +qbuf_n(struct qbuf *q) { + return (q->top - q->bottom) / sizeof(void*); +} + +#ifndef unlikely +# define unlikely(EXPR) __builtin_expect(!!(EXPR), 0) +#endif + +static inline int +qbuf_push(struct qbuf *q, void *ptr) +{ + /* reduce memory allocation and memmove + * effect by reusing free pointers buffer space only after the + * watermark frees reached. 
*/ + if (unlikely(q->bottom >= QBUF_WATERMARK)) { + /* live data is [bottom, top): move top - bottom bytes */ + memmove(q->buf, q->buf + q->bottom, q->top - q->bottom); + q->top -= q->bottom; + q->bottom = 0; + } + if (unlikely((q->top + sizeof(void*)) > q->size)) { + size_t newsize = q->size * 2; + char *nbuf = (char*)realloc((void*)q->buf, newsize); + if (unlikely(nbuf == NULL)) + return -1; + q->buf = nbuf; + q->size = newsize; + } + memcpy(q->buf + q->top, (char*)&ptr, sizeof(ptr)); + q->top += sizeof(void*); + return 0; +} + +static inline void* +qbuf_pop(struct qbuf *q) { + if (unlikely(q->bottom == q->top)) + return NULL; + void *ret = *(void**)(q->buf + q->bottom); + q->bottom += sizeof(void*); + return ret; +} + +#endif diff --git a/include/salloc.h b/include/salloc.h index 4b7e5e97e427490dcb1a07a1517c59399dd4cef2..773bdd08719d5938ff002774a650a4d050e33efd 100644 --- a/include/salloc.h +++ b/include/salloc.h @@ -38,7 +38,9 @@ bool salloc_init(size_t size, size_t minimal, double factor); void salloc_free(void); void *salloc(size_t size, const char *what); void sfree(void *ptr); +void sfree_delayed(void *ptr); void slab_validate(); +void salloc_protect(void); /** Statistics on utilization of a single slab class. 
*/ struct slab_cache_stats { diff --git a/src/box/space.cc b/src/box/space.cc index d9c255010411abbeb9b31634804c152533b49948..3ee39e0ca830d689371baf277ce7dc96b1b48d9a 100644 --- a/src/box/space.cc +++ b/src/box/space.cc @@ -216,7 +216,7 @@ space_free(void) mh_i32ptr_node(spaces, i)->val; space_delete(space); } - tuple_free(); + tuple_format_free(); } @@ -265,7 +265,7 @@ void space_init(void) { spaces = mh_i32ptr_new(); - tuple_init(); + tuple_format_init(); /* configure regular spaces */ space_config(); diff --git a/src/box/tuple.cc b/src/box/tuple.cc index 0daec173676555e64ebe8618fb1fc20d6ed4bd7b..ce8d5a9e98662f9b6a64d8f5ece37ace68f06f9f 100644 --- a/src/box/tuple.cc +++ b/src/box/tuple.cc @@ -203,6 +203,16 @@ tuple_init_field_map(struct tuple *tuple, struct tuple_format *format) } } +/** + * Incremented on every snapshot and is used to distinguish tuples + * which were created after start of a snapshot (these tuples can + * be freed right away, since they are not used for snapshot) or + * before start of a snapshot (these tuples can be freed only + * after the snapshot has finished, otherwise it'll write bad data + * to the snapshot file). 
+ */ +extern uint32_t snapshot_version; + /** Allocate a tuple */ struct tuple * tuple_alloc(struct tuple_format *format, size_t size) @@ -212,6 +222,7 @@ tuple_alloc(struct tuple_format *format, size_t size) struct tuple *tuple = (struct tuple *)(ptr + format->field_map_size); tuple->refs = 0; + tuple->version = snapshot_version; tuple->bsize = size; tuple->format_id = tuple_format_id(format); @@ -229,7 +240,10 @@ tuple_free(struct tuple *tuple) say_debug("tuple_free(%p)", tuple); assert(tuple->refs == 0); char *ptr = (char *) tuple - tuple_format(tuple)->field_map_size; - sfree(ptr); + if (tuple->version == snapshot_version) + sfree(ptr); + else + sfree_delayed(ptr); } /** @@ -510,13 +524,13 @@ tuple_compare_with_key(const struct tuple *tuple, const char *key, } void -tuple_init() +tuple_format_init() { tuple_format_ber = tuple_format_new(NULL, 0); } void -tuple_free() +tuple_format_free() { for (struct tuple_format **format = tuple_formats; format < tuple_formats + formats_size; diff --git a/src/box/tuple.h b/src/box/tuple.h index bd893244c2040773087d8a61e09f1f2f5e2a796e..9b663cbb01a2aa17ba3fc46fe9428bf720658dff 100644 --- a/src/box/tuple.h +++ b/src/box/tuple.h @@ -115,6 +115,8 @@ tuple_format_new(struct key_def *key_def, uint32_t key_count); */ struct tuple { + /** snapshot generation version */ + uint32_t version; /** reference counter */ uint16_t refs; /** format identifier */ @@ -370,10 +372,10 @@ tuple_to_luabuf(struct tuple *tuple, struct luaL_Buffer *b); /** Initialize tuple library */ void -tuple_init(); +tuple_format_init(); /** Cleanup tuple library */ void -tuple_free(); +tuple_format_free(); #endif /* TARANTOOL_BOX_TUPLE_H_INCLUDED */ diff --git a/src/salloc.cc b/src/salloc.cc index 12257843c6a84cbf39c715e682578dc5f4331e90..4a15de7392d72c27fe8b5a90de901e606c107122 100644 --- a/src/salloc.cc +++ b/src/salloc.cc @@ -61,14 +61,16 @@ static const size_t MAX_SLAB_ITEM = 1 << 20; size_t MAX_SLAB_ITEM_COUNT; struct slab_item { - struct slab_item *next; + 
SLIST_ENTRY(slab_item) next; }; +SLIST_HEAD(item_slist_head, slab_item); + struct slab { uint32_t magic; size_t used; size_t items; - struct slab_item *free; + struct item_slist_head free; struct slab_cache *cache; void *brk; SLIST_ENTRY(slab) link; @@ -88,7 +90,10 @@ struct slab_cache { struct arena { void *mmap_base; size_t mmap_size; - + /** How many item tuples do we have stacked for delayed free. */ + int64_t delayed_free_count; + /** How many items in the delayed free list to free at once. */ + size_t delayed_free_batch; void *base; size_t size; size_t used; @@ -96,6 +101,11 @@ struct arena { }; static uint32_t slab_active_caches; +/** + * Delayed garbage collection for items which are used + * in a forked process. + */ +static struct item_slist_head free_delayed; static struct slab_cache slab_caches[256]; static struct arena arena; @@ -126,17 +136,22 @@ slab_caches_init(size_t minimal, double factor) MAX_SLAB_ITEM_COUNT = (size_t) (SLAB_SIZE - sizeof(struct slab)) / slab_caches[0].item_size; + + SLIST_INIT(&free_delayed); } static bool arena_init(struct arena *arena, size_t size) { + arena->delayed_free_batch = 100; + arena->delayed_free_count = 0; + arena->used = 0; arena->size = size - size % SLAB_SIZE; arena->mmap_size = size - size % SLAB_SIZE + SLAB_SIZE; /* spend SLAB_SIZE bytes on align :-( */ arena->mmap_base = mmap(NULL, arena->mmap_size, - PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); if (arena->mmap_base == MAP_FAILED) { say_syserror("mmap"); return false; @@ -149,6 +164,16 @@ arena_init(struct arena *arena, size_t size) return true; } +/** + * Protect slab arena from changes. A safeguard used in a forked + * process to prevent changes to the master process arena. 
+ */ +void +salloc_protect(void) +{ + mprotect(arena.mmap_base, arena.mmap_size, PROT_READ); +} + static void * arena_alloc(struct arena *arena) { @@ -182,7 +207,6 @@ salloc_free(void) { if (arena.mmap_base != NULL) munmap(arena.mmap_base, arena.mmap_size); - memset(&arena, 0, sizeof(struct arena)); } @@ -192,7 +216,7 @@ format_slab(struct slab_cache *cache, struct slab *slab) assert(cache->item_size <= MAX_SLAB_ITEM); slab->magic = SLAB_MAGIC; - slab->free = NULL; + SLIST_INIT(&slab->free); slab->cache = cache; slab->items = 0; slab->used = 0; @@ -269,6 +293,58 @@ valid_item(struct slab *slab, void *item) } #endif +void +sfree(void *ptr) +{ + struct slab *slab = slab_header(ptr); + struct slab_cache *cache = slab->cache; + struct slab_item *item = (struct slab_item *) ptr; + + if (fully_formatted(slab) && SLIST_EMPTY(&slab->free)) + TAILQ_INSERT_TAIL(&cache->free_slabs, slab, cache_free_link); + + assert(valid_item(slab, item)); + assert(SLIST_EMPTY(&slab->free) || valid_item(slab, SLIST_FIRST(&slab->free))); + + SLIST_INSERT_HEAD(&slab->free, item, next); + slab->used -= cache->item_size + sizeof(red_zone); + slab->items -= 1; + + if (slab->items == 0) { + TAILQ_REMOVE(&cache->free_slabs, slab, cache_free_link); + TAILQ_REMOVE(&cache->slabs, slab, cache_link); + SLIST_INSERT_HEAD(&arena.free_slabs, slab, free_link); + } + + VALGRIND_FREELIKE_BLOCK(item, sizeof(red_zone)); +} + +static void +sfree_batch(void) +{ + ssize_t batch = arena.delayed_free_batch; + + while (--batch >= 0 && !SLIST_EMPTY(&free_delayed)) { + assert(arena.delayed_free_count > 0); + struct slab_item *item = SLIST_FIRST(&free_delayed); + SLIST_REMOVE_HEAD(&free_delayed, next); + arena.delayed_free_count--; + sfree(item); + } +} + +void +sfree_delayed(void *ptr) +{ + if (ptr == NULL) + return; + struct slab_item *item = (struct slab_item *)ptr; + struct slab *slab = slab_header(item); + assert(valid_item(slab, item)); + SLIST_INSERT_HEAD(&free_delayed, item, next); + arena.delayed_free_count++; 
+} + void * salloc(size_t size, const char *what) { @@ -276,6 +352,8 @@ salloc(size_t size, const char *what) struct slab *slab; struct slab_item *item; + sfree_batch(); + if ((cache = cache_for(size)) == NULL || (slab = slab_of(cache)) == NULL) { @@ -283,21 +361,20 @@ salloc(size_t size, const char *what) "slab allocator", what); } - if (slab->free == NULL) { + if (SLIST_EMPTY(&slab->free)) { assert(valid_item(slab, slab->brk)); item = (struct slab_item *) slab->brk; memcpy((char *)item + cache->item_size, red_zone, sizeof(red_zone)); slab->brk = (char *) slab->brk + cache->item_size + sizeof(red_zone); } else { - assert(valid_item(slab, slab->free)); - item = slab->free; - + item = SLIST_FIRST(&slab->free); + assert(valid_item(slab, item)); (void) VALGRIND_MAKE_MEM_DEFINED(item, sizeof(void *)); - slab->free = item->next; + SLIST_REMOVE_HEAD(&slab->free, next); (void) VALGRIND_MAKE_MEM_UNDEFINED(item, sizeof(void *)); } - if (fully_formatted(slab) && slab->free == NULL) + if (fully_formatted(slab) && SLIST_EMPTY(&slab->free)) TAILQ_REMOVE(&cache->free_slabs, slab, cache_free_link); slab->used += cache->item_size + sizeof(red_zone); @@ -307,36 +384,6 @@ salloc(size_t size, const char *what) return (void *)item; } -void -sfree(void *ptr) -{ - if (ptr == NULL) - return; - struct slab *slab = slab_header(ptr); - struct slab_cache *cache = slab->cache; - struct slab_item *item = (struct slab_item *) ptr; - - if (fully_formatted(slab) && slab->free == NULL) - TAILQ_INSERT_TAIL(&cache->free_slabs, slab, cache_free_link); - - assert(valid_item(slab, item)); - assert(slab->free == NULL || valid_item(slab, slab->free)); - - item->next = slab->free; - slab->free = item; - slab->used -= cache->item_size + sizeof(red_zone); - slab->items -= 1; - - if (slab->items == 0) { - TAILQ_REMOVE(&cache->free_slabs, slab, cache_free_link); - TAILQ_REMOVE(&cache->slabs, slab, cache_link); - SLIST_INSERT_HEAD(&arena.free_slabs, slab, free_link); - } - - VALGRIND_FREELIKE_BLOCK(item, 
sizeof(red_zone)); -} - - size_t salloc_ptr_to_index(void *ptr) { diff --git a/src/tarantool.cc b/src/tarantool.cc index 9bb883a8a67bb0a61aa3557c568d87d5542e59be..4e7777de01fe4736221107752e2b628a5ba9886a 100644 --- a/src/tarantool.cc +++ b/src/tarantool.cc @@ -84,6 +84,8 @@ struct tarantool_cfg cfg; static ev_signal *sigs = NULL; int snapshot_pid = 0; /* snapshot processes pid */ +uint32_t snapshot_version = 0; + extern const void *opt_def; static int @@ -314,12 +316,21 @@ tarantool_uptime(void) return ev_now() - start_time; } +void snapshot_exit(int code, void* arg) { + (void)arg; + fflush(NULL); + _exit(code); +} + int snapshot(void) { if (snapshot_pid) return EINPROGRESS; + /* increment snapshot version */ + snapshot_version++; + pid_t p = fork(); if (p < 0) { say_syserror("fork"); @@ -332,6 +343,8 @@ snapshot(void) return (WIFSIGNALED(status) ? EINTR : WEXITSTATUS(status)); } + salloc_protect(); + fiber_set_name(fiber, "dumper"); set_proc_title("dumper (%" PRIu32 ")", getppid()); @@ -340,6 +353,14 @@ snapshot(void) * parent stdio buffers at exit(). */ close_all_xcpt(1, sayfd); + /* + * We must avoid double destruction of tuples on exit. + * Since there is no way to remove existing handlers + * registered in the master process, and snapshot_save() + * may call exit(), push a top-level handler which will do + * _exit() for us. + */ + on_exit(snapshot_exit, NULL); snapshot_save(recovery_state, box_snapshot); exit(EXIT_SUCCESS);