diff --git a/doc/user/language-reference.xml b/doc/user/language-reference.xml index a85b8f15bf24998d43219ecb5c8ecd0ed4f2dc5e..ba494144849e554dd7ab4dc7e2e9b768097f2105 100644 --- a/doc/user/language-reference.xml +++ b/doc/user/language-reference.xml @@ -183,20 +183,23 @@ Take a snapshot of all data and store it in <filename><olink targetptr="snap_dir"/>/<latest-lsn>.snap</filename>. - To take a snapshot, Tarantool forks and quickly - <function>munmap(2)</function>s all memory except the area - where tuples are stored. Since all modern operating systems support - virtual memory copy-on-write, this effectively creates a - consistent snapshot of all tuples in the child process, - which is then written to disk tuple by tuple. Since a - snapshot is written sequentially, you can expect a very + To take a snapshot, Tarantool first enters the delayed + garbage collection mode for all data. In this mode, + tuples which were allocated before the snapshot has + started are not freed until the snapshot has finished. + To preserve consistency of the primary key, used to + iterate over tuples, a copy-on-write technique is employed. + If the master process changes part of a primary key, + the corresponding process page is split, and the snapshot + process obtains an old copy of the page. Since a + snapshot is written sequentially, one can expect a very high write performance (averaging to 80MB/second on modern disks), which means an average database instance gets saved in a matter of minutes. Note, that as long as there - are any changes to the parent memory through concurrent + are any changes to the parent index memory through concurrent updates, there are going to be page splits, and therefore - you need to have some extra free memory to run this - command. 15%-30% of <olink targetptr="slab_alloc_arena"/> + one needs to have some extra free memory to run this + command. 10% of <olink targetptr="slab_alloc_arena"/> is, on average, sufficient. 
This statement waits until a snapshot is taken and returns operation result. For example: diff --git a/doc/user/preface.xml b/doc/user/preface.xml index d1878e68b88a4f3eb8a043a1bd91df3e83c27053..b208307440759e28c99f69e432cee1e9345356c3 100644 --- a/doc/user/preface.xml +++ b/doc/user/preface.xml @@ -39,11 +39,11 @@ A simple solution is employed: the server <emphasis role="strong">can be requested to save a concise snapshot</emphasis> of - its current data. The underlying operating system's - <quote>copy-on-write</quote> feature is employed to take the - snapshot in a quick, resource-savvy and non-blocking manner. - The <quote>copy-on-write</quote> technique guarantees that - snapshotting has minimal impact on server performance. + its current data. A combination of delayed garbage collection + for data pages and <quote>copy-on-write</quote> technique for + index pages is employed to provide the snapshot process + with a consistent read view, so that the snapshot is taken + in a quick, resource-savvy and non-blocking manner. </para> <para> diff --git a/include/qbuf.h b/include/qbuf.h new file mode 100644 index 0000000000000000000000000000000000000000..3feb743171dad86433c961097fa556b5d036cadd --- /dev/null +++ b/include/qbuf.h @@ -0,0 +1,102 @@ +#ifndef TARANTOOL_QBUF_H_INCLUDED +#define TARANTOOL_QBUF_H_INCLUDED +/* + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * 1. Redistributions of source code must retain the above + * copyright notice, this list of conditions and the + * following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY <COPYRIGHT HOLDER> ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * <COPYRIGHT HOLDER> OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF + * THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <stdlib.h> +#include <string.h> /* memmove(), memcpy() */ +#define QBUF_WATERMARK (512 * sizeof(void*)) + +struct qbuf { + char *buf; + size_t bottom; /* advanced by batch free */ + size_t top; + size_t size; /* total buffer size */ +}; + +static inline int +qbuf_init(struct qbuf *q, size_t size) { + q->size = size; + q->bottom = 0; + q->top = 0; + q->buf = (char*)malloc(size); + return (q->buf == NULL ? -1 : 0); +} + +static inline void +qbuf_free(struct qbuf *q) { + if (q->buf) { + free(q->buf); + q->buf = NULL; + } +} + +static inline int +qbuf_n(struct qbuf *q) { + return (q->top - q->bottom) / sizeof(void*); +} + +#ifndef unlikely +# define unlikely(EXPR) __builtin_expect(!!(EXPR), 0) +#endif + +static inline int +qbuf_push(struct qbuf *q, void *ptr) +{ + /* reduce memory allocation and memmove + * effect by reusing free pointers buffer space only after the + * watermark frees reached. 
*/ + if (unlikely(q->bottom >= QBUF_WATERMARK)) { + /* live data is [bottom, top): move top - bottom bytes */ + memmove(q->buf, q->buf + q->bottom, q->top - q->bottom); + q->top -= q->bottom; + q->bottom = 0; + } + if (unlikely((q->top + sizeof(void*)) > q->size)) { + size_t newsize = q->size * 2; + char *nbuf = (char*)realloc((void*)q->buf, newsize); + if (unlikely(nbuf == NULL)) + return -1; + q->buf = nbuf; + q->size = newsize; + } + memcpy(q->buf + q->top, (char*)&ptr, sizeof(ptr)); + q->top += sizeof(void*); + return 0; +} + +static inline void* +qbuf_pop(struct qbuf *q) { + if (unlikely(q->bottom == q->top)) + return NULL; + void *ret = *(void**)(q->buf + q->bottom); + q->bottom += sizeof(void*); + return ret; +} + +#endif diff --git a/include/salloc.h b/include/salloc.h index 4b7e5e97e427490dcb1a07a1517c59399dd4cef2..773bdd08719d5938ff002774a650a4d050e33efd 100644 --- a/include/salloc.h +++ b/include/salloc.h @@ -38,7 +38,9 @@ bool salloc_init(size_t size, size_t minimal, double factor); void salloc_free(void); void *salloc(size_t size, const char *what); void sfree(void *ptr); +void sfree_delayed(void *ptr); void slab_validate(); +void salloc_protect(void); /** Statistics on utilization of a single slab class. 
*/ struct slab_cache_stats { diff --git a/src/box/space.cc b/src/box/space.cc index d9c255010411abbeb9b31634804c152533b49948..3ee39e0ca830d689371baf277ce7dc96b1b48d9a 100644 --- a/src/box/space.cc +++ b/src/box/space.cc @@ -216,7 +216,7 @@ space_free(void) mh_i32ptr_node(spaces, i)->val; space_delete(space); } - tuple_free(); + tuple_format_free(); } @@ -265,7 +265,7 @@ void space_init(void) { spaces = mh_i32ptr_new(); - tuple_init(); + tuple_format_init(); /* configure regular spaces */ space_config(); diff --git a/src/box/tuple.cc b/src/box/tuple.cc index 0daec173676555e64ebe8618fb1fc20d6ed4bd7b..ce8d5a9e98662f9b6a64d8f5ece37ace68f06f9f 100644 --- a/src/box/tuple.cc +++ b/src/box/tuple.cc @@ -203,6 +203,16 @@ tuple_init_field_map(struct tuple *tuple, struct tuple_format *format) } } +/** + * Incremented on every snapshot and is used to distinguish tuples + * which were created after start of a snapshot (these tuples can + * be freed right away, since they are not used for snapshot) or + * before start of a snapshot (these tuples can be freed only + * after the snapshot has finished, otherwise it'll write bad data + * to the snapshot file). 
+ */ +extern uint32_t snapshot_version; + /** Allocate a tuple */ struct tuple * tuple_alloc(struct tuple_format *format, size_t size) @@ -212,6 +222,7 @@ tuple_alloc(struct tuple_format *format, size_t size) struct tuple *tuple = (struct tuple *)(ptr + format->field_map_size); tuple->refs = 0; + tuple->version = snapshot_version; tuple->bsize = size; tuple->format_id = tuple_format_id(format); @@ -229,7 +240,10 @@ tuple_free(struct tuple *tuple) say_debug("tuple_free(%p)", tuple); assert(tuple->refs == 0); char *ptr = (char *) tuple - tuple_format(tuple)->field_map_size; - sfree(ptr); + if (tuple->version == snapshot_version) + sfree(ptr); + else + sfree_delayed(ptr); } /** @@ -510,13 +524,13 @@ tuple_compare_with_key(const struct tuple *tuple, const char *key, } void -tuple_init() +tuple_format_init() { tuple_format_ber = tuple_format_new(NULL, 0); } void -tuple_free() +tuple_format_free() { for (struct tuple_format **format = tuple_formats; format < tuple_formats + formats_size; diff --git a/src/box/tuple.h b/src/box/tuple.h index bd893244c2040773087d8a61e09f1f2f5e2a796e..9b663cbb01a2aa17ba3fc46fe9428bf720658dff 100644 --- a/src/box/tuple.h +++ b/src/box/tuple.h @@ -115,6 +115,8 @@ tuple_format_new(struct key_def *key_def, uint32_t key_count); */ struct tuple { + /** snapshot generation version */ + uint32_t version; /** reference counter */ uint16_t refs; /** format identifier */ @@ -370,10 +372,10 @@ tuple_to_luabuf(struct tuple *tuple, struct luaL_Buffer *b); /** Initialize tuple library */ void -tuple_init(); +tuple_format_init(); /** Cleanup tuple library */ void -tuple_free(); +tuple_format_free(); #endif /* TARANTOOL_BOX_TUPLE_H_INCLUDED */ diff --git a/src/salloc.cc b/src/salloc.cc index 12257843c6a84cbf39c715e682578dc5f4331e90..4a15de7392d72c27fe8b5a90de901e606c107122 100644 --- a/src/salloc.cc +++ b/src/salloc.cc @@ -61,14 +61,16 @@ static const size_t MAX_SLAB_ITEM = 1 << 20; size_t MAX_SLAB_ITEM_COUNT; struct slab_item { - struct slab_item *next; + 
SLIST_ENTRY(slab_item) next; }; +SLIST_HEAD(item_slist_head, slab_item); + struct slab { uint32_t magic; size_t used; size_t items; - struct slab_item *free; + struct item_slist_head free; struct slab_cache *cache; void *brk; SLIST_ENTRY(slab) link; @@ -88,7 +90,10 @@ struct slab_cache { struct arena { void *mmap_base; size_t mmap_size; - + /** How many item tuples do we have stacked for delayed free. */ + int64_t delayed_free_count; + /** How many items in the delayed free list to free at once. */ + size_t delayed_free_batch; void *base; size_t size; size_t used; @@ -96,6 +101,11 @@ struct arena { }; static uint32_t slab_active_caches; +/** + * Delayed garbage collection for items which are used + * in a forked process. + */ +static struct item_slist_head free_delayed; static struct slab_cache slab_caches[256]; static struct arena arena; @@ -126,17 +136,22 @@ slab_caches_init(size_t minimal, double factor) MAX_SLAB_ITEM_COUNT = (size_t) (SLAB_SIZE - sizeof(struct slab)) / slab_caches[0].item_size; + + SLIST_INIT(&free_delayed); } static bool arena_init(struct arena *arena, size_t size) { + arena->delayed_free_batch = 100; + arena->delayed_free_count = 0; + arena->used = 0; arena->size = size - size % SLAB_SIZE; arena->mmap_size = size - size % SLAB_SIZE + SLAB_SIZE; /* spend SLAB_SIZE bytes on align :-( */ arena->mmap_base = mmap(NULL, arena->mmap_size, - PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); if (arena->mmap_base == MAP_FAILED) { say_syserror("mmap"); return false; @@ -149,6 +164,16 @@ arena_init(struct arena *arena, size_t size) return true; } +/** + * Protect slab arena from changes. A safeguard used in a forked + * process to prevent changes to the master process arena. 
+ */ +void +salloc_protect(void) +{ + mprotect(arena.mmap_base, arena.mmap_size, PROT_READ); +} + static void * arena_alloc(struct arena *arena) { @@ -182,7 +207,6 @@ salloc_free(void) { if (arena.mmap_base != NULL) munmap(arena.mmap_base, arena.mmap_size); - memset(&arena, 0, sizeof(struct arena)); } @@ -192,7 +216,7 @@ format_slab(struct slab_cache *cache, struct slab *slab) assert(cache->item_size <= MAX_SLAB_ITEM); slab->magic = SLAB_MAGIC; - slab->free = NULL; + SLIST_INIT(&slab->free); slab->cache = cache; slab->items = 0; slab->used = 0; @@ -269,6 +293,58 @@ valid_item(struct slab *slab, void *item) } #endif +void +sfree(void *ptr) +{ + struct slab *slab = slab_header(ptr); + struct slab_cache *cache = slab->cache; + struct slab_item *item = (struct slab_item *) ptr; + + if (fully_formatted(slab) && SLIST_EMPTY(&slab->free)) + TAILQ_INSERT_TAIL(&cache->free_slabs, slab, cache_free_link); + + assert(valid_item(slab, item)); + assert(SLIST_EMPTY(&slab->free) || valid_item(slab, SLIST_FIRST(&slab->free))); + + SLIST_INSERT_HEAD(&slab->free, item, next); + slab->used -= cache->item_size + sizeof(red_zone); + slab->items -= 1; + + if (slab->items == 0) { + TAILQ_REMOVE(&cache->free_slabs, slab, cache_free_link); + TAILQ_REMOVE(&cache->slabs, slab, cache_link); + SLIST_INSERT_HEAD(&arena.free_slabs, slab, free_link); + } + + VALGRIND_FREELIKE_BLOCK(item, sizeof(red_zone)); +} + +static void +sfree_batch(void) +{ + ssize_t batch = arena.delayed_free_batch; + + while (--batch >= 0 && !SLIST_EMPTY(&free_delayed)) { + assert(arena.delayed_free_count > 0); + struct slab_item *item = SLIST_FIRST(&free_delayed); + SLIST_REMOVE_HEAD(&free_delayed, next); + arena.delayed_free_count--; + sfree(item); + } +} + +void +sfree_delayed(void *ptr) +{ + if (ptr == NULL) + return; + struct slab_item *item = (struct slab_item *)ptr; + struct slab *slab = slab_header(item); + assert(valid_item(slab, item)); + SLIST_INSERT_HEAD(&free_delayed, item, next); + arena.delayed_free_count++; 
+} + void * salloc(size_t size, const char *what) { @@ -276,6 +352,8 @@ salloc(size_t size, const char *what) struct slab *slab; struct slab_item *item; + sfree_batch(); + if ((cache = cache_for(size)) == NULL || (slab = slab_of(cache)) == NULL) { @@ -283,21 +361,20 @@ salloc(size_t size, const char *what) "slab allocator", what); } - if (slab->free == NULL) { + if (SLIST_EMPTY(&slab->free)) { assert(valid_item(slab, slab->brk)); item = (struct slab_item *) slab->brk; memcpy((char *)item + cache->item_size, red_zone, sizeof(red_zone)); slab->brk = (char *) slab->brk + cache->item_size + sizeof(red_zone); } else { - assert(valid_item(slab, slab->free)); - item = slab->free; - + item = SLIST_FIRST(&slab->free); + assert(valid_item(slab, item)); (void) VALGRIND_MAKE_MEM_DEFINED(item, sizeof(void *)); - slab->free = item->next; + SLIST_REMOVE_HEAD(&slab->free, next); (void) VALGRIND_MAKE_MEM_UNDEFINED(item, sizeof(void *)); } - if (fully_formatted(slab) && slab->free == NULL) + if (fully_formatted(slab) && SLIST_EMPTY(&slab->free)) TAILQ_REMOVE(&cache->free_slabs, slab, cache_free_link); slab->used += cache->item_size + sizeof(red_zone); @@ -307,36 +384,6 @@ salloc(size_t size, const char *what) return (void *)item; } -void -sfree(void *ptr) -{ - if (ptr == NULL) - return; - struct slab *slab = slab_header(ptr); - struct slab_cache *cache = slab->cache; - struct slab_item *item = (struct slab_item *) ptr; - - if (fully_formatted(slab) && slab->free == NULL) - TAILQ_INSERT_TAIL(&cache->free_slabs, slab, cache_free_link); - - assert(valid_item(slab, item)); - assert(slab->free == NULL || valid_item(slab, slab->free)); - - item->next = slab->free; - slab->free = item; - slab->used -= cache->item_size + sizeof(red_zone); - slab->items -= 1; - - if (slab->items == 0) { - TAILQ_REMOVE(&cache->free_slabs, slab, cache_free_link); - TAILQ_REMOVE(&cache->slabs, slab, cache_link); - SLIST_INSERT_HEAD(&arena.free_slabs, slab, free_link); - } - - VALGRIND_FREELIKE_BLOCK(item, 
sizeof(red_zone)); -} - - size_t salloc_ptr_to_index(void *ptr) { diff --git a/src/tarantool.cc b/src/tarantool.cc index 9bb883a8a67bb0a61aa3557c568d87d5542e59be..4e7777de01fe4736221107752e2b628a5ba9886a 100644 --- a/src/tarantool.cc +++ b/src/tarantool.cc @@ -84,6 +84,8 @@ struct tarantool_cfg cfg; static ev_signal *sigs = NULL; int snapshot_pid = 0; /* snapshot processes pid */ +uint32_t snapshot_version = 0; + extern const void *opt_def; static int @@ -314,12 +316,21 @@ tarantool_uptime(void) return ev_now() - start_time; } +void snapshot_exit(int code, void* arg) { + (void)arg; + fflush(NULL); + _exit(code); +} + int snapshot(void) { if (snapshot_pid) return EINPROGRESS; + /* increment snapshot version */ + snapshot_version++; + pid_t p = fork(); if (p < 0) { say_syserror("fork"); @@ -332,6 +343,8 @@ snapshot(void) return (WIFSIGNALED(status) ? EINTR : WEXITSTATUS(status)); } + salloc_protect(); + fiber_set_name(fiber, "dumper"); set_proc_title("dumper (%" PRIu32 ")", getppid()); @@ -340,6 +353,14 @@ snapshot(void) * parent stdio buffers at exit(). */ close_all_xcpt(1, sayfd); + /* + * We must avoid double destruction of tuples on exit. + * Since there is no way to remove existing handlers + * registered in the master process, and snapshot_save() + * may call exit(), push a top-level handler which will do + * _exit() for us. + */ + on_exit(snapshot_exit, NULL); snapshot_save(recovery_state, box_snapshot); exit(EXIT_SUCCESS);