From 660c355f95eee9895701bc8f79051d96b216e0ad Mon Sep 17 00:00:00 2001 From: Vladimir Davydov <vdavydov@tarantool.org> Date: Wed, 15 May 2024 15:17:31 +0300 Subject: [PATCH] vinyl: fix use-after-free of LSM tree in scheduler Between picking an LSM tree from a heap and taking a reference to it in vy_task_new() there are a few places where the scheduler may yield: - in vy_worker_pool_get() to start a worker pool; - in vy_task_dump_new() to wait for a memory tree to be unpinned; - in vy_task_compaction_new() to commit an entry to the metadata log after splitting or coalescing a range. If a concurrent fiber drops and deletes the LSM tree in the meanwhile, the scheduler will crash. To avoid that, let's take a reference to the LSM tree. It's quite difficult to write a functional test for it without a bunch of ugly error injections so we rely on fuzzing tests. Closes #9995 NO_DOC=bug fix NO_TEST=fuzzing (cherry picked from commit 1c4605bb9f3741a4e96c388605e6e2196cd61979) --- .../gh-9995-vy-scheduler-crash-fix.md | 4 ++++ src/box/vy_scheduler.c | 22 +++++++++++++++++-- 2 files changed, 24 insertions(+), 2 deletions(-) create mode 100644 changelogs/unreleased/gh-9995-vy-scheduler-crash-fix.md diff --git a/changelogs/unreleased/gh-9995-vy-scheduler-crash-fix.md b/changelogs/unreleased/gh-9995-vy-scheduler-crash-fix.md new file mode 100644 index 0000000000..f8ffbff556 --- /dev/null +++ b/changelogs/unreleased/gh-9995-vy-scheduler-crash-fix.md @@ -0,0 +1,4 @@ +## bugfix/vinyl + +* Fixed a use-after-free bug in the compaction scheduler triggered by a race + with a concurrent DDL operation (gh-9995). diff --git a/src/box/vy_scheduler.c b/src/box/vy_scheduler.c index daddff071c..ea9b438bdb 100644 --- a/src/box/vy_scheduler.c +++ b/src/box/vy_scheduler.c @@ -1839,6 +1839,11 @@ vy_scheduler_peek_dump(struct vy_scheduler *scheduler, struct vy_task **ptask) } if (!lsm->is_dumping && lsm->pin_count == 0 && vy_lsm_generation(lsm) == scheduler->dump_generation) { + /* + * The code below may yield. Take a reference to the LSM tree + * to make sure it isn't deleted in the meantime. + */ + vy_lsm_ref(lsm); /* * Dump is in progress and there is an LSM tree that * contains data that must be dumped at the current @@ -1846,13 +1851,17 @@ vy_scheduler_peek_dump(struct vy_scheduler *scheduler, struct vy_task **ptask) */ if (worker == NULL) { worker = vy_worker_pool_get(&scheduler->dump_pool); - if (worker == NULL) + if (worker == NULL) { + vy_lsm_unref(lsm); return 0; /* all workers are busy */ + } } if (vy_task_dump_new(scheduler, worker, lsm, ptask) != 0) { vy_worker_pool_put(worker); + vy_lsm_unref(lsm); return -1; } + vy_lsm_unref(lsm); if (*ptask != NULL) return 0; /* new task */ /* @@ -1909,15 +1918,24 @@ vy_scheduler_peek_compaction(struct vy_scheduler *scheduler, goto no_task; /* nothing to do */ if (vy_lsm_compaction_priority(lsm) <= 1) goto no_task; /* nothing to do */ + /* + * The code below may yield. Take a reference to the LSM tree + * to make sure it isn't deleted in the meantime. + */ + vy_lsm_ref(lsm); if (worker == NULL) { worker = vy_worker_pool_get(&scheduler->compaction_pool); - if (worker == NULL) + if (worker == NULL) { + vy_lsm_unref(lsm); return 0; /* all workers are busy */ + } } if (vy_task_compaction_new(scheduler, worker, lsm, ptask) != 0) { vy_worker_pool_put(worker); + vy_lsm_unref(lsm); return -1; } + vy_lsm_unref(lsm); if (*ptask == NULL) goto retry; /* LSM tree dropped or range split/coalesced */ return 0; /* new task */ -- GitLab