flaky test test_large_snapshot (possible bug in proc_raft_snapshot_next_chunk???)
The test flaked in a strange way.
https://git.picodata.io/core/picodata/-/jobs/320838
This is the assert that fired: https://git.picodata.io/core/picodata/-/blob/3f0890e404a53af7fb76cd010777a86d8c015a36/src/traft/node.rs#L1956
FAILED test/int/test_snapshot.py::test_large_snapshot - conftest.ProcessDead: process exited unexpectedly, exit_code=-6
panicked at src/traft/node.rs:1956:17:
assertion `left == right` failed
left: 64692
right: 32346
backtrace:
...
9: picodata::traft::node::NodeImpl::fetch_chunkwise_snapshot
at /builds/core/picodata/src/traft/node.rs:1956:17
10: picodata::traft::node::NodeImpl::prepare_for_snapshot
at /builds/core/picodata/src/traft/node.rs:2081:29
11: picodata::traft::node::NodeImpl::advance
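Note that `left` is exactly twice `right` (64692 = 2 * 32346). Purely as an illustration (this is not picodata's actual code, and all names below are invented), the sketch shows the kind of invariant such an assert typically guards in a chunkwise snapshot fetch loop: one way to end up with the accumulated value being exactly double the expected one is to consume the whole chunk stream a second time without resetting the accumulator, e.g. after a retry or a leader change mid-fetch.

// Hypothetical sketch, not picodata's fetch_chunkwise_snapshot.
struct SnapshotChunk {
    rows: Vec<Vec<u8>>,
    is_final: bool,
}

fn fetch_all_chunks(
    mut next_chunk: impl FnMut() -> SnapshotChunk,
    expected_row_count: usize,
) -> Vec<Vec<u8>> {
    let mut received = Vec::new();
    loop {
        let chunk = next_chunk();
        received.extend(chunk.rows);
        if chunk.is_final {
            break;
        }
    }
    // If the stream were replayed from the beginning without clearing
    // `received`, this would fail with left == 2 * right, which is the
    // same shape as the panic above.
    assert_eq!(received.len(), expected_row_count);
    received
}

fn main() {
    // Toy driver: two chunks of three rows each, six rows expected in total.
    let mut chunks = vec![
        SnapshotChunk { rows: vec![vec![1], vec![2], vec![3]], is_final: false },
        SnapshotChunk { rows: vec![vec![4], vec![5], vec![6]], is_final: true },
    ]
    .into_iter();
    let rows = fetch_all_chunks(|| chunks.next().unwrap(), 6);
    println!("received {} rows", rows.len());
}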
Activity
- Georgy Moshkin changed title from flake test test_large_snapshot (possible bug in proc_raft_snapshot_next_chunk???) to flaky test test_large_snapshot (possible bug in proc_raft_snapshot_next_chunk???)
- Georgy Moshkin added needs-more-info label
- Dmitry Rodionov mentioned in commit 91a50d32
- Dmitry Rodionov mentioned in commit e3ff020c
- Dmitry Rodionov mentioned in commit 43b6a935
- Owner
https://git.picodata.io/core/picodata/-/jobs/353153
Details
cluster = Cluster("127.0.0.1", n=5) def test_large_snapshot(cluster: Cluster): i1, i2, i3, i4 = cluster.deploy(instance_count=4) # TODO: rewrite using clusterwide settings when we implement those script_path = f"{cluster.instance_dir}/postjoin.lua" with open(script_path, "w") as f: f.write( """\ box.cfg { memtx_memory = 4294967296 } """ ) param = dict( # Number of tuples to insert # N=4 * 1024 * 1024, N=4 * 1024 * 1024 / 32, # Average size of each tuple (approximate, doesn't include key and meta) T=512, # Max deviation from the average tuple size D=480, # Number of unique tuple sizes (resolution of sample space) R=32, ) def def_prepare_samples(i): i.eval( """\ local digest = require 'digest' local param = ... function prepare_samples() local values = {} for i = 0, param.R-1 do local l = param.T + (i * 2 / (param.R - 1) - 1) * param.D; local c = string.char(string.byte('A') + i) values[i] = string.rep(c, l) end local samples = {} samples.values = values samples.param = param samples.get = function(self, i) local R = self.param.R local idx = digest.xxhash32(tostring(i)) % R return self.values[idx] end return samples end """, param, ) def start_preparing_data(i): # NOTE: we're inserting into the global space directly and not via CaS # because we don't want this test to run for days. i.eval( """\ local fiber = require 'fiber' fiber.new(function() local log = require 'log' local math = require 'math' local json = require 'json' box.cfg { memtx_memory = 4294967296 } local samples = prepare_samples() log.info(("inserting tuples into space 'BIG' (param: %s)..."):format(json.encode(samples.param))) local N = samples.param.N local K = math.floor(N / 20); local t0 = fiber.clock() local S = 0 local SDt = 0 box.begin() for i = 0, N-1 do if math.fmod(i+1, K) == 0 then box.commit() log.info(('insertion progress: %d%%'):format( 100 * i / (N-1) )) fiber.yield() box.begin() end local value = samples:get(i) local t = box.space.BIG:put{ i, value } local s = t:bsize() S = S + s SDt = SDt + s * s / N end box.commit() local E = S / N SD = math.sqrt(SDt - E * E) log.info(("done inserting data into space 'BIG', (%f secs, S: %.02fMB, E: %.02fB, SD: %.02fB)") :format(fiber.clock() - t0, S / 1024 / 1024, E, SD)) end) """ # noqa: E501 ) def wait_data_prepared(i, timeout=20): def inner(): assert i.call("box.space.BIG:count") == param["N"] Retriable(timeout, 5).call(inner) cluster.cas("replace", "_pico_property", ["snapshot_chunk_max_size", 1024 * 1024]) cluster.create_table( dict( name="BIG", format=[ dict(name="id", type="unsigned", is_nullable=False), dict(name="value", type="string", is_nullable=False), ], primary_key=["id"], distribution="global", ) ) # This one will be receiving a snapshot i4.terminate() for i in [i1, i2, i3]: def_prepare_samples(i) start_preparing_data(i) for i in [i1, i2, i3]: wait_data_prepared(i) index = cluster.cas("insert", "_pico_property", ["big", "data"]) cluster.raft_wait_index(index) i1.raft_compact_log() i2.raft_compact_log() i3.raft_compact_log() # First i1 is leader and i4 starts reading snapshot from it. i1.promote_or_fail() t_i4 = time.time() # Restart the instance triggering the chunky snapshot application. 
i4.env["PICODATA_SCRIPT"] = script_path i4.start() # In the middle of snapshot application propose a new entry index = cluster.cas("insert", "_pico_property", ["pokemon", "snap"]) for i in [i1, i2, i3]: i.raft_wait_index(index) # Add a new instance so that it starts reading the same snapshot t_i5 = time.time() i5 = cluster.add_instance(wait_online=False) i5.env["PICODATA_SCRIPT"] = script_path i5.start() i1.raft_compact_log() i2.raft_compact_log() i3.raft_compact_log() # At some point i2 becomes leader but i4 keeps reading snapshot from i1. i2.promote_or_fail() > i4.wait_online(timeout=30 - (time.time() - t_i4)) cluster = Cluster("127.0.0.1", n=5) def_prepare_samples = <function test_large_snapshot.<locals>.def_prepare_samples at 0x70977075d940> f = <_io.TextIOWrapper name='/tmp/pytest-of-root/pytest-0/popen-gw4/test_large_snapshot0/postjoin.lua' mode='w' encoding='UTF-8'> i = Instance(i3, listen=127.0.0.1:4976 cluster=cluster-4-24, process.pid=56132) i1 = Instance(i1, listen=127.0.0.1:4972 cluster=cluster-4-24, process.pid=56126) i2 = Instance(i2, listen=127.0.0.1:4974 cluster=cluster-4-24, process.pid=56129) i3 = Instance(i3, listen=127.0.0.1:4976 cluster=cluster-4-24, process.pid=56132) i4 = Instance(i4, listen=127.0.0.1:4978 cluster=cluster-4-24, process.pid=56652) i5 = Instance(i5, listen=127.0.0.1:4980 cluster=cluster-4-24, process.pid=56853) index = 47 param = {'D': 480, 'N': 131072.0, 'R': 32, 'T': 512} script_path = '/tmp/pytest-of-root/pytest-0/popen-gw4/test_large_snapshot0/postjoin.lua' start_preparing_data = <function test_large_snapshot.<locals>.start_preparing_data at 0x70977075e160> t_i4 = 1737433666.44146 t_i5 = 1737433670.671142 wait_data_prepared = <function test_large_snapshot.<locals>.wait_data_prepared at 0x70977075dd00> test/int/test_snapshot.py:231: _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ test/conftest.py:1402: in wait_online raise e from e deadline = 6458720.499995327 expected_incarnation = None fetch_current_state = <function Instance.wait_online.<locals>.fetch_current_state at 0x70977075c720> incarnation = 1 last_state = ('Offline', 1) next_retry = 6458720.814044109 now = 6458720.416948704 rps = 5 self = Instance(i4, listen=127.0.0.1:4978 cluster=cluster-4-24, process.pid=56652) start = 6458694.715525236 state = ('Offline', 1) timeout = 25.759080410003662 variant = 'Offline' _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ self = Instance(i4, listen=127.0.0.1:4978 cluster=cluster-4-24, process.pid=56652) timeout = 25.759080410003662, rps = 5, expected_incarnation = None def wait_online( self, timeout: int | float = 30, rps: int | float = 5, expected_incarnation=None ): """Wait until instance attains Online grade. This function will periodically check the current instance's grade and reset the timeout each time the grade changes. 
Args: timeout (int | float, default=6): time limit since last grade change rps (int | float, default=5): retries per second Raises: AssertionError: if doesn't succeed """ if self.process is None: raise ProcessDead("process was not started") def fetch_current_state() -> Tuple[str, int]: myself = self.instance_info() assert isinstance(myself["current_state"], dict) return ( myself["current_state"]["variant"], myself["current_state"]["incarnation"], ) start = time.monotonic() deadline = start + timeout next_retry = start last_state = None while True: now = time.monotonic() assert now < deadline, "timeout" # Throttling if now < next_retry: time.sleep(next_retry - now) next_retry = time.monotonic() + 1 / rps try: # Fetch state state = fetch_current_state() if state != last_state: last_state = state deadline = time.monotonic() + timeout # Check state variant, incarnation = state > assert variant == "Online" E assert == failed. [pytest-clarity diff shown] E #x1B[0m E #x1B[0m#x1B[32mLHS#x1B[0m vs #x1B[31mRHS#x1B[0m shown below E #x1B[0m E #x1B[0m#x1B[32mO#x1B[0m#x1B[37;42mff#x1B[0m#x1B[32mline#x1B[0m E #x1B[0m#x1B[31mO#x1B[0m#x1B[37;41mn#x1B[0m#x1B[31mline#x1B[0m E #x1B[0m deadline = 6458720.499995327 expected_incarnation = None fetch_current_state = <function Instance.wait_online.<locals>.fetch_current_state at 0x70977075c720> incarnation = 1 last_state = ('Offline', 1) next_retry = 6458720.814044109 now = 6458720.416948704 rps = 5 self = Instance(i4, listen=127.0.0.1:4978 cluster=cluster-4-24, process.pid=56652) start = 6458694.715525236 state = ('Offline', 1) timeout = 25.759080410003662 variant = 'Offline' test/conftest.py:1380: AssertionError
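For scale, a back-of-the-envelope estimate based on the test's param dict and the snapshot_chunk_max_size it sets (values copied from the source above; keys and tuple metadata are ignored, so this is a lower bound on the number of chunks):

// Rough estimate of the snapshot volume generated by the test above.
fn main() {
    let n: u64 = 4 * 1024 * 1024 / 32; // 131072 tuples inserted into space 'BIG'
    let t: u64 = 512; // average value size in bytes (param T)
    let chunk_max: u64 = 1024 * 1024; // snapshot_chunk_max_size set by the test

    let payload = n * t; // 67108864 bytes = 64 MiB
    let min_chunks = payload.div_ceil(chunk_max); // at least 64 chunks
    println!("payload = {} MiB, chunks >= {}", payload / (1024 * 1024), min_chunks);
}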
Edited by Georgy Moshkin
- Dmitry Rodionov added bug label
- Owner
@gmoshkin there have been several fixes in this area since the ticket was opened; am I right that this should already be fixed on current master?
- Dmitry Rodionov assigned to @gmoshkin
- Dmitry Rodionov removed needs-more-info label
- Dmitry Rodionov added domain/snapshot label
- Dmitry Rodionov changed milestone to %25.2 - LTS
- Dmitry Rodionov unassigned @gmoshkin
- Dmitry Rodionov added triaged label
- Dmitry Rodionov removed triaged label