FAILED test/int/test_snapshot.py::test_large_snapshot - conftest.ProcessDead: process exited unexpectedly, exit_code=1
Full log:
_____________________________ test_large_snapshot ______________________________
[gw8] linux -- Python 3.10.12 /builds/core/picodata/.venv/bin/python

cluster = Cluster("127.0.0.1", n=5)

    def test_large_snapshot(cluster: Cluster):
        i1, i2, i3, i4 = cluster.deploy(instance_count=4)

        # TODO: rewrite using clusterwide settings when we implement those
        script_path = f"{cluster.data_dir}/postjoin.lua"
        with open(script_path, "w") as f:
            f.write(
                """\
                box.cfg { memtx_memory = 4294967296 }
                """
            )

        param = dict(
            # Number of tuples to insert
            # N=4 * 1024 * 1024,
            N=4 * 1024 * 1024 / 32,
            # Average size of each tuple (approximate, doesn't include key and meta)
            T=512,
            # Max deviation from the average tuple size
            D=480,
            # Number of unique tuple sizes (resolution of sample space)
            R=32,
        )

        def def_prepare_samples(i):
            i.eval(
                """\
                local digest = require 'digest'
                local param = ...
                function prepare_samples()
                    local values = {}
                    for i = 0, param.R-1 do
                        local l = param.T + (i * 2 / (param.R - 1) - 1) * param.D;
                        local c = string.char(string.byte('A') + i)
                        values[i] = string.rep(c, l)
                    end
                    local samples = {}
                    samples.values = values
                    samples.param = param
                    samples.get = function(self, i)
                        local R = self.param.R
                        local idx = digest.xxhash32(tostring(i)) % R
                        return self.values[idx]
                    end
                    return samples
                end
                """,
                param,
            )

        def start_preparing_data(i):
            # NOTE: we're inserting into the global space directly and not via CaS
            # because we don't want this test to run for days.
            i.eval(
                """\
                local fiber = require 'fiber'
                fiber.new(function()
                    local log = require 'log'
                    local math = require 'math'
                    local json = require 'json'
                    box.cfg { memtx_memory = 4294967296 }
                    local samples = prepare_samples()
                    log.info(("inserting tuples into space 'BIG' (param: %s)..."):format(json.encode(samples.param)))
                    local N = samples.param.N
                    local K = math.floor(N / 20);
                    local t0 = fiber.clock()
                    local S = 0
                    local SDt = 0
                    box.begin()
                    for i = 0, N-1 do
                        if math.fmod(i+1, K) == 0 then
                            box.commit()
                            log.info(('insertion progress: %d%%'):format( 100 * i / (N-1) ))
                            fiber.yield()
                            box.begin()
                        end
                        local value = samples:get(i)
                        local t = box.space.BIG:put{ i, value }
                        local s = t:bsize()
                        S = S + s
                        SDt = SDt + s * s / N
                    end
                    box.commit()
                    local E = S / N
                    SD = math.sqrt(SDt - E * E)
                    log.info(("done inserting data into space 'BIG', (%f secs, S: %.02fMB, E: %.02fB, SD: %.02fB)")
                        :format(fiber.clock() - t0, S / 1024 / 1024, E, SD))
                end)
                """  # noqa: E501
            )

        def wait_data_prepared(i, timeout=20):
            def inner():
                assert i.call("box.space.BIG:count") == param["N"]

            Retriable(timeout, 5).call(inner)

        cluster.cas("replace", "_pico_property", ["snapshot_chunk_max_size", 1024 * 1024])
        cluster.create_table(
            dict(
                name="BIG",
                format=[
                    dict(name="id", type="unsigned", is_nullable=False),
                    dict(name="value", type="string", is_nullable=False),
                ],
                primary_key=["id"],
                distribution="global",
            )
        )

        # This one will be receiving a snapshot
        i4.terminate()

        for i in [i1, i2, i3]:
            def_prepare_samples(i)
            start_preparing_data(i)

        for i in [i1, i2, i3]:
            wait_data_prepared(i)

        index = cluster.cas("insert", "_pico_property", ["big", "data"])
        cluster.raft_wait_index(index)

        i1.raft_compact_log()
        i2.raft_compact_log()
        i3.raft_compact_log()

        # First i1 is leader and i4 starts reading snapshot from it.
        i1.promote_or_fail()

        t_i4 = time.time()
        # Restart the instance triggering the chunky snapshot application.
        i4.env["PICODATA_SCRIPT"] = script_path
        i4.start()

        # Wait for i4 to start receiving the snapshot
        Retriable(10, 60).call(
            lambda: assert_eq(
                i4.call(".proc_runtime_info")["internal"]["main_loop_status"],
                "receiving snapshot",
            )
        )

        # In the middle of snapshot application propose a new entry
        index = cluster.cas("insert", "_pico_property", ["pokemon", "snap"])
        for i in [i1, i2, i3]:
            i.raft_wait_index(index)

        # Add a new instance so that it starts reading the same snapshot
        t_i5 = time.time()
        i5 = cluster.add_instance(wait_online=False)
        i5.env["PICODATA_SCRIPT"] = script_path
        i5.start()

        # Wait for i5 to start receiving the snapshot
        Retriable(10, 60).call(
            lambda: assert_eq(
                i5.call(".proc_runtime_info")["internal"]["main_loop_status"],
                "receiving snapshot",
            )
        )

        i1.raft_compact_log()
        i2.raft_compact_log()
        i3.raft_compact_log()

        # At some point i2 becomes leader but i4 keeps reading snapshot from i1.
        i2.promote_or_fail()

        i4.wait_online(timeout=30 - (time.time() - t_i4))
        print(f"i4 catching up by snapshot took: {time.time() - t_i4}s")

>       i5.wait_online(timeout=20 - (time.time() - t_i5))

cluster = Cluster("127.0.0.1", n=5)
def_prepare_samples = <function test_large_snapshot.<locals>.def_prepare_samples at 0x746ed7169ab0>
f = <_io.TextIOWrapper name='/tmp/pytest-of-root/pytest-0/popen-gw8/test_large_snapshot0/postjoin.lua' mode='w' encoding='UTF-8'>
i = Instance(i3, listen=127.0.0.1:4932 cluster=cluster-8-10, process.pid=58529)
i1 = Instance(i1, listen=127.0.0.1:4930 cluster=cluster-8-10, process.pid=58515)
i2 = Instance(i2, listen=127.0.0.1:4931 cluster=cluster-8-10, process.pid=58523)
i3 = Instance(i3, listen=127.0.0.1:4932 cluster=cluster-8-10, process.pid=58529)
i4 = Instance(i4, listen=127.0.0.1:4933 cluster=cluster-8-10, process.pid=60508)
i5 = Instance(i5, listen=127.0.0.1:4934 cluster=cluster-8-10, process.pid=61408)
index = 44
param = {'D': 480, 'N': 131072.0, 'R': 32, 'T': 512}
script_path = '/tmp/pytest-of-root/pytest-0/popen-gw8/test_large_snapshot0/postjoin.lua'
start_preparing_data = <function test_large_snapshot.<locals>.start_preparing_data at 0x746ed716ba30>
t_i4 = 1732144384.6783023
t_i5 = 1732144386.2193658
wait_data_prepared = <function test_large_snapshot.<locals>.wait_data_prepared at 0x746ed7169900>

test/int/test_snapshot.py:250:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
test/conftest.py:1359: in wait_online
    raise e from e
        deadline = 1169430.2624890406
        expected_incarnation = None
        fetch_current_state = <function Instance.wait_online.<locals>.fetch_current_state at 0x746ed62ae440>
        incarnation = 0
        last_state = ('Offline', 0)
        next_retry = 1169411.614334289
        now = 1169411.225011645
        rps = 5
        self = Instance(i5, listen=127.0.0.1:4934 cluster=cluster-8-10, process.pid=61408)
        start = 1169411.213780635
        state = ('Offline', 0)
        timeout = 19.038729667663574
        variant = 'Offline'
test/conftest.py:1344: in wait_online
    state = fetch_current_state()
        deadline = 1169430.2624890406
        expected_incarnation = None
        fetch_current_state = <function Instance.wait_online.<locals>.fetch_current_state at 0x746ed62ae440>
        incarnation = 0
        last_state = ('Offline', 0)
        next_retry = 1169411.614334289
        now = 1169411.225011645
        rps = 5
        self = Instance(i5, listen=127.0.0.1:4934 cluster=cluster-8-10, process.pid=61408)
        start = 1169411.213780635
        state = ('Offline', 0)
        timeout = 19.038729667663574
        variant = 'Offline'
test/conftest.py:1321: in fetch_current_state
    myself = self.instance_info()
        self = Instance(i5, listen=127.0.0.1:4934 cluster=cluster-8-10, process.pid=61408)
test/conftest.py:1287: in instance_info
    info = self.call(".proc_instance_info", timeout=timeout)
        self = Instance(i5, listen=127.0.0.1:4934 cluster=cluster-8-10, process.pid=61408)
        timeout = 10
test/conftest.py:715: in call
    self.check_process_alive()
        args = ()
        fn = '.proc_instance_info'
        password = None
        self = Instance(i5, listen=127.0.0.1:4934 cluster=cluster-8-10, process.pid=61408)
        timeout = 10
        user = None
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

self = Instance(i5, listen=127.0.0.1:4934 cluster=cluster-8-10, process.pid=61408)

    def check_process_alive(self):
        if self.process is None:
            raise ProcessDead("process was not started")

        try:
            # Note: The process may have crashed due to the RPC, but there may
            # be a race between when the python connector receives the
            # connection reset error and when the OS will finish cleaning up
            # the process (especially considering we have a supervisor
            # process). So we introduce a tiny timeout here (which may still not
            # be enough in every case).
            exit_code = self.process.wait(timeout=0.1)  # type: ignore
        except subprocess.TimeoutExpired:
            # it's fine, the process is still running
            pass
        else:
>           raise ProcessDead(f"process exited unexpectedly, {exit_code=}")
E           conftest.ProcessDead: process exited unexpectedly, exit_code=1

exit_code = 1
self = Instance(i5, listen=127.0.0.1:4934 cluster=cluster-8-10, process.pid=61408)

test/conftest.py:1272: ProcessDead
----------------------------- Captured stdout call -----------------------------