flaky test test_large_snapshot (possible bug in proc_raft_snapshot_next_chunk???)
The test flaked in a strange way.
https://git.picodata.io/core/picodata/-/jobs/320838
This is the assert that fired: https://git.picodata.io/core/picodata/-/blob/3f0890e404a53af7fb76cd010777a86d8c015a36/src/traft/node.rs#L1956
FAILED test/int/test_snapshot.py::test_large_snapshot - conftest.ProcessDead: process exited unexpectedly, exit_code=-6
panicked at src/traft/node.rs:1956:17:
assertion `left == right` failed
left: 64692
right: 32346
backtrace:
...
9: picodata::traft::node::NodeImpl::fetch_chunkwise_snapshot
at /builds/core/picodata/src/traft/node.rs:1956:17
10: picodata::traft::node::NodeImpl::prepare_for_snapshot
at /builds/core/picodata/src/traft/node.rs:2081:29
11: picodata::traft::node::NodeImpl::advance
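Note that `left` is exactly twice `right` (64692 = 2 * 32346). Purely as an illustration (this is not picodata's actual code, and all names below are invented), the sketch shows the kind of invariant such an assert typically guards in a chunkwise snapshot fetch loop: one way to end up with the accumulated value being exactly double the expected one is to consume the whole chunk stream a second time without resetting the accumulator, e.g. after a retry or a leader change mid-fetch.

// Hypothetical sketch, not picodata's fetch_chunkwise_snapshot.
struct SnapshotChunk {
    rows: Vec<Vec<u8>>,
    is_final: bool,
}

fn fetch_all_chunks(
    mut next_chunk: impl FnMut() -> SnapshotChunk,
    expected_row_count: usize,
) -> Vec<Vec<u8>> {
    let mut received = Vec::new();
    loop {
        let chunk = next_chunk();
        received.extend(chunk.rows);
        if chunk.is_final {
            break;
        }
    }
    // If the stream were replayed from the beginning without clearing
    // `received`, this would fail with left == 2 * right, which is the
    // same shape as the panic above.
    assert_eq!(received.len(), expected_row_count);
    received
}

fn main() {
    // Toy driver: two chunks of three rows each, six rows expected in total.
    let mut chunks = vec![
        SnapshotChunk { rows: vec![vec![1], vec![2], vec![3]], is_final: false },
        SnapshotChunk { rows: vec![vec![4], vec![5], vec![6]], is_final: true },
    ]
    .into_iter();
    let rows = fetch_all_chunks(|| chunks.next().unwrap(), 6);
    println!("received {} rows", rows.len());
}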
Activity
- Georgy Moshkin changed title from flake test test_large_snapshot (possible bug in proc_raft_snapshot_next_chunk???) to flaky test test_large_snapshot (possible bug in proc_raft_snapshot_next_chunk???)
- Georgy Moshkin added needs-more-info label
- Dmitry Rodionov mentioned in commit 91a50d32
- Dmitry Rodionov mentioned in commit e3ff020c
- Dmitry Rodionov mentioned in commit 43b6a935
- Owner
https://git.picodata.io/core/picodata/-/jobs/353153
Details
cluster = Cluster("127.0.0.1", n=5) def test_large_snapshot(cluster: Cluster): i1, i2, i3, i4 = cluster.deploy(instance_count=4) # TODO: rewrite using clusterwide settings when we implement those script_path = f"{cluster.instance_dir}/postjoin.lua" with open(script_path, "w") as f: f.write( """\ box.cfg { memtx_memory = 4294967296 } """ ) param = dict( # Number of tuples to insert # N=4 * 1024 * 1024, N=4 * 1024 * 1024 / 32, # Average size of each tuple (approximate, doesn't include key and meta) T=512, # Max deviation from the average tuple size D=480, # Number of unique tuple sizes (resolution of sample space) R=32, ) def def_prepare_samples(i): i.eval( """\ local digest = require 'digest' local param = ... function prepare_samples() local values = {} for i = 0, param.R-1 do local l = param.T + (i * 2 / (param.R - 1) - 1) * param.D; local c = string.char(string.byte('A') + i) values[i] = string.rep(c, l) end local samples = {} samples.values = values samples.param = param samples.get = function(self, i) local R = self.param.R local idx = digest.xxhash32(tostring(i)) % R return self.values[idx] end return samples end """, param, ) def start_preparing_data(i): # NOTE: we're inserting into the global space directly and not via CaS # because we don't want this test to run for days. i.eval( """\ local fiber = require 'fiber' fiber.new(function() local log = require 'log' local math = require 'math' local json = require 'json' box.cfg { memtx_memory = 4294967296 } local samples = prepare_samples() log.info(("inserting tuples into space 'BIG' (param: %s)..."):format(json.encode(samples.param))) local N = samples.param.N local K = math.floor(N / 20); local t0 = fiber.clock() local S = 0 local SDt = 0 box.begin() for i = 0, N-1 do if math.fmod(i+1, K) == 0 then box.commit() log.info(('insertion progress: %d%%'):format( 100 * i / (N-1) )) fiber.yield() box.begin() end local value = samples:get(i) local t = box.space.BIG:put{ i, value } local s = t:bsize() S = S + s SDt = SDt + s * s / N end box.commit() local E = S / N SD = math.sqrt(SDt - E * E) log.info(("done inserting data into space 'BIG', (%f secs, S: %.02fMB, E: %.02fB, SD: %.02fB)") :format(fiber.clock() - t0, S / 1024 / 1024, E, SD)) end) """ # noqa: E501 ) def wait_data_prepared(i, timeout=20): def inner(): assert i.call("box.space.BIG:count") == param["N"] Retriable(timeout, 5).call(inner) cluster.cas("replace", "_pico_property", ["snapshot_chunk_max_size", 1024 * 1024]) cluster.create_table( dict( name="BIG", format=[ dict(name="id", type="unsigned", is_nullable=False), dict(name="value", type="string", is_nullable=False), ], primary_key=["id"], distribution="global", ) ) # This one will be receiving a snapshot i4.terminate() for i in [i1, i2, i3]: def_prepare_samples(i) start_preparing_data(i) for i in [i1, i2, i3]: wait_data_prepared(i) index = cluster.cas("insert", "_pico_property", ["big", "data"]) cluster.raft_wait_index(index) i1.raft_compact_log() i2.raft_compact_log() i3.raft_compact_log() # First i1 is leader and i4 starts reading snapshot from it. i1.promote_or_fail() t_i4 = time.time() # Restart the instance triggering the chunky snapshot application. 
i4.env["PICODATA_SCRIPT"] = script_path i4.start() # In the middle of snapshot application propose a new entry index = cluster.cas("insert", "_pico_property", ["pokemon", "snap"]) for i in [i1, i2, i3]: i.raft_wait_index(index) # Add a new instance so that it starts reading the same snapshot t_i5 = time.time() i5 = cluster.add_instance(wait_online=False) i5.env["PICODATA_SCRIPT"] = script_path i5.start() i1.raft_compact_log() i2.raft_compact_log() i3.raft_compact_log() # At some point i2 becomes leader but i4 keeps reading snapshot from i1. i2.promote_or_fail() > i4.wait_online(timeout=30 - (time.time() - t_i4)) cluster = Cluster("127.0.0.1", n=5) def_prepare_samples = <function test_large_snapshot.<locals>.def_prepare_samples at 0x70977075d940> f = <_io.TextIOWrapper name='/tmp/pytest-of-root/pytest-0/popen-gw4/test_large_snapshot0/postjoin.lua' mode='w' encoding='UTF-8'> i = Instance(i3, listen=127.0.0.1:4976 cluster=cluster-4-24, process.pid=56132) i1 = Instance(i1, listen=127.0.0.1:4972 cluster=cluster-4-24, process.pid=56126) i2 = Instance(i2, listen=127.0.0.1:4974 cluster=cluster-4-24, process.pid=56129) i3 = Instance(i3, listen=127.0.0.1:4976 cluster=cluster-4-24, process.pid=56132) i4 = Instance(i4, listen=127.0.0.1:4978 cluster=cluster-4-24, process.pid=56652) i5 = Instance(i5, listen=127.0.0.1:4980 cluster=cluster-4-24, process.pid=56853) index = 47 param = {'D': 480, 'N': 131072.0, 'R': 32, 'T': 512} script_path = '/tmp/pytest-of-root/pytest-0/popen-gw4/test_large_snapshot0/postjoin.lua' start_preparing_data = <function test_large_snapshot.<locals>.start_preparing_data at 0x70977075e160> t_i4 = 1737433666.44146 t_i5 = 1737433670.671142 wait_data_prepared = <function test_large_snapshot.<locals>.wait_data_prepared at 0x70977075dd00> test/int/test_snapshot.py:231: _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ test/conftest.py:1402: in wait_online raise e from e deadline = 6458720.499995327 expected_incarnation = None fetch_current_state = <function Instance.wait_online.<locals>.fetch_current_state at 0x70977075c720> incarnation = 1 last_state = ('Offline', 1) next_retry = 6458720.814044109 now = 6458720.416948704 rps = 5 self = Instance(i4, listen=127.0.0.1:4978 cluster=cluster-4-24, process.pid=56652) start = 6458694.715525236 state = ('Offline', 1) timeout = 25.759080410003662 variant = 'Offline' _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ self = Instance(i4, listen=127.0.0.1:4978 cluster=cluster-4-24, process.pid=56652) timeout = 25.759080410003662, rps = 5, expected_incarnation = None def wait_online( self, timeout: int | float = 30, rps: int | float = 5, expected_incarnation=None ): """Wait until instance attains Online grade. This function will periodically check the current instance's grade and reset the timeout each time the grade changes. 
Args: timeout (int | float, default=6): time limit since last grade change rps (int | float, default=5): retries per second Raises: AssertionError: if doesn't succeed """ if self.process is None: raise ProcessDead("process was not started") def fetch_current_state() -> Tuple[str, int]: myself = self.instance_info() assert isinstance(myself["current_state"], dict) return ( myself["current_state"]["variant"], myself["current_state"]["incarnation"], ) start = time.monotonic() deadline = start + timeout next_retry = start last_state = None while True: now = time.monotonic() assert now < deadline, "timeout" # Throttling if now < next_retry: time.sleep(next_retry - now) next_retry = time.monotonic() + 1 / rps try: # Fetch state state = fetch_current_state() if state != last_state: last_state = state deadline = time.monotonic() + timeout # Check state variant, incarnation = state > assert variant == "Online" E assert == failed. [pytest-clarity diff shown] E #x1B[0m E #x1B[0m#x1B[32mLHS#x1B[0m vs #x1B[31mRHS#x1B[0m shown below E #x1B[0m E #x1B[0m#x1B[32mO#x1B[0m#x1B[37;42mff#x1B[0m#x1B[32mline#x1B[0m E #x1B[0m#x1B[31mO#x1B[0m#x1B[37;41mn#x1B[0m#x1B[31mline#x1B[0m E #x1B[0m deadline = 6458720.499995327 expected_incarnation = None fetch_current_state = <function Instance.wait_online.<locals>.fetch_current_state at 0x70977075c720> incarnation = 1 last_state = ('Offline', 1) next_retry = 6458720.814044109 now = 6458720.416948704 rps = 5 self = Instance(i4, listen=127.0.0.1:4978 cluster=cluster-4-24, process.pid=56652) start = 6458694.715525236 state = ('Offline', 1) timeout = 25.759080410003662 variant = 'Offline' test/conftest.py:1380: AssertionError
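For scale, a back-of-the-envelope estimate based on the test's param dict and the snapshot_chunk_max_size it sets (values copied from the source above; keys and tuple metadata are ignored, so this is a lower bound on the number of chunks):

// Rough estimate of the snapshot volume generated by the test above.
fn main() {
    let n: u64 = 4 * 1024 * 1024 / 32; // 131072 tuples inserted into space 'BIG'
    let t: u64 = 512; // average value size in bytes (param T)
    let chunk_max: u64 = 1024 * 1024; // snapshot_chunk_max_size set by the test

    let payload = n * t; // 67108864 bytes = 64 MiB
    let min_chunks = payload.div_ceil(chunk_max); // at least 64 chunks
    println!("payload = {} MiB, chunks >= {}", payload / (1024 * 1024), min_chunks);
}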
Edited by Georgy Moshkin
- Dmitry Rodionov added bug label
- Owner
@gmoshkin there have been several fixes in this area since the ticket was opened; am I right that this should already be fixed on current master?
- Dmitry Rodionov assigned to @gmoshkin
- Dmitry Rodionov removed needs-more-info label
- Dmitry Rodionov added domain/snapshot label
- Dmitry Rodionov changed milestone to %25.2 - LTS
- Dmitry Rodionov unassigned @gmoshkin
- Dmitry Rodionov added triaged label
- Dmitry Rodionov removed triaged label