From 957a145114f33451dba56ca2cc859cb0e0b1f778 Mon Sep 17 00:00:00 2001 From: Justin Frevert Date: Fri, 5 Jun 2026 13:27:00 -0700 Subject: [PATCH 01/13] Add ledger arena for warp sync Signed-off-by: Justin Frevert --- Cargo.lock | 3 + Cargo.toml | 1 + ledger/src/lib.rs | 103 ++++++ ledger/src/versions/common/mod.rs | 77 +++++ node/Cargo.toml | 4 + node/src/lib.rs | 1 + node/src/service.rs | 87 +++++- node/src/warp_ledger_sync/block_import.rs | 80 +++++ node/src/warp_ledger_sync/client.rs | 181 +++++++++++ .../src/warp_ledger_sync/integration_tests.rs | 83 +++++ node/src/warp_ledger_sync/mod.rs | 85 +++++ node/src/warp_ledger_sync/monitor.rs | 120 +++++++ node/src/warp_ledger_sync/oracle.rs | 147 +++++++++ node/src/warp_ledger_sync/protocol.rs | 293 ++++++++++++++++++ node/src/warp_ledger_sync/server.rs | 170 ++++++++++ 15 files changed, 1432 insertions(+), 3 deletions(-) create mode 100644 node/src/warp_ledger_sync/block_import.rs create mode 100644 node/src/warp_ledger_sync/client.rs create mode 100644 node/src/warp_ledger_sync/integration_tests.rs create mode 100644 node/src/warp_ledger_sync/mod.rs create mode 100644 node/src/warp_ledger_sync/monitor.rs create mode 100644 node/src/warp_ledger_sync/oracle.rs create mode 100644 node/src/warp_ledger_sync/protocol.rs create mode 100644 node/src/warp_ledger_sync/server.rs diff --git a/Cargo.lock b/Cargo.lock index a87d3c84f..da8a6ba6f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7761,6 +7761,7 @@ dependencies = [ name = "midnight-node" version = "2.0.0" dependencies = [ + "async-channel 1.9.0", "async-trait", "authority-selection-inherents", "blake2b_simd", @@ -7828,6 +7829,7 @@ dependencies = [ "sc-executor", "sc-keystore", "sc-network", + "sc-network-sync", "sc-offchain", "sc-partner-chains-consensus-aura", "sc-rpc", @@ -7851,6 +7853,7 @@ dependencies = [ "sp-api", "sp-block-builder", "sp-blockchain", + "sp-consensus", "sp-consensus-aura", "sp-consensus-beefy", "sp-consensus-grandpa", diff --git a/Cargo.toml 
b/Cargo.toml index dfe459d54..36bd40ba4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -234,6 +234,7 @@ sp-runtime-interface = { default-features = false, git = "https://github.com/par sc-chain-spec = { default-features = false, git = "https://github.com/paritytech/polkadot-sdk.git", tag = "polkadot-stable2603" } sc-consensus = { default-features = false, git = "https://github.com/paritytech/polkadot-sdk.git", tag = "polkadot-stable2603" } sc-network = { default-features = false, git = "https://github.com/paritytech/polkadot-sdk.git", tag = "polkadot-stable2603" } +sc-network-sync = { default-features = false, git = "https://github.com/paritytech/polkadot-sdk.git", tag = "polkadot-stable2603" } sc-executor = { default-features = false, git = "https://github.com/paritytech/polkadot-sdk.git", tag = "polkadot-stable2603" } sc-consensus-grandpa = { default-features = false, git = "https://github.com/paritytech/polkadot-sdk.git", tag = "polkadot-stable2603" } sc-consensus-aura = { default-features = false, git = "https://github.com/paritytech/polkadot-sdk.git", tag = "polkadot-stable2603" } diff --git a/ledger/src/lib.rs b/ledger/src/lib.rs index f964102d5..be2acb455 100644 --- a/ledger/src/lib.rs +++ b/ledger/src/lib.rs @@ -136,6 +136,109 @@ pub fn drop_all_default_storage() { ledger_9::storage::drop_default_storage_if_exists(); } +#[cfg(feature = "std")] +/// Serialize the ledger arena snapshot at `state_key` into the canonical, `Ledger`-rooted warp +/// transfer blob (trustless warp ledger-sync, M1.2 server side). +/// +/// `unified` selects the ParityDb instantiation, matching the operator's `storage_separation` +/// config: the two modes register `default_storage` under different `D` type ids (separate = column +/// offset 0; unified = offset `NUM_COLUMNS_POLKADOT`, sharing substrate's parity-db). The blob bytes +/// are identical across modes. 
+/// +/// Uses the latest ledger version (`ledger_9`): warp-sync targets are near the chain tip, where the +/// active ledger version is the latest. (Assumption — deferred: a node warp-syncing to a block +/// governed by an *older* ledger version would need per-version dispatch here; not reachable today +/// since warp always targets the tip.) +pub fn serialize_ledger_snapshot( + unified: bool, + state_key: &[u8], +) -> Result, ledger_9::api::LedgerApiError> { + type Sig = ledger_9::TransactionSignature; + type DbSeparate = ledger_9::ledger_storage_local::db::ParityDb; + type DbUnified = ledger_9::ledger_storage_local::db::ParityDb< + sha2::Sha256, + ledger_9::ledger_storage_local::db::paritydb::OwnedDb, + { midnight_primitives_ledger::LedgerStorageExt::COLUMN_OFFSET }, + >; + + if unified { + ledger_9::Bridge::::serialize_ledger_snapshot(state_key) + } else { + ledger_9::Bridge::::serialize_ledger_snapshot(state_key) + } +} + +/// Failure modes of [`import_verified_ledger_snapshot`]. All are non-fatal to the chain: the caller +/// discards the data, reports the peer, and retries from another (warp spec M4.1). +#[cfg(feature = "std")] +#[derive(Debug)] +pub enum SnapshotImportError { + /// The on-chain `StateKey` bytes failed to decode to a `TypedArenaKey` (the inner + /// `LedgerApiError` is version-specific, so it is rendered to a string here). + StateKeyDecode(String), + /// The transferred blob failed the arena's native (multi-pass, untrusted-safe) deserialization + /// — malformed, truncated, or internally inconsistent node graph. + Deserialize(std::io::Error), + /// The blob deserialized cleanly but its recomputed root key does **not** equal the on-chain + /// `StateKey`: the peer served a different (or tampered) ledger. 
**Never persisted.** + RootMismatch, +} + +#[cfg(feature = "std")] +impl core::fmt::Display for SnapshotImportError { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + match self { + SnapshotImportError::StateKeyDecode(e) => { + write!(f, "failed to decode on-chain StateKey: {e}") + }, + SnapshotImportError::Deserialize(e) => { + write!(f, "failed to deserialize ledger snapshot: {e}") + }, + SnapshotImportError::RootMismatch => { + write!(f, "ledger snapshot root key does not match on-chain StateKey") + }, + } + } +} + +#[cfg(feature = "std")] +impl std::error::Error for SnapshotImportError {} + +#[cfg(feature = "std")] +/// Verify a `Ledger`-rooted warp snapshot `blob` against the on-chain `expected_state_key` and, on +/// success, persist it into the already-open arena backend so `get_lazy(StateKey)` resolves (warp +/// ledger-sync M1.3 verification + M1.4 import). `unified` selects the DB instantiation, as in +/// [`serialize_ledger_snapshot`]. Uses the latest ledger version (`ledger_9`) — same near-tip +/// assumption noted there. +/// +/// The caller must hold the authoring/import gate (the arena is single-writer) — see +/// `warp-ledger-sync-m1.4a-spike.md`. 
+pub fn import_verified_ledger_snapshot( + unified: bool, + blob: &[u8], + expected_state_key: &[u8], +) -> Result<(), SnapshotImportError> { + type Sig = ledger_9::TransactionSignature; + type DbSeparate = ledger_9::ledger_storage_local::db::ParityDb; + type DbUnified = ledger_9::ledger_storage_local::db::ParityDb< + sha2::Sha256, + ledger_9::ledger_storage_local::db::paritydb::OwnedDb, + { midnight_primitives_ledger::LedgerStorageExt::COLUMN_OFFSET }, + >; + + if unified { + ledger_9::Bridge::::import_verified_ledger_snapshot( + blob, + expected_state_key, + ) + } else { + ledger_9::Bridge::::import_verified_ledger_snapshot( + blob, + expected_state_key, + ) + } +} + mod common; pub mod types { diff --git a/ledger/src/versions/common/mod.rs b/ledger/src/versions/common/mod.rs index 7c7edcedc..cae6b1f64 100644 --- a/ledger/src/versions/common/mod.rs +++ b/ledger/src/versions/common/mod.rs @@ -729,6 +729,83 @@ where api.serialize(&ledger_state.as_typed_key()) } + /// Serialize the full ledger arena snapshot at `state_key` into the canonical, `Ledger`-rooted + /// transfer blob used by trustless warp ledger-sync: `derived_tag_prefix ‖ + /// TopoSortedNodes(Ledger DAG)`. + /// + /// Mirrors the single-pass technique of the toolkit's `serialize_ledger_state_fast`, but roots + /// at `Ledger` (the `Sp` from `get_ledger` is an `Sp`) rather than `LedgerState` — see + /// `warp-ledger-sync-spec.md` ODD-1. Because the blob is rooted at `Ledger`, its recomputed + /// content-address root key equals the on-chain `pallet_midnight::StateKey`, which is exactly + /// what the client verifies against (M1.3). The tag prefix is **derived** + /// (`GLOBAL_TAG ‖ ::tag()`), never hardcoded (spec §8 format-lockstep). 
+ pub fn serialize_ledger_snapshot(state_key: &[u8]) -> Result, LedgerApiError> { + use ledger_storage_local::arena::TopoSortedNodes; + use midnight_serialize_local::{GLOBAL_TAG, Serializable}; + use types::SerializationError; + + let api = api::new(); + let ledger = Self::get_ledger(&api, state_key)?; + + // One `serialize_to_node_list()` pass (the derived `Serializable` impl would do two — once + // for `serialized_size`, once for `serialize` — each a full topo-sort of a multi-million + // node DAG), written directly. Byte-identical to the default impl's output. + let nodes: TopoSortedNodes = ledger.serialize_to_node_list(); + let tag_prefix = format!("{}{}:", GLOBAL_TAG, as Tagged>::tag()); + let mut bytes = Vec::with_capacity(tag_prefix.len() + nodes.serialized_size()); + bytes.extend_from_slice(tag_prefix.as_bytes()); + nodes.serialize(&mut bytes).map_err(|e| { + log::error!(target: LOG_TARGET, "Failed to serialize ledger snapshot: {e:?}"); + LedgerApiError::Serialization(SerializationError::LedgerState) + })?; + Ok(bytes) + } + + /// Import a verified, `Ledger`-rooted warp snapshot `blob` into the already-open arena backend, + /// binding it to the trie anchor `expected_state_key` (the on-chain `pallet_midnight::StateKey` + /// the warp-recovered trie already holds). + /// + /// Reconstruction uses the arena's **native multi-pass deserializer** + /// (`Arena::deserialize_sp`, designed for untrusted wire input — it re-hashes every node), then + /// asserts the reconstructed root key equals `expected_state_key` before persisting. So a + /// malicious or faulty peer can at worst cause a rejected import (→ peer report + retry by the + /// caller), never state corruption. + /// + /// Persists + flushes into the live `default_storage` so `get_lazy(StateKey)` resolves. The + /// lifecycle (in-process, no restart, same `alloc`/`persist`/`flush` path live block execution + /// uses) was validated in `warp-ledger-sync-m1.4a-spike.md`. 
The caller (warp client driver, + /// M1.3) MUST hold the authoring/import gate so no block executes against the arena concurrently + /// — the arena is single-writer. + pub fn import_verified_ledger_snapshot( + blob: &[u8], + expected_state_key: &[u8], + ) -> Result<(), crate::SnapshotImportError> { + use crate::SnapshotImportError; + + let api = api::new(); + let expected: TypedArenaKey, D::Hasher> = api + .tagged_deserialize(expected_state_key) + .map_err(|e| SnapshotImportError::StateKeyDecode(format!("{e:?}")))?; + + // Native verifying (untrusted-safe) deserialize of the `Ledger`-rooted blob into the live + // arena; re-allocating the loaded value yields the persistable `Sp`. + let ledger: Ledger = + helpers_local::deserialize(blob).map_err(SnapshotImportError::Deserialize)?; + let mut sp = default_storage::().arena.alloc(ledger); + + // Cryptographic bind to the trie anchor: the reconstructed root must equal the on-chain + // `StateKey`. This is the whole security argument — reject anything else. + let computed: TypedArenaKey, D::Hasher> = sp.as_typed_key(); + if computed != expected { + return Err(SnapshotImportError::RootMismatch); + } + + sp.persist(); + default_storage::().with_backend(|backend| backend.flush_all_changes_to_db()); + log::info!(target: LOG_TARGET, "Imported verified ledger snapshot ({} bytes)", blob.len()); + Ok(()) + } + pub fn get_unclaimed_amount( state_key: &[u8], beneficiary: &[u8], diff --git a/node/Cargo.toml b/node/Cargo.toml index 0f9a52fa0..f3e48e1ec 100644 --- a/node/Cargo.toml +++ b/node/Cargo.toml @@ -25,6 +25,8 @@ serde_json = { workspace = true, features = ["preserve_order"] } serde.workspace = true async-trait.workspace = true futures.workspace = true +# Must match the version sc-network's `request_response_config` inbound queue expects (1.9). 
+async-channel = "1.9" sc-cli.workspace = true sc-chain-spec.workspace = true @@ -48,6 +50,8 @@ sc-consensus-slots.workspace = true sc-offchain.workspace = true sc-client-api.workspace = true sc-network.workspace = true +sc-network-sync.workspace = true +sp-consensus.workspace = true sc-utils.workspace = true sp-consensus-aura.workspace = true sp-consensus-beefy.workspace = true diff --git a/node/src/lib.rs b/node/src/lib.rs index d628c5f96..f263cfcff 100644 --- a/node/src/lib.rs +++ b/node/src/lib.rs @@ -38,3 +38,4 @@ pub mod service; pub mod sidechain_params_cmd; pub mod subscription_bounds; mod util; +pub mod warp_ledger_sync; diff --git a/node/src/service.rs b/node/src/service.rs index fc4a59c04..f989627b9 100644 --- a/node/src/service.rs +++ b/node/src/service.rs @@ -242,6 +242,9 @@ type MidnightService = sc_service::PartialComponents< sc_consensus_beefy::BeefyRPCLinks, Option, DataSources, + // Shared warp ledger-sync recovery gate (gates block import + authoring until the arena is + // recovered). Created in `new_partial` so it can wrap the import queue's block import. + Arc, ), >; @@ -320,9 +323,16 @@ pub fn new_partial( .build_storage() .map_err(sp_blockchain::Error::Storage)?; + // Commit the genesis trie state only when NOT warp/fast syncing. With `--sync warp`, + // `no_genesis()` is true, so we skip the commit — leaving `finalized_state == None` so substrate + // will actually engage warp sync (it refuses warp on a DB that already has a finalized state). + // This mirrors stock substrate (`sc_service::builder`'s `!config.no_genesis()`); hardcoding + // `true` here previously meant warp sync silently fell back to full sync. Full-sync behavior is + // unchanged (`no_genesis()` is false → commit). The ledger arena genesis init in `open_paritydb` + // is independent and still runs, so `default_storage` is set for post-warp recovery. 
let genesis_block_builder = GenesisBlockBuilder::::new( genesis_storage, - true, + !config.no_genesis(), backend.clone(), executor.clone(), genesis_extrinsics?, @@ -413,6 +423,15 @@ pub fn new_partial( let time_source = Arc::new(SystemTimeSource); let inherent_config = CreateInherentDataConfig::new(epoch_config, sc_slot_config, time_source); + // Warp ledger-sync recovery gate, shared by the import queue (below), the authoring oracle, and + // the recovery monitor (both in `new_full`). Wrapping the import queue's block import here holds + // post-warp block imports until the arena is recovered + verified. + let recovery_gate = crate::warp_ledger_sync::oracle::RecoveryGate::new(); + let gated_block_import = crate::warp_ledger_sync::block_import::GatedBlockImport::new( + grandpa_block_import.clone(), + recovery_gate.clone(), + ); + let import_queue = partner_chains_aura_import_queue::import_queue::< AuraPair, _, @@ -422,7 +441,7 @@ pub fn new_partial( _, McHashInherentDigest, >(ImportQueueParams { - block_import: grandpa_block_import.clone(), + block_import: gated_block_import, justification_import: Some(Box::new(grandpa_block_import.clone())), client: client.clone(), create_inherent_data_providers: VerifierCIDP::new( @@ -456,6 +475,7 @@ pub fn new_partial( beefy_rpc_links, telemetry, data_sources, + recovery_gate, ), }; @@ -477,6 +497,9 @@ pub async fn new_full Result<(TaskManager, Arc), ServiceError> { let database_source = config.database.clone(); + // Captured before `storage_config` is moved into `new_partial`: selects the ParityDb layout the + // warp ledger-sync server/importer dispatch to. 
+ let warp_ledger_unified = matches!(storage_config.separation, StorageSeparation::Unified); let new_partial_components = new_partial(&config, epoch_config.clone(), midnight_cfg, storage_config, tx_filter_config)?; @@ -496,6 +519,7 @@ pub async fn new_full::new::< + Network, + >( + genesis_hash, + config.chain_spec.fork_id(), + client.clone(), + warp_ledger_unified, + config.network.default_peers_set_num_full as usize, + ); + net_config.add_request_response_protocol(ledger_sync_cfg); + let warp_sync = Arc::new(sc_consensus_grandpa::warp_proof::NetworkProvider::new( backend.clone(), grandpa_link.shared_authority_set().clone(), @@ -565,6 +610,37 @@ pub async fn new_full( + client.clone(), + sync_service.clone(), + network.clone(), + warp_ledger_recovery_gate.clone(), + ledger_sync_protocol_name.clone(), + warp_ledger_unified, + ), + ); + if config.offchain_worker.enabled { task_manager.spawn_handle().spawn( "offchain-workers-runner", @@ -742,7 +818,12 @@ pub async fn new_full { + inner: Inner, + gate: Arc, +} + +impl GatedBlockImport { + pub fn new(inner: Inner, gate: Arc) -> Self { + Self { inner, gate } + } +} + +#[async_trait::async_trait] +impl BlockImport for GatedBlockImport +where + B: BlockT, + Inner: BlockImport + Send + Sync, +{ + type Error = Inner::Error; + + async fn check_block(&self, block: BlockCheckParams) -> Result { + self.inner.check_block(block).await + } + + async fn import_block(&self, block: BlockImportParams) -> Result { + if self.gate.ledger_recovery_in_progress() { + log::debug!( + target: LOG_TARGET, + "Holding block import until the warp-recovered ledger arena is verified" + ); + while self.gate.ledger_recovery_in_progress() { + tokio::time::sleep(POLL_INTERVAL).await; + } + log::debug!(target: LOG_TARGET, "Ledger arena verified; resuming block import"); + } + self.inner.import_block(block).await + } +} diff --git a/node/src/warp_ledger_sync/client.rs b/node/src/warp_ledger_sync/client.rs new file mode 100644 index 000000000..8d495c5ce 
--- /dev/null +++ b/node/src/warp_ledger_sync/client.rs @@ -0,0 +1,181 @@ +// This file is part of midnight-node. +// Copyright (C) Midnight Foundation +// SPDX-License-Identifier: Apache-2.0 +// Licensed under the Apache License, Version 2.0 (the "License"); +// You may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! M1.3 — Ledger-sync client driver + verification. +//! +//! After warp + state-sync complete (target block N captured by the monitor, M2.1), this drives the +//! client side of the protocol: read the on-chain `StateKey` at N (from the warp-recovered trie), +//! fetch the `Ledger`-rooted arena blob in byte ranges from peers, then hand the assembled blob to +//! [`midnight_node_ledger::import_verified_ledger_snapshot`], which verifies its root against the +//! `StateKey` and persists it on success. +//! +//! Verification + persistence live in the ledger crate (next to the arena); this module is pure +//! network orchestration. No peer is trusted: a bad blob fails the root check and is discarded. + +use std::{marker::PhantomData, sync::Arc}; + +use parity_scale_codec::{Decode, Encode}; +use sc_client_api::{Backend, StorageProvider}; +use sc_network::{ + IfDisconnected, NetworkRequest, PeerId, ProtocolName, request_responses::RequestFailure, +}; +use sp_runtime::traits::Block as BlockT; + +use super::{ + LOG_TARGET, + protocol::{ChunkAssembler, LedgerSyncRequest, LedgerSyncResponse, MAX_LEDGER_SYNC_CHUNK}, + read_state_key, +}; + +/// Drives ledger-arena recovery against peers over the ledger-sync request/response protocol. 
+/// +/// `Network` is `?Sized` so the node's `Arc` handle (which has `NetworkRequest` +/// as a supertrait) can be passed directly. +pub struct LedgerSyncClient { + client: Arc, + network: Arc, + protocol_name: ProtocolName, + /// Whether the local arena uses the unified ParityDb layout (forwarded to the importer). + unified: bool, + _phantom: PhantomData<(B, BE)>, +} + +impl LedgerSyncClient +where + B: BlockT, + BE: Backend + 'static, + Client: StorageProvider + Send + Sync + 'static, + Network: NetworkRequest + Send + Sync + ?Sized + 'static, +{ + pub fn new( + client: Arc, + network: Arc, + protocol_name: ProtocolName, + unified: bool, + ) -> Self { + Self { client, network, protocol_name, unified, _phantom: PhantomData } + } + + /// Recover, verify, and import the ledger arena at `target` (the captured warp target N) by + /// trying `peers` in order. Returns `Ok` as soon as one peer yields a complete blob that + /// verifies against the on-chain `StateKey` and imports; otherwise [`ClientError::AllPeersFailed`]. + /// + /// The caller must hold the authoring/import gate while this runs (single-writer arena, M2.2). + pub async fn recover(&self, target: B::Hash, peers: &[PeerId]) -> Result<(), ClientError> { + let state_key = read_state_key::(&self.client, target)? + .ok_or(ClientError::NoStateKey)?; + + if peers.is_empty() { + return Err(ClientError::NoPeers); + } + + for &peer in peers { + let blob = match self.fetch_blob_from(peer, target).await { + Ok(blob) => blob, + Err(e) => { + log::debug!(target: LOG_TARGET, "ledger fetch from {peer} failed: {e}; trying next peer"); + continue; + }, + }; + + // Verification happens inside the importer (root must equal `state_key`); a verify + // failure means the peer served bad data — discard and try the next one. + // M4.1: a reputation report on the peer belongs here. 
+ match midnight_node_ledger::import_verified_ledger_snapshot( + self.unified, + &blob, + &state_key, + ) { + Ok(()) => { + log::info!( + target: LOG_TARGET, + "Recovered + verified ledger arena at {target:?} from {peer} ({} bytes)", + blob.len() + ); + return Ok(()); + }, + Err(e) => { + log::warn!(target: LOG_TARGET, "ledger import from {peer} failed: {e}; trying next peer"); + }, + } + } + + Err(ClientError::AllPeersFailed) + } + + /// Fetch the full blob from a single peer by paging contiguous byte ranges in order. + /// + /// (Parallel / multi-peer range fetch is a permitted optimization — spec ODD-3 — deferred; the + /// `ChunkAssembler` already supports resume by `next_offset`.) + async fn fetch_blob_from(&self, peer: PeerId, target: B::Hash) -> Result, ClientError> { + // First range establishes `total_len`. + let first = self.request_range(peer, target, 0).await?; + let mut assembler = ChunkAssembler::new(first.total_len); + assembler.accept(first.offset, &first.bytes)?; + + while !assembler.is_complete() { + let next = self.request_range(peer, target, assembler.next_offset()).await?; + if next.bytes.is_empty() { + // Server returned an empty range before completion: treat as a truncated transfer. + // `into_blob` below will surface `Incomplete`. + break; + } + assembler.accept(next.offset, &next.bytes)?; + } + + Ok(assembler.into_blob()?) + } + + async fn request_range( + &self, + peer: PeerId, + target: B::Hash, + offset: u64, + ) -> Result { + let request = + LedgerSyncRequest { target_hash: target, offset, max_len: MAX_LEDGER_SYNC_CHUNK }; + let (bytes, _protocol) = self + .network + .request( + peer, + self.protocol_name.clone(), + request.encode(), + None, + IfDisconnected::ImmediateError, + ) + .await?; + Ok(LedgerSyncResponse::decode(&mut &bytes[..])?) + } +} + +/// Failure modes of [`LedgerSyncClient::recover`]. All are non-fatal: the monitor leaves the +/// authoring gate closed and retries. 
+#[derive(Debug, thiserror::Error)] +pub enum ClientError { + #[error("no StateKey present at the target block")] + NoStateKey, + #[error("no peers available to recover the ledger from")] + NoPeers, + #[error("blockchain error: {0}")] + Client(#[from] sp_blockchain::Error), + #[error("network request failed: {0:?}")] + Request(#[from] RequestFailure), + #[error("failed to decode response: {0}")] + Decode(#[from] parity_scale_codec::Error), + #[error("chunk assembly failed: {0}")] + Assemble(#[from] super::protocol::AssembleError), + #[error("all peers failed to provide a verifiable snapshot")] + AllPeersFailed, +} diff --git a/node/src/warp_ledger_sync/integration_tests.rs b/node/src/warp_ledger_sync/integration_tests.rs new file mode 100644 index 000000000..e1ed88ccb --- /dev/null +++ b/node/src/warp_ledger_sync/integration_tests.rs @@ -0,0 +1,83 @@ +// This file is part of midnight-node. +// Copyright (C) Midnight Foundation +// SPDX-License-Identifier: Apache-2.0 +// Licensed under the Apache License, Version 2.0 (the "License"); +// You may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! In-process round-trip test for the warp ledger-sync core (M1), with no networking: +//! init a real arena from genesis → serialize the `Ledger`-rooted snapshot (server, M1.2) → page it +//! through the transport chunker + reassembler (M1.1) → import + verify against the on-chain +//! `StateKey` (client/import, M1.3/M1.4). Also asserts the security property: a tampered blob is +//! rejected (`RootMismatch`), never imported. +//! +//! 
Run isolated (it touches the process-global `default_storage` singleton): +//! `cargo test -p midnight-node ledger_snapshot_roundtrip`. + +use midnight_node_res::networks::{MidnightNetwork, UndeployedNetwork}; + +use super::protocol::{ChunkAssembler, build_response}; + +/// Page `blob` end-to-end the way the client would, with a deliberately small chunk size to force +/// multiple ranges, and return the reassembled bytes. +fn page_and_reassemble(blob: &[u8], chunk: u32) -> Vec { + let mut assembler = ChunkAssembler::new(blob.len() as u64); + loop { + let response = build_response(blob, assembler.next_offset(), chunk); + if response.bytes.is_empty() { + break; + } + assembler.accept(response.offset, &response.bytes).expect("contiguous chunk"); + } + assembler.into_blob().expect("complete blob") +} + +#[test] +fn ledger_snapshot_roundtrip_serialize_chunk_verify_import() { + let dir = tempfile::tempdir().expect("tempdir"); + let genesis_state = UndeployedNetwork.genesis_state().to_vec(); + + // Initialize the arena from genesis in Separate mode. This sets the process-global + // `default_storage`, persists the genesis ledger, and returns the on-chain `StateKey` bytes + // (the tagged `TypedArenaKey`) — exactly what `pallet_midnight::StateKey` would hold. + let state_key = midnight_node_ledger::ledger_9::storage::init_storage_paritydb_separate( + dir.path(), + &genesis_state, + 1024, + ); + assert!(!state_key.is_empty(), "genesis init must produce a StateKey"); + + // Server side (M1.2): serialize the `Ledger`-rooted snapshot at that StateKey. + let blob = midnight_node_ledger::serialize_ledger_snapshot(false, &state_key) + .expect("serialize ledger snapshot"); + assert!(blob.len() > state_key.len(), "snapshot blob should carry the arena, not just the key"); + + // Transport (M1.1): page into 4 KiB ranges and reassemble; must be byte-identical. 
+ let reassembled = page_and_reassemble(&blob, 4096); + assert_eq!(reassembled, blob, "reassembled blob must be byte-identical to the server's"); + + // Client/import (M1.3 + M1.4): verify root == StateKey and persist. Idempotent against the + // already-initialized arena (content-addressed; genesis nodes dedup). + midnight_node_ledger::import_verified_ledger_snapshot(false, &reassembled, &state_key) + .expect("verified import of a faithful snapshot should succeed"); + + // Security property: tamper a byte well past the tag prefix (in the node-data region). The + // native multi-pass deserializer / root check must reject it — never a successful import. + let mut tampered = blob.clone(); + let idx = tampered.len() / 2; + tampered[idx] ^= 0xFF; + let result = + midnight_node_ledger::import_verified_ledger_snapshot(false, &tampered, &state_key); + assert!( + result.is_err(), + "a tampered snapshot must fail verification and not be imported, got {result:?}" + ); +} diff --git a/node/src/warp_ledger_sync/mod.rs b/node/src/warp_ledger_sync/mod.rs new file mode 100644 index 000000000..94ac6e7c3 --- /dev/null +++ b/node/src/warp_ledger_sync/mod.rs @@ -0,0 +1,85 @@ +// This file is part of midnight-node. +// Copyright (C) Midnight Foundation +// SPDX-License-Identifier: Apache-2.0 +// Licensed under the Apache License, Version 2.0 (the "License"); +// You may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Trustless warp-sync extension for Midnight. +//! +//! 
Standard Substrate warp-sync recovers headers, GRANDPA finality, and the runtime state trie, +//! but **not** the Midnight ledger arena (the content-addressed blob behind +//! `pallet_midnight::StateKey`, which lives outside the trie). A warp-synced node therefore holds a +//! valid `StateKey` pointing into an empty arena and fails on the next block. This module adds a +//! side request/response protocol that recovers the arena after warp+state-sync completes, with +//! full cryptographic verification against the `StateKey` the trie already recovered. +//! +//! See `warp-ledger-sync-spec.md` for the full design. Module map: +//! - [`protocol`] — wire message types, codec, protocol naming, range serving + reassembly. +//! - [`server`] — serves the `Ledger`-rooted arena blob at a finalized target block as byte ranges. +//! - [`client`] — fetches the blob from peers and hands it to the ledger crate for verification + +//! import. Verification against the on-chain `StateKey` and persistence live in +//! `midnight_node_ledger::import_verified_ledger_snapshot`, which reuses the arena's **native** +//! multi-pass deserializer (`Arena::deserialize_sp`) for untrusted input rather than a bespoke +//! re-hash, then asserts the recomputed root equals `StateKey` before persisting in-process +//! (`alloc`/`persist`/`flush`, no restart — see `warp-ledger-sync-m1.4a-spike.md`). +//! - [`monitor`] — detects warp completion, captures the target block, drives [`client`], releases +//! the gate. [`oracle`] keeps AURA from authoring until recovery is verified. + +pub mod block_import; +pub mod client; +pub mod monitor; +pub mod oracle; +pub mod protocol; +pub mod server; + +#[cfg(test)] +mod integration_tests; + +use parity_scale_codec::Decode; +use sc_client_api::{Backend, StorageKey, StorageProvider}; +use sp_runtime::traits::Block as BlockT; + +/// Log target shared by the warp ledger-sync server and client. 
+pub(crate) const LOG_TARGET: &str = "midnight-ledger-sync"; + +/// Raw storage key for `pallet_midnight::StateKey`: `twox_128("Midnight") ++ twox_128("StateKey")`. +/// No runtime API exposes `StateKey` (it has only a `#[pallet::getter]`), so both the server and +/// client read it by raw key. +pub(crate) fn state_key_storage_key() -> StorageKey { + let mut key = Vec::with_capacity(32); + key.extend_from_slice(&sp_core::twox_128(b"Midnight")); + key.extend_from_slice(&sp_core::twox_128(b"StateKey")); + StorageKey(key) +} + +/// Read and decode the on-chain `StateKey` at `hash` — the tagged `TypedArenaKey` bytes the +/// ledger arena is keyed by. The storage value is `SCALE(Vec)`; the inner bytes are the key. +/// Returns `Ok(None)` if the pallet has no `StateKey` at that block. +pub(crate) fn read_state_key( + client: &Client, + hash: B::Hash, +) -> Result>, sp_blockchain::Error> +where + B: BlockT, + BE: Backend, + Client: StorageProvider, +{ + match client.storage(hash, &state_key_storage_key())? { + Some(raw) => { + let inner = Vec::::decode(&mut &raw.0[..]).map_err(|e| { + sp_blockchain::Error::Backend(format!("failed to decode pallet StateKey: {e}")) + })?; + Ok(Some(inner)) + }, + None => Ok(None), + } +} diff --git a/node/src/warp_ledger_sync/monitor.rs b/node/src/warp_ledger_sync/monitor.rs new file mode 100644 index 000000000..e1628b0e3 --- /dev/null +++ b/node/src/warp_ledger_sync/monitor.rs @@ -0,0 +1,120 @@ +// This file is part of midnight-node. +// Copyright (C) Midnight Foundation +// SPDX-License-Identifier: Apache-2.0 +// Licensed under the Apache License, Version 2.0 (the "License"); +// You may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! M2.1 — Warp-completion monitor & target capture. +//! +//! Spawned for the lifetime of the node. On a **full sync** the warp path is never observed and the +//! task exits without ever arming the gate. On a **warp sync** it arms the gate, waits for warp + +//! state-sync to finish, captures the target block N, drives the client driver to recover + verify + +//! import the ledger arena, then releases the gate so AURA may author. + +use std::{sync::Arc, time::Duration}; + +use sc_client_api::{Backend, StorageProvider}; +use sc_network::{NetworkRequest, ProtocolName}; +use sc_network_sync::SyncingService; +use sp_blockchain::HeaderBackend; +use sp_consensus::SyncOracle; +use sp_runtime::traits::Block as BlockT; + +use super::{LOG_TARGET, client::LedgerSyncClient, oracle::RecoveryGate}; + +/// How often to poll sync status. Short so we reliably observe `warp_sync == Some(..)` while warp +/// is in progress (warp takes many seconds), and arm the gate well before warp completes. +const POLL_INTERVAL: Duration = Duration::from_secs(2); +/// Backoff between failed full recovery attempts. +const RETRY_DELAY: Duration = Duration::from_secs(10); + +/// Run the recovery monitor to completion (warp path) or early exit (full sync). Intended to be +/// spawned as a non-essential task. 
+pub async fn run_recovery_monitor( + client: Arc, + sync_service: Arc>, + network: Arc, + gate: Arc, + protocol_name: ProtocolName, + unified: bool, +) where + B: BlockT, + BE: Backend + 'static, + Client: HeaderBackend + StorageProvider + Send + Sync + 'static, + Network: NetworkRequest + Send + Sync + ?Sized + 'static, +{ + // 1. Detect the warp path and wait for warp + state-sync to finish. We check status *before* + // sleeping so we observe `warp_sync == Some(..)` early (it stays `Some` throughout the + // multi-second warp), and arm the gate the moment warp is seen — so AURA is gated through the + // whole post-warp window (the inner oracle already gates during warp). + // + // Completion is keyed on `state_sync done + finalized_state present` rather than catching the + // exact `WarpSyncPhase::Complete` tick (which can be transient): once warp's state-sync has + // populated a finalized state, the trie anchor (`StateKey`) we verify against exists. + let mut saw_warp = false; + let (target_hash, target_number) = loop { + let status = sync_service.status().await.ok(); + + if let Some(status) = &status { + if status.warp_sync.is_some() && !saw_warp { + saw_warp = true; + gate.arm(); + log::info!( + target: LOG_TARGET, + "Warp sync detected; ledger arena recovery armed (authoring gated until verified)" + ); + } + } + + if saw_warp { + let state_sync_done = status.as_ref().map(|s| s.state_sync.is_none()).unwrap_or(false); + if state_sync_done { + if let Some(target) = client.info().finalized_state { + break target; + } + } + } else { + // Full-sync path: once the node is no longer major-syncing, ledger recovery is never + // needed — the arena was built block-by-block. Exit without arming. 
+ if !sync_service.is_major_syncing() { + log::debug!(target: LOG_TARGET, "Full sync in progress; ledger arena recovery not required"); + return; + } + } + + tokio::time::sleep(POLL_INTERVAL).await; + }; + log::info!( + target: LOG_TARGET, + "Recovering ledger arena at warp target #{target_number} ({target_hash:?})" + ); + + // 2. Recover + verify + import, retrying across the current peer set until one succeeds. + let driver = LedgerSyncClient::new(client, network, protocol_name, unified); + loop { + let peers: Vec<_> = match sync_service.peers_info().await { + Ok(info) => info.into_iter().map(|(peer, _)| peer).collect(), + Err(_) => Vec::new(), + }; + match driver.recover(target_hash, &peers).await { + Ok(()) => break, + Err(e) => { + log::warn!(target: LOG_TARGET, "Ledger arena recovery attempt failed: {e}; retrying"); + tokio::time::sleep(RETRY_DELAY).await; + }, + } + } + + // 3. Release the gate: opens both the authoring oracle and the block-import gate. + gate.mark_ledger_verified(); + log::info!(target: LOG_TARGET, "Ledger arena recovered + verified; authoring + import gate released"); +} diff --git a/node/src/warp_ledger_sync/oracle.rs b/node/src/warp_ledger_sync/oracle.rs new file mode 100644 index 000000000..2e638373e --- /dev/null +++ b/node/src/warp_ledger_sync/oracle.rs @@ -0,0 +1,147 @@ +// This file is part of midnight-node. +// Copyright (C) Midnight Foundation +// SPDX-License-Identifier: Apache-2.0 +// Licensed under the Apache License, Version 2.0 (the "License"); +// You may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and +// limitations under the License. + +//! M2.2 — recovery gate + wrapping [`SyncOracle`]. +//! +//! The shared [`RecoveryGate`] is the single source of truth for "is the warp-recovered ledger +//! arena ready yet". It gates two things until recovery is verified: +//! - **block authoring**, via [`MidnightSyncOracle`] passed to AURA, and +//! - **block import**, via [`super::block_import::GatedBlockImport`] in the import queue — so the +//! node does not execute post-warp blocks against an empty arena (which would hit +//! `NoLedgerState`) or race the recovery writer. +//! +//! The gate is armed **only** when the monitor detects the warp path. On a full sync it stays +//! disarmed, so both gates are pure passthroughs and full-sync nodes are never affected. + +use std::sync::{ + Arc, + atomic::{AtomicBool, Ordering}, +}; + +use sp_consensus::SyncOracle; + +/// Shared recovery flags, flipped by the monitor task and read by the authoring oracle and the +/// block-import gate. Cloneable `Arc` handle so all observe the same state. +#[derive(Debug, Default)] +pub struct RecoveryGate { + /// Set true once the warp path is detected; while false the gate is a pure passthrough. + recovery_pending: AtomicBool, + /// Set true once the ledger arena is recovered and verified. + ledger_verified: AtomicBool, +} + +impl RecoveryGate { + /// A fresh gate: nothing pending (full-sync default). + pub fn new() -> Arc { + Arc::new(Self::default()) + } + + /// Arm the gate: the warp path was taken, so authoring + import must wait for verification. + pub fn arm(&self) { + self.recovery_pending.store(true, Ordering::Release); + } + + /// Mark the ledger arena verified + imported. Opens both gates. + pub fn mark_ledger_verified(&self) { + self.ledger_verified.store(true, Ordering::Release); + } + + /// Whether warp recovery is in progress: armed but not yet verified. 
Both the authoring oracle + /// and the import gate hold while this is true. + pub fn ledger_recovery_in_progress(&self) -> bool { + self.recovery_pending.load(Ordering::Acquire) + && !self.ledger_verified.load(Ordering::Acquire) + } +} + +/// Wraps the node's inner [`SyncOracle`] (the `SyncingService`) so AURA reports "still syncing" +/// (and therefore does not author) while the warp-recovered ledger arena is being recovered. +#[derive(Clone)] +pub struct MidnightSyncOracle { + inner: Inner, + gate: Arc, +} + +impl MidnightSyncOracle { + pub fn new(inner: Inner, gate: Arc) -> Self { + Self { inner, gate } + } +} + +impl SyncOracle for MidnightSyncOracle { + fn is_major_syncing(&self) -> bool { + self.inner.is_major_syncing() || self.gate.ledger_recovery_in_progress() + } + + fn is_offline(&self) -> bool { + self.inner.is_offline() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + /// Minimal inner oracle whose state we control. + #[derive(Clone)] + struct MockOracle(Arc); + impl MockOracle { + fn new(major: bool) -> Self { + Self(Arc::new(AtomicBool::new(major))) + } + fn set(&self, v: bool) { + self.0.store(v, Ordering::Release); + } + } + impl SyncOracle for MockOracle { + fn is_major_syncing(&self) -> bool { + self.0.load(Ordering::Acquire) + } + fn is_offline(&self) -> bool { + false + } + } + + #[test] + fn full_sync_is_pure_passthrough() { + // Gate never armed (recovery_pending == false): behavior == inner. 
+ let inner = MockOracle::new(true); + let oracle = MidnightSyncOracle::new(inner.clone(), RecoveryGate::new()); + assert!(oracle.is_major_syncing(), "delegates to inner while inner is syncing"); + inner.set(false); + assert!(!oracle.is_major_syncing(), "not gated on full sync once inner is done"); + } + + #[test] + fn warp_node_gated_until_ledger_verified() { + let inner = MockOracle::new(false); // inner already finished warp+state-sync + let gate = RecoveryGate::new(); + let oracle = MidnightSyncOracle::new(inner, gate.clone()); + + gate.arm(); + assert!(oracle.is_major_syncing(), "armed + not verified -> gated"); + assert!(gate.ledger_recovery_in_progress()); + + gate.mark_ledger_verified(); + assert!(!oracle.is_major_syncing(), "verified -> released"); + assert!(!gate.ledger_recovery_in_progress()); + } + + #[test] + fn is_offline_always_delegates() { + let oracle = MidnightSyncOracle::new(MockOracle::new(true), RecoveryGate::new()); + assert!(!oracle.is_offline()); + } +} diff --git a/node/src/warp_ledger_sync/protocol.rs b/node/src/warp_ledger_sync/protocol.rs new file mode 100644 index 000000000..adfb157d0 --- /dev/null +++ b/node/src/warp_ledger_sync/protocol.rs @@ -0,0 +1,293 @@ +// This file is part of midnight-node. +// Copyright (C) Midnight Foundation +// SPDX-License-Identifier: Apache-2.0 +// Licensed under the Apache License, Version 2.0 (the "License"); +// You may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! M1.1 — Ledger-sync protocol message types, codec, naming, and the pure range-serving / +//! 
reassembly logic shared by the server (M1.2) and client (M1.3). +//! +//! The transferred payload is the canonical, `Ledger`-rooted arena blob (derived tag prefix ‖ +//! `TopoSortedNodes` of the `Ledger` DAG — see spec ODD-1). Transport pages it by **byte offset** +//! (not by semantic node): the server streams contiguous byte ranges and the client concatenates +//! them in order before deserialize + verify. The children-precede-parents property is intrinsic +//! to the serialized blob, so in-order byte concatenation preserves it automatically (ODD-3). + +use parity_scale_codec::{Decode, Encode}; + +/// Protocol name suffix; the full name is `/{genesis_hash}[/{fork_id}]/midnight-ledger-sync/1`. +pub const PROTOCOL_NAME_SUFFIX: &str = "midnight-ledger-sync/1"; + +/// Maximum number of bytes a single response chunk may carry. The server clamps a peer's +/// requested `max_len` to this; the network layer's `max_response_size` must be ≥ this plus codec +/// overhead. 1 MiB matches substrate's state-sync chunking. +pub const MAX_LEDGER_SYNC_CHUNK: u32 = 1024 * 1024; + +/// Request a contiguous byte range of the `Ledger`-rooted arena blob at `target_hash`. +/// +/// `target_hash` must be a finalized block whose state-sync target the server can serve (M1.2 +/// rejects non-finalized / unknown blocks). `offset`/`max_len` page the blob; `max_len` is clamped +/// server-side to [`MAX_LEDGER_SYNC_CHUNK`]. +#[derive(Debug, Clone, PartialEq, Eq, Encode, Decode)] +pub struct LedgerSyncRequest { + /// Finalized target block whose arena snapshot is requested. + pub target_hash: Hash, + /// Byte offset into the canonical blob to start from. + pub offset: u64, + /// Maximum number of bytes to return (clamped to [`MAX_LEDGER_SYNC_CHUNK`] by the server). + pub max_len: u32, +} + +/// A contiguous byte range of the canonical `Ledger`-rooted blob. 
+/// +/// `total_len` is the full blob size (lets the client learn the size up front and drive parallel / +/// resumable range fetches); `offset`/`bytes` are this chunk. +#[derive(Debug, Clone, PartialEq, Eq, Encode, Decode)] +pub struct LedgerSyncResponse { + /// Total length of the full canonical blob at the target block. + pub total_len: u64, + /// Byte offset of this chunk within the full blob. + pub offset: u64, + /// The chunk bytes: `blob[offset .. offset + bytes.len()]`. + pub bytes: Vec, +} + +/// Build the full ledger-sync protocol name from a genesis hash and optional fork id, mirroring +/// substrate's `/{hex_genesis}[/{fork}]/state/2` convention. +pub fn ledger_sync_protocol_name>( + genesis_hash: Hash, + fork_id: Option<&str>, +) -> String { + let genesis = hex::encode(genesis_hash.as_ref()); + match fork_id { + Some(fork) => format!("/{genesis}/{fork}/{PROTOCOL_NAME_SUFFIX}"), + None => format!("/{genesis}/{PROTOCOL_NAME_SUFFIX}"), + } +} + +/// Clamp a peer-requested `max_len` to the server limit. +pub fn clamp_max_len(requested: u32) -> u32 { + requested.min(MAX_LEDGER_SYNC_CHUNK) +} + +/// Build a response chunk for `[offset, offset + clamp(max_len))` of `blob` (server side, M1.2). +/// +/// Clamps `max_len`, never reads past the end of the blob, and yields an empty chunk if `offset` +/// is at or past the end (which signals completion to the client). +pub fn build_response(blob: &[u8], offset: u64, max_len: u32) -> LedgerSyncResponse { + let total_len = blob.len() as u64; + let start = offset.min(total_len); + let avail = total_len - start; + let len = (clamp_max_len(max_len) as u64).min(avail); + let start = start as usize; + let end = start + len as usize; + LedgerSyncResponse { total_len, offset, bytes: blob[start..end].to_vec() } +} + +/// Errors from reassembling response chunks into the full blob (client side, M1.3). 
+#[derive(Debug, Clone, PartialEq, Eq, thiserror::Error)] +pub enum AssembleError { + /// A chunk did not start where the previous one ended. Chunks must be fed in order; parallel + /// fetches must be reordered by `offset` before being accepted. + #[error("non-contiguous chunk: expected offset {expected}, got {got}")] + NonContiguous { + /// The offset the assembler expected next (its current filled length). + expected: u64, + /// The offset the chunk actually carried. + got: u64, + }, + /// A chunk would extend the blob past the advertised `total_len`. + #[error("chunk overflows total_len {total}: offset {offset} + len {len}")] + Overflow { + /// Advertised total length of the blob. + total: u64, + /// Offset of the offending chunk. + offset: u64, + /// Length of the offending chunk. + len: u64, + }, + /// `into_blob` was called before all bytes were received. + #[error("incomplete blob: have {have} of {total} bytes")] + Incomplete { + /// Bytes received so far. + have: u64, + /// Total bytes expected. + total: u64, + }, +} + +/// Reassembles ordered, contiguous response chunks into the full canonical blob. +/// +/// In-order contiguous assembly is sufficient and simplest: a chunk is accepted only if its +/// `offset` equals the bytes received so far. Parallel / multi-peer fetches are allowed (spec +/// ODD-3) but the client must reorder chunks by `offset` before feeding them here. The assembled +/// blob is verified against the on-chain `StateKey` by M1.3 — this type does no crypto, only +/// transport-level reassembly. +#[derive(Debug)] +pub struct ChunkAssembler { + total_len: u64, + // Grown incrementally rather than pre-allocated to `total_len`, so a malicious peer + // advertising a huge `total_len` cannot force a large up-front allocation. + buf: Vec, +} + +impl ChunkAssembler { + /// Start assembling a blob of `total_len` bytes (learned from the first response). 
+ pub fn new(total_len: u64) -> Self { + Self { total_len, buf: Vec::new() } + } + + /// The offset the next chunk must start at (the number of bytes received so far). Use this to + /// drive the next [`LedgerSyncRequest`] and to support resume after interruption. + pub fn next_offset(&self) -> u64 { + self.buf.len() as u64 + } + + /// Accept the next contiguous chunk. Returns an error (and leaves the assembler unchanged) if + /// the chunk is out of order or would overflow `total_len`. + pub fn accept(&mut self, offset: u64, bytes: &[u8]) -> Result<(), AssembleError> { + let expected = self.buf.len() as u64; + if offset != expected { + return Err(AssembleError::NonContiguous { expected, got: offset }); + } + if offset + bytes.len() as u64 > self.total_len { + return Err(AssembleError::Overflow { + total: self.total_len, + offset, + len: bytes.len() as u64, + }); + } + self.buf.extend_from_slice(bytes); + Ok(()) + } + + /// Whether all `total_len` bytes have been received. + pub fn is_complete(&self) -> bool { + self.buf.len() as u64 == self.total_len + } + + /// Consume the assembler and return the full blob, or [`AssembleError::Incomplete`] if bytes + /// are still missing. 
+ pub fn into_blob(self) -> Result, AssembleError> { + if !self.is_complete() { + return Err(AssembleError::Incomplete { + have: self.buf.len() as u64, + total: self.total_len, + }); + } + Ok(self.buf) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use sp_core::H256; + + #[test] + fn request_response_scale_roundtrip() { + let req = + LedgerSyncRequest { target_hash: H256::repeat_byte(0xab), offset: 4096, max_len: 1234 }; + let decoded = LedgerSyncRequest::::decode(&mut &req.encode()[..]).unwrap(); + assert_eq!(req, decoded); + + let resp = + LedgerSyncResponse { total_len: 9_999, offset: 4096, bytes: vec![1, 2, 3, 4, 5] }; + let decoded = LedgerSyncResponse::decode(&mut &resp.encode()[..]).unwrap(); + assert_eq!(resp, decoded); + } + + #[test] + fn protocol_name_with_and_without_fork() { + let genesis = H256::repeat_byte(0x01); + let hex = hex::encode(genesis.as_ref()); + assert_eq!( + ledger_sync_protocol_name(genesis, None), + format!("/{hex}/midnight-ledger-sync/1") + ); + assert_eq!( + ledger_sync_protocol_name(genesis, Some("forkz")), + format!("/{hex}/forkz/midnight-ledger-sync/1") + ); + } + + #[test] + fn clamp_respects_limit() { + assert_eq!(clamp_max_len(10), 10); + assert_eq!(clamp_max_len(MAX_LEDGER_SYNC_CHUNK + 1), MAX_LEDGER_SYNC_CHUNK); + assert_eq!(clamp_max_len(u32::MAX), MAX_LEDGER_SYNC_CHUNK); + } + + #[test] + fn build_response_clamps_and_bounds() { + let blob: Vec = (0..=255u8).cycle().take(5000).collect(); + + // A normal interior range. + let r = build_response(&blob, 1000, 500); + assert_eq!(r.total_len, 5000); + assert_eq!(r.offset, 1000); + assert_eq!(r.bytes, &blob[1000..1500]); + + // max_len past the end is truncated to the tail. + let r = build_response(&blob, 4800, 1000); + assert_eq!(r.bytes, &blob[4800..5000]); + + // offset at/past the end yields an empty chunk (completion signal). 
+ let r = build_response(&blob, 5000, 100); + assert!(r.bytes.is_empty()); + assert_eq!(r.total_len, 5000); + let r = build_response(&blob, 9999, 100); + assert!(r.bytes.is_empty()); + } + + #[test] + fn assembler_reassembles_byte_identical() { + let blob: Vec = (0..=255u8).cycle().take(5000).collect(); + + // Page the blob the way the client would: repeated build_response calls following + // `next_offset`, fed into the assembler in order. + let mut asm = ChunkAssembler::new(blob.len() as u64); + loop { + let chunk = build_response(&blob, asm.next_offset(), 700); + if chunk.bytes.is_empty() { + break; + } + asm.accept(chunk.offset, &chunk.bytes).unwrap(); + } + assert!(asm.is_complete()); + assert_eq!(asm.into_blob().unwrap(), blob); + } + + #[test] + fn assembler_rejects_non_contiguous() { + let mut asm = ChunkAssembler::new(100); + asm.accept(0, &[0u8; 10]).unwrap(); + // Gap: next expected offset is 10, not 20. + assert_eq!( + asm.accept(20, &[0u8; 10]), + Err(AssembleError::NonContiguous { expected: 10, got: 20 }) + ); + // State unchanged after a rejected chunk. + assert_eq!(asm.next_offset(), 10); + } + + #[test] + fn assembler_rejects_overflow_and_incomplete() { + let mut asm = ChunkAssembler::new(16); + assert_eq!( + asm.accept(0, &[0u8; 32]), + Err(AssembleError::Overflow { total: 16, offset: 0, len: 32 }) + ); + asm.accept(0, &[0u8; 8]).unwrap(); + assert_eq!(asm.into_blob(), Err(AssembleError::Incomplete { have: 8, total: 16 })); + } +} diff --git a/node/src/warp_ledger_sync/server.rs b/node/src/warp_ledger_sync/server.rs new file mode 100644 index 000000000..7626cecab --- /dev/null +++ b/node/src/warp_ledger_sync/server.rs @@ -0,0 +1,170 @@ +// This file is part of midnight-node. +// Copyright (C) Midnight Foundation +// SPDX-License-Identifier: Apache-2.0 +// Licensed under the Apache License, Version 2.0 (the "License"); +// You may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! M1.2 — Ledger-sync server handler. +//! +//! Answers [`LedgerSyncRequest`]s from warp-syncing peers by serializing this (fully synced) node's +//! `Ledger`-rooted arena snapshot at the requested finalized block and serving the requested byte +//! range. Patterned on substrate's `state_request_handler.rs`. +//! +//! Verification is the *client's* job (M1.3): the server is untrusted, so it performs no crypto — +//! it only serves bytes whose recomputed root the client checks against the on-chain `StateKey`. + +use std::{marker::PhantomData, sync::Arc, time::Duration}; + +use futures::StreamExt; +use parity_scale_codec::{Decode, Encode}; +use sc_client_api::{Backend, StorageProvider}; +use sc_network::{ + MAX_RESPONSE_SIZE, NetworkBackend, + request_responses::{IncomingRequest, OutgoingResponse}, +}; +use sp_blockchain::HeaderBackend; +use sp_runtime::traits::{Block as BlockT, Header as HeaderT}; + +use super::{ + LOG_TARGET, + protocol::{LedgerSyncRequest, build_response, ledger_sync_protocol_name}, + read_state_key, +}; + +/// Max bytes a peer may put in a single request (the request is tiny: a hash + two integers). +const MAX_REQUEST_SIZE: u64 = 1024; +/// Request timeout, matching substrate's state protocol. +const REQUEST_TIMEOUT: Duration = Duration::from_secs(40); + +/// Handler for incoming ledger-sync requests from warp-syncing peers. 
+/// +/// Memoizes the serialized blob for the most-recently-served target block so that the many +/// byte-range requests a single client makes while paging the blob do not each re-serialize the +/// (multi-million node) arena. +pub struct LedgerSyncRequestHandler { + client: Arc, + /// Whether the ledger arena uses the unified ParityDb layout (selects the DB instantiation the + /// serializer dispatches to — see [`midnight_node_ledger::serialize_ledger_snapshot`]). + unified: bool, + request_receiver: async_channel::Receiver, + /// `(target_block, serialized blob)` memo for the last block served. + cache: Option<(B::Hash, Arc>)>, + _phantom: PhantomData, +} + +impl LedgerSyncRequestHandler +where + B: BlockT, + BE: Backend + 'static, + Client: HeaderBackend + StorageProvider + Send + Sync + 'static, +{ + /// Build the handler and the protocol config to register on `net_config` before + /// `build_network`. Spawn [`run`](Self::run) as a task. + pub fn new::Hash>>( + genesis_hash: B::Hash, + fork_id: Option<&str>, + client: Arc, + unified: bool, + num_peer_hint: usize, + ) -> (Self, N::RequestResponseProtocolConfig) { + // Reserve one in-flight request slot per peer. + let capacity = std::cmp::max(num_peer_hint, 1); + let (tx, request_receiver) = async_channel::bounded(capacity); + + let config = N::request_response_config( + ledger_sync_protocol_name(genesis_hash, fork_id).into(), + Vec::new(), + MAX_REQUEST_SIZE, + MAX_RESPONSE_SIZE, + REQUEST_TIMEOUT, + Some(tx), + ); + + (Self { client, unified, request_receiver, cache: None, _phantom: PhantomData }, config) + } + + /// Run the request-handling loop until the inbound queue closes. 
+ pub async fn run(mut self) { + while let Some(IncomingRequest { peer, payload, pending_response }) = + self.request_receiver.next().await + { + let result = match self.handle_request(&payload) { + Ok(bytes) => Ok(bytes), + Err(e) => { + log::debug!(target: LOG_TARGET, "ledger-sync request from {peer} failed: {e}"); + Err(()) + }, + }; + // A failed send just means the peer disconnected; nothing to do. + let _ = pending_response.send(OutgoingResponse { + result, + reputation_changes: Vec::new(), + sent_feedback: None, + }); + } + } + + fn handle_request(&mut self, payload: &[u8]) -> Result, HandleError> { + let req = LedgerSyncRequest::::decode(&mut &payload[..])?; + let blob = self.blob_for(req.target_hash)?; + Ok(build_response(&blob, req.offset, req.max_len).encode()) + } + + /// Return the serialized `Ledger`-rooted blob for `target`, building and memoizing it on a + /// cache miss. Rejects unknown or not-yet-finalized blocks. + fn blob_for(&mut self, target: B::Hash) -> Result>, HandleError> { + if let Some((cached, blob)) = &self.cache { + if *cached == target { + return Ok(blob.clone()); + } + } + + // Only serve finalized blocks whose state we hold: an unknown hash or a block beyond our + // finalized number is rejected (the warp target is always finalized — spec §7). + let header = self.client.header(target)?.ok_or(HandleError::UnknownBlock)?; + if *header.number() > self.client.info().finalized_number { + return Err(HandleError::NotFinalized); + } + + // Read the raw `pallet_midnight::StateKey` at the target block. + let state_key = read_state_key::(&self.client, target)? 
+ .ok_or(HandleError::NoStateKey)?; + + let blob = midnight_node_ledger::serialize_ledger_snapshot(self.unified, &state_key) + .map_err(|e| HandleError::Serialize(format!("{e:?}")))?; + log::debug!( + target: LOG_TARGET, + "Serialized ledger snapshot for {target:?}: {} bytes", + blob.len() + ); + + let blob = Arc::new(blob); + self.cache = Some((target, blob.clone())); + Ok(blob) + } +} + +#[derive(Debug, thiserror::Error)] +enum HandleError { + #[error("failed to decode request / state key: {0}")] + Decode(#[from] parity_scale_codec::Error), + #[error("blockchain error: {0}")] + Client(#[from] sp_blockchain::Error), + #[error("requested block is unknown")] + UnknownBlock, + #[error("requested block is not finalized")] + NotFinalized, + #[error("pallet StateKey not present at requested block")] + NoStateKey, + #[error("failed to serialize ledger snapshot: {0}")] + Serialize(String), +} From 6b8f16a79ba66be70d290cc39ab33faab33d18e9 Mon Sep 17 00:00:00 2001 From: Justin Frevert Date: Fri, 5 Jun 2026 15:11:50 -0700 Subject: [PATCH 02/13] deadlock fix Signed-off-by: Justin Frevert --- node/src/warp_ledger_sync/block_import.rs | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/node/src/warp_ledger_sync/block_import.rs b/node/src/warp_ledger_sync/block_import.rs index 36876b134..1177638b1 100644 --- a/node/src/warp_ledger_sync/block_import.rs +++ b/node/src/warp_ledger_sync/block_import.rs @@ -65,7 +65,14 @@ where } async fn import_block(&self, block: BlockImportParams) -> Result { - if self.gate.ledger_recovery_in_progress() { + // `with_state()` is true only for the state-sync target block, whose state is *imported* + // (`StateAction::ApplyChanges(StorageChanges::Import)`) — no runtime execution, so no arena + // access. 
That import MUST be allowed even while recovery is pending: state sync has to + // complete before the monitor can recover the arena (gating it would deadlock — + // state-sync waits on import, import waits on `ledger_verified`, which waits on recovery, + // which waits on state-sync). Only blocks that *execute* against the arena (post-warp + // blocks N+1…, `with_state() == false`) are held until recovery is verified. + if !block.with_state() && self.gate.ledger_recovery_in_progress() { log::debug!( target: LOG_TARGET, "Holding block import until the warp-recovered ledger arena is verified" From 007228446671a89b940c5f9b85638e665e2ac967 Mon Sep 17 00:00:00 2001 From: Justin Frevert Date: Sun, 7 Jun 2026 16:44:10 -0700 Subject: [PATCH 03/13] Fix GatedBlockImport behavior Signed-off-by: Justin Frevert --- node/src/warp_ledger_sync/block_import.rs | 62 ++++++++++++----------- 1 file changed, 33 insertions(+), 29 deletions(-) diff --git a/node/src/warp_ledger_sync/block_import.rs b/node/src/warp_ledger_sync/block_import.rs index 1177638b1..77ba0c26a 100644 --- a/node/src/warp_ledger_sync/block_import.rs +++ b/node/src/warp_ledger_sync/block_import.rs @@ -22,24 +22,32 @@ //! arena — hitting `NoLedgerState`, and worse, racing the recovery writer (the arena is //! single-writer; see `warp-ledger-sync-m1.4a-spike.md`). //! -//! [`GatedBlockImport`] wraps the import queue's block import and **holds** `import_block` while -//! [`RecoveryGate::ledger_recovery_in_progress`] is true, so no block executes against the arena -//! until recovery is verified. On a full sync the gate is never armed, so this is a pure -//! passthrough. Recovery does not depend on block import (it fetches over a side protocol from -//! already-connected peers), so holding the import worker cannot deadlock recovery. - -use std::{sync::Arc, time::Duration}; +//! [`GatedBlockImport`] wraps the import queue's block import and, while +//! 
[`RecoveryGate::ledger_recovery_in_progress`] is true, **rejects** the import of blocks that +//! would execute against the arena by returning [`ImportResult::MissingState`] (not an `Err`), so +//! normal sync re-requests them once recovery completes. +//! +//! Two things are critical and were learned the hard way: +//! 1. **Reject, don't block.** The import queue has a single worker; an `import_block` that *awaits* +//! until recovery would occupy that worker and starve the state-sync target-block import below, +//! deadlocking warp (state sync waits on the worker ← held by the gated block ← waits on +//! recovery ← waits on state sync). Returning `MissingState` frees the worker immediately. +//! 2. **Never gate the state-sync target block.** `with_state()` is true only for that block (its +//! state is *imported*, not executed — no arena access), and state sync must import it *before* +//! the monitor can recover the arena. Gating it would deadlock for the same reason. +//! +//! On a full sync the gate is never armed, so this is a pure passthrough. use sc_consensus::{BlockCheckParams, BlockImport, BlockImportParams, ImportResult}; +use sp_consensus::Error as ConsensusError; use sp_runtime::traits::Block as BlockT; -use super::{LOG_TARGET, oracle::RecoveryGate}; +use std::sync::Arc; -/// How often to re-check the gate while holding a block import. -const POLL_INTERVAL: Duration = Duration::from_millis(200); +use super::oracle::RecoveryGate; -/// Wraps an inner [`BlockImport`], deferring `import_block` until the warp-recovered ledger arena is -/// verified (see module docs). +/// Wraps an inner [`BlockImport`], rejecting `import_block` for arena-executing blocks until the +/// warp-recovered ledger arena is verified (see module docs).
#[derive(Clone)] pub struct GatedBlockImport { inner: Inner, @@ -56,31 +64,27 @@ impl GatedBlockImport { impl BlockImport for GatedBlockImport where B: BlockT, - Inner: BlockImport + Send + Sync, + Inner: BlockImport + Send + Sync, { - type Error = Inner::Error; + type Error = ConsensusError; async fn check_block(&self, block: BlockCheckParams) -> Result { self.inner.check_block(block).await } async fn import_block(&self, block: BlockImportParams) -> Result { - // `with_state()` is true only for the state-sync target block, whose state is *imported* - // (`StateAction::ApplyChanges(StorageChanges::Import)`) — no runtime execution, so no arena - // access. That import MUST be allowed even while recovery is pending: state sync has to - // complete before the monitor can recover the arena (gating it would deadlock — - // state-sync waits on import, import waits on `ledger_verified`, which waits on recovery, - // which waits on state-sync). Only blocks that *execute* against the arena (post-warp - // blocks N+1…, `with_state() == false`) are held until recovery is verified. + // Defer only execution-bearing blocks (post-warp blocks). The state-sync target block carries + // imported state (`with_state()` true, no runtime execution) and must always be let through — + // recovery can't even start until state sync imports it. if !block.with_state() && self.gate.ledger_recovery_in_progress() { - log::debug!( - target: LOG_TARGET, - "Holding block import until the warp-recovered ledger arena is verified" - ); - while self.gate.ledger_recovery_in_progress() { - tokio::time::sleep(POLL_INTERVAL).await; - } - log::debug!(target: LOG_TARGET, "Ledger arena verified; resuming block import"); + // Return MissingState (not an Err): the ledger arena this block needs to execute isn't + // recovered yet. 
substrate treats MissingState as "obsolete, not bad" — it does NOT drop + // the peer and does NOT restart sync, and the block is re-requested by normal sync once + // recovery completes. Returning an Err instead maps to `BlockImportError::Other`, which + // triggers `chain_sync.restart()` on every deferred block (a restart-storm that churns + // peers); awaiting would instead block the single import-queue worker and deadlock the + // state-sync target import. MissingState avoids both. + return Ok(ImportResult::MissingState); } self.inner.import_block(block).await } From 16dc9483bd57eabe21c3842450aa6b547ad522e5 Mon Sep 17 00:00:00 2001 From: Justin Frevert Date: Tue, 9 Jun 2026 21:51:29 -0700 Subject: [PATCH 04/13] Remove spec guide references Signed-off-by: Justin Frevert --- ledger/src/lib.rs | 9 ++++----- ledger/src/versions/common/mod.rs | 11 +++++------ node/src/service.rs | 6 +++--- node/src/warp_ledger_sync/block_import.rs | 2 +- node/src/warp_ledger_sync/client.rs | 8 ++++---- node/src/warp_ledger_sync/integration_tests.rs | 14 +++++++------- node/src/warp_ledger_sync/mod.rs | 4 ++-- node/src/warp_ledger_sync/monitor.rs | 2 +- node/src/warp_ledger_sync/oracle.rs | 2 +- node/src/warp_ledger_sync/protocol.rs | 16 ++++++++-------- node/src/warp_ledger_sync/server.rs | 4 ++-- 11 files changed, 38 insertions(+), 40 deletions(-) diff --git a/ledger/src/lib.rs b/ledger/src/lib.rs index be2acb455..2cac6d57b 100644 --- a/ledger/src/lib.rs +++ b/ledger/src/lib.rs @@ -138,7 +138,7 @@ pub fn drop_all_default_storage() { #[cfg(feature = "std")] /// Serialize the ledger arena snapshot at `state_key` into the canonical, `Ledger`-rooted warp -/// transfer blob (trustless warp ledger-sync, M1.2 server side). +/// transfer blob (trustless warp ledger-sync, server side). 
/// /// `unified` selects the ParityDb instantiation, matching the operator's `storage_separation` /// config: the two modes register `default_storage` under different `D` type ids (separate = column @@ -169,7 +169,7 @@ pub fn serialize_ledger_snapshot( } /// Failure modes of [`import_verified_ledger_snapshot`]. All are non-fatal to the chain: the caller -/// discards the data, reports the peer, and retries from another (warp spec M4.1). +/// discards the data, reports the peer, and retries from another. #[cfg(feature = "std")] #[derive(Debug)] pub enum SnapshotImportError { @@ -207,12 +207,11 @@ impl std::error::Error for SnapshotImportError {} #[cfg(feature = "std")] /// Verify a `Ledger`-rooted warp snapshot `blob` against the on-chain `expected_state_key` and, on /// success, persist it into the already-open arena backend so `get_lazy(StateKey)` resolves (warp -/// ledger-sync M1.3 verification + M1.4 import). `unified` selects the DB instantiation, as in +/// ledger-sync verification + import). `unified` selects the DB instantiation, as in /// [`serialize_ledger_snapshot`]. Uses the latest ledger version (`ledger_9`) — same near-tip /// assumption noted there. /// -/// The caller must hold the authoring/import gate (the arena is single-writer) — see -/// `warp-ledger-sync-m1.4a-spike.md`. +/// The caller must hold the authoring/import gate (the arena is single-writer). pub fn import_verified_ledger_snapshot( unified: bool, blob: &[u8], diff --git a/ledger/src/versions/common/mod.rs b/ledger/src/versions/common/mod.rs index cae6b1f64..131ad6896 100644 --- a/ledger/src/versions/common/mod.rs +++ b/ledger/src/versions/common/mod.rs @@ -737,7 +737,7 @@ where /// at `Ledger` (the `Sp` from `get_ledger` is an `Sp`) rather than `LedgerState` — see /// `warp-ledger-sync-spec.md` ODD-1. 
Because the blob is rooted at `Ledger`, its recomputed /// content-address root key equals the on-chain `pallet_midnight::StateKey`, which is exactly - /// what the client verifies against (M1.3). The tag prefix is **derived** + /// what the client verifies against. The tag prefix is **derived** /// (`GLOBAL_TAG ‖ ::tag()`), never hardcoded (spec §8 format-lockstep). pub fn serialize_ledger_snapshot(state_key: &[u8]) -> Result, LedgerApiError> { use ledger_storage_local::arena::TopoSortedNodes; @@ -771,11 +771,10 @@ where /// malicious or faulty peer can at worst cause a rejected import (→ peer report + retry by the /// caller), never state corruption. /// - /// Persists + flushes into the live `default_storage` so `get_lazy(StateKey)` resolves. The - /// lifecycle (in-process, no restart, same `alloc`/`persist`/`flush` path live block execution - /// uses) was validated in `warp-ledger-sync-m1.4a-spike.md`. The caller (warp client driver, - /// M1.3) MUST hold the authoring/import gate so no block executes against the arena concurrently - /// — the arena is single-writer. + /// Persists + flushes into the live `default_storage` so `get_lazy(StateKey)` resolves — + /// in-process, no restart, via the same `alloc`/`persist`/`flush` path live block execution + /// uses. The caller (warp client driver) MUST hold the authoring/import gate so no block + /// executes against the arena concurrently — the arena is single-writer. pub fn import_verified_ledger_snapshot( blob: &[u8], expected_state_key: &[u8], diff --git a/node/src/service.rs b/node/src/service.rs index f989627b9..978f99d26 100644 --- a/node/src/service.rs +++ b/node/src/service.rs @@ -565,7 +565,7 @@ pub async fn new_full Result<(), ClientError> { let state_key = read_state_key::(&self.client, target)? 
.ok_or(ClientError::NoStateKey)?; @@ -92,7 +92,7 @@ where // Verification happens inside the importer (root must equal `state_key`); a verify // failure means the peer served bad data — discard and try the next one. - // M4.1: a reputation report on the peer belongs here. + // A reputation report on the peer belongs here (deferred). match midnight_node_ledger::import_verified_ledger_snapshot( self.unified, &blob, diff --git a/node/src/warp_ledger_sync/integration_tests.rs b/node/src/warp_ledger_sync/integration_tests.rs index e1ed88ccb..2ccbda2de 100644 --- a/node/src/warp_ledger_sync/integration_tests.rs +++ b/node/src/warp_ledger_sync/integration_tests.rs @@ -13,10 +13,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -//! In-process round-trip test for the warp ledger-sync core (M1), with no networking: -//! init a real arena from genesis → serialize the `Ledger`-rooted snapshot (server, M1.2) → page it -//! through the transport chunker + reassembler (M1.1) → import + verify against the on-chain -//! `StateKey` (client/import, M1.3/M1.4). Also asserts the security property: a tampered blob is +//! In-process round-trip test for the warp ledger-sync core, with no networking: +//! init a real arena from genesis → serialize the `Ledger`-rooted snapshot (server) → page it +//! through the transport chunker + reassembler → import + verify against the on-chain +//! `StateKey` (client/import). Also asserts the security property: a tampered blob is //! rejected (`RootMismatch`), never imported. //! //! Run isolated (it touches the process-global `default_storage` singleton): @@ -55,16 +55,16 @@ fn ledger_snapshot_roundtrip_serialize_chunk_verify_import() { ); assert!(!state_key.is_empty(), "genesis init must produce a StateKey"); - // Server side (M1.2): serialize the `Ledger`-rooted snapshot at that StateKey. + // Server side: serialize the `Ledger`-rooted snapshot at that StateKey. 
let blob = midnight_node_ledger::serialize_ledger_snapshot(false, &state_key) .expect("serialize ledger snapshot"); assert!(blob.len() > state_key.len(), "snapshot blob should carry the arena, not just the key"); - // Transport (M1.1): page into 4 KiB ranges and reassemble; must be byte-identical. + // Transport: page into 4 KiB ranges and reassemble; must be byte-identical. let reassembled = page_and_reassemble(&blob, 4096); assert_eq!(reassembled, blob, "reassembled blob must be byte-identical to the server's"); - // Client/import (M1.3 + M1.4): verify root == StateKey and persist. Idempotent against the + // Client/import: verify root == StateKey and persist. Idempotent against the // already-initialized arena (content-addressed; genesis nodes dedup). midnight_node_ledger::import_verified_ledger_snapshot(false, &reassembled, &state_key) .expect("verified import of a faithful snapshot should succeed"); diff --git a/node/src/warp_ledger_sync/mod.rs b/node/src/warp_ledger_sync/mod.rs index 94ac6e7c3..bc610de2d 100644 --- a/node/src/warp_ledger_sync/mod.rs +++ b/node/src/warp_ledger_sync/mod.rs @@ -22,7 +22,7 @@ //! side request/response protocol that recovers the arena after warp+state-sync completes, with //! full cryptographic verification against the `StateKey` the trie already recovered. //! -//! See `warp-ledger-sync-spec.md` for the full design. Module map: +//! Module map: //! - [`protocol`] — wire message types, codec, protocol naming, range serving + reassembly. //! - [`server`] — serves the `Ledger`-rooted arena blob at a finalized target block as byte ranges. //! - [`client`] — fetches the blob from peers and hands it to the ledger crate for verification + @@ -30,7 +30,7 @@ //! `midnight_node_ledger::import_verified_ledger_snapshot`, which reuses the arena's **native** //! multi-pass deserializer (`Arena::deserialize_sp`) for untrusted input rather than a bespoke //! 
re-hash, then asserts the recomputed root equals `StateKey` before persisting in-process -//! (`alloc`/`persist`/`flush`, no restart — see `warp-ledger-sync-m1.4a-spike.md`). +//! (`alloc`/`persist`/`flush`, no restart). //! - [`monitor`] — detects warp completion, captures the target block, drives [`client`], releases //! the gate. [`oracle`] keeps AURA from authoring until recovery is verified. diff --git a/node/src/warp_ledger_sync/monitor.rs b/node/src/warp_ledger_sync/monitor.rs index e1628b0e3..2dd3d6e5f 100644 --- a/node/src/warp_ledger_sync/monitor.rs +++ b/node/src/warp_ledger_sync/monitor.rs @@ -13,7 +13,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -//! M2.1 — Warp-completion monitor & target capture. +//! Warp-completion monitor & target capture. //! //! Spawned for the lifetime of the node. On a **full sync** the warp path is never observed and the //! task exits without ever arming the gate. On a **warp sync** it arms the gate, waits for warp + diff --git a/node/src/warp_ledger_sync/oracle.rs b/node/src/warp_ledger_sync/oracle.rs index 2e638373e..70c112b9c 100644 --- a/node/src/warp_ledger_sync/oracle.rs +++ b/node/src/warp_ledger_sync/oracle.rs @@ -13,7 +13,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -//! M2.2 — recovery gate + wrapping [`SyncOracle`]. +//! Recovery gate + wrapping [`SyncOracle`]. //! //! The shared [`RecoveryGate`] is the single source of truth for "is the warp-recovered ledger //! arena ready yet". It gates two things until recovery is verified: diff --git a/node/src/warp_ledger_sync/protocol.rs b/node/src/warp_ledger_sync/protocol.rs index adfb157d0..52893b077 100644 --- a/node/src/warp_ledger_sync/protocol.rs +++ b/node/src/warp_ledger_sync/protocol.rs @@ -13,8 +13,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -//! 
M1.1 — Ledger-sync protocol message types, codec, naming, and the pure range-serving / -//! reassembly logic shared by the server (M1.2) and client (M1.3). +//! Ledger-sync protocol message types, codec, naming, and the pure range-serving / +//! reassembly logic shared by the server and client. //! //! The transferred payload is the canonical, `Ledger`-rooted arena blob (derived tag prefix ‖ //! `TopoSortedNodes` of the `Ledger` DAG — see spec ODD-1). Transport pages it by **byte offset** @@ -34,8 +34,8 @@ pub const MAX_LEDGER_SYNC_CHUNK: u32 = 1024 * 1024; /// Request a contiguous byte range of the `Ledger`-rooted arena blob at `target_hash`. /// -/// `target_hash` must be a finalized block whose state-sync target the server can serve (M1.2 -/// rejects non-finalized / unknown blocks). `offset`/`max_len` page the blob; `max_len` is clamped +/// `target_hash` must be a finalized block whose state-sync target the server can serve (the +/// server rejects non-finalized / unknown blocks). `offset`/`max_len` page the blob; `max_len` is clamped /// server-side to [`MAX_LEDGER_SYNC_CHUNK`]. #[derive(Debug, Clone, PartialEq, Eq, Encode, Decode)] pub struct LedgerSyncRequest { @@ -79,7 +79,7 @@ pub fn clamp_max_len(requested: u32) -> u32 { requested.min(MAX_LEDGER_SYNC_CHUNK) } -/// Build a response chunk for `[offset, offset + clamp(max_len))` of `blob` (server side, M1.2). +/// Build a response chunk for `[offset, offset + clamp(max_len))` of `blob` (server side). /// /// Clamps `max_len`, never reads past the end of the blob, and yields an empty chunk if `offset` /// is at or past the end (which signals completion to the client). @@ -93,7 +93,7 @@ pub fn build_response(blob: &[u8], offset: u64, max_len: u32) -> LedgerSyncRespo LedgerSyncResponse { total_len, offset, bytes: blob[start..end].to_vec() } } -/// Errors from reassembling response chunks into the full blob (client side, M1.3). +/// Errors from reassembling response chunks into the full blob (client side). 
#[derive(Debug, Clone, PartialEq, Eq, thiserror::Error)] pub enum AssembleError { /// A chunk did not start where the previous one ended. Chunks must be fed in order; parallel @@ -130,8 +130,8 @@ pub enum AssembleError { /// In-order contiguous assembly is sufficient and simplest: a chunk is accepted only if its /// `offset` equals the bytes received so far. Parallel / multi-peer fetches are allowed (spec /// ODD-3) but the client must reorder chunks by `offset` before feeding them here. The assembled -/// blob is verified against the on-chain `StateKey` by M1.3 — this type does no crypto, only -/// transport-level reassembly. +/// blob is verified against the on-chain `StateKey` by the client driver — this type does no +/// crypto, only transport-level reassembly. #[derive(Debug)] pub struct ChunkAssembler { total_len: u64, diff --git a/node/src/warp_ledger_sync/server.rs b/node/src/warp_ledger_sync/server.rs index 7626cecab..d4026b5ee 100644 --- a/node/src/warp_ledger_sync/server.rs +++ b/node/src/warp_ledger_sync/server.rs @@ -13,13 +13,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -//! M1.2 — Ledger-sync server handler. +//! Ledger-sync server handler. //! //! Answers [`LedgerSyncRequest`]s from warp-syncing peers by serializing this (fully synced) node's //! `Ledger`-rooted arena snapshot at the requested finalized block and serving the requested byte //! range. Patterned on substrate's `state_request_handler.rs`. //! -//! Verification is the *client's* job (M1.3): the server is untrusted, so it performs no crypto — +//! Verification is the *client's* job: the server is untrusted, so it performs no crypto — //! it only serves bytes whose recomputed root the client checks against the on-chain `StateKey`. 
use std::{marker::PhantomData, sync::Arc, time::Duration}; From da5e3246f21b679378291a123950978767fd59e7 Mon Sep 17 00:00:00 2001 From: Justin Frevert Date: Sun, 7 Jun 2026 20:43:59 -0700 Subject: [PATCH 05/13] feat(warp-sync): dispatch arena serialize/import/genesis-init by ledger-state version Warp serialize (server), import (client), and genesis-arena-init hardcoded ledger_9 (LedgerState v16). A fresh node syncing onto a network governed by an older ledger version (e.g. a real devnet whose genesis+tip arena is v13/ledger_8) then panics: genesis-init can't deserialize the v13 genesis_state, and serve/recover would mismatch. Parse the 'ledger-state[vNN]' tag from the StateKey / genesis_state and dispatch to the matching compiled-in module (v5->ledger_7, v13->ledger_8, v16->ledger_9). Resolves the deferred per-version dispatch (review finding #5). Assisted-by: Claude:claude-opus-4-8 Signed-off-by: Justin Frevert (cherry picked from commit 85db77673ae5ac14de295072de8aa042f8c0695e) --- ledger/src/lib.rs | 127 +++++++++++++++++---------- node/src/backend/custom_parity_db.rs | 65 ++++++++++++-- node/src/warp_ledger_sync/server.rs | 2 +- 3 files changed, 137 insertions(+), 57 deletions(-) diff --git a/ledger/src/lib.rs b/ledger/src/lib.rs index 2cac6d57b..1f3a96ae4 100644 --- a/ledger/src/lib.rs +++ b/ledger/src/lib.rs @@ -136,35 +136,59 @@ pub fn drop_all_default_storage() { ledger_9::storage::drop_default_storage_if_exists(); } +/// Parse the `vNN` from a `ledger-state[vNN]` tag embedded in a tagged blob (a `StateKey` or a +/// genesis_state). Used to dispatch warp serialize/import (and genesis-init) to the ledger module +/// whose `LedgerState` serialization matches: **v5 → `ledger_7`, v13 → `ledger_8`, v16 → `ledger_9`**. +/// A warp-syncing node can target a chain governed by an *older* ledger version than this build's +/// latest (e.g. a real devnet whose arena is still v13), so the version is read from the data, not +/// assumed to be the tip's. 
#[cfg(feature = "std")] +pub fn ledger_state_tag_version(tagged: &[u8]) -> Option { + const NEEDLE: &[u8] = b"ledger-state[v"; + let start = tagged.windows(NEEDLE.len()).position(|w| w == NEEDLE)? + NEEDLE.len(); + let rest = &tagged[start..]; + let end = rest.iter().position(|&b| b == b']')?; + core::str::from_utf8(&rest[..end]).ok()?.parse().ok() +} + +/// Expand to the `(DbSeparate, DbUnified)`-parameterized call of a `Bridge` arena method on the given +/// ledger version module (`ledger_7`/`ledger_8`/`ledger_9`), picking the DB instantiation by `unified`. +#[cfg(feature = "std")] +macro_rules! bridge_arena_call { + ($ver:ident, $unified:expr, $method:ident ( $($arg:expr),* )) => {{ + type DbSeparate = $ver::ledger_storage_local::db::ParityDb; + type DbUnified = $ver::ledger_storage_local::db::ParityDb< + sha2::Sha256, + $ver::ledger_storage_local::db::paritydb::OwnedDb, + { midnight_primitives_ledger::LedgerStorageExt::COLUMN_OFFSET }, + >; + if $unified { + $ver::Bridge::<$ver::TransactionSignature, DbUnified>::$method( $($arg),* ) + } else { + $ver::Bridge::<$ver::TransactionSignature, DbSeparate>::$method( $($arg),* ) + } + }}; +} + /// Serialize the ledger arena snapshot at `state_key` into the canonical, `Ledger`-rooted warp -/// transfer blob (trustless warp ledger-sync, server side). -/// -/// `unified` selects the ParityDb instantiation, matching the operator's `storage_separation` -/// config: the two modes register `default_storage` under different `D` type ids (separate = column -/// offset 0; unified = offset `NUM_COLUMNS_POLKADOT`, sharing substrate's parity-db). The blob bytes -/// are identical across modes. +/// transfer blob (trustless warp ledger-sync, server side). `unified` selects the ParityDb +/// instantiation (separate = column offset 0; unified = offset `NUM_COLUMNS_POLKADOT`); the blob is +/// identical across modes. 
/// -/// Uses the latest ledger version (`ledger_9`): warp-sync targets are near the chain tip, where the -/// active ledger version is the latest. (Assumption — deferred: a node warp-syncing to a block -/// governed by an *older* ledger version would need per-version dispatch here; not reachable today -/// since warp always targets the tip.) -pub fn serialize_ledger_snapshot( - unified: bool, - state_key: &[u8], -) -> Result, ledger_9::api::LedgerApiError> { - type Sig = ledger_9::TransactionSignature; - type DbSeparate = ledger_9::ledger_storage_local::db::ParityDb; - type DbUnified = ledger_9::ledger_storage_local::db::ParityDb< - sha2::Sha256, - ledger_9::ledger_storage_local::db::paritydb::OwnedDb, - { midnight_primitives_ledger::LedgerStorageExt::COLUMN_OFFSET }, - >; - - if unified { - ledger_9::Bridge::::serialize_ledger_snapshot(state_key) - } else { - ledger_9::Bridge::::serialize_ledger_snapshot(state_key) +/// Dispatches to the ledger module matching the `StateKey`'s `ledger-state[vNN]` tag (see +/// [`ledger_state_tag_version`]) — so a warp node can serve an arena governed by an older ledger +/// version than this build's latest. Error rendered to `String` (the underlying `LedgerApiError` is +/// version-specific). 
+#[cfg(feature = "std")] +pub fn serialize_ledger_snapshot(unified: bool, state_key: &[u8]) -> Result, String> { + match ledger_state_tag_version(state_key) { + Some(16) => bridge_arena_call!(ledger_9, unified, serialize_ledger_snapshot(state_key)) + .map_err(|e| format!("{e:?}")), + Some(13) => bridge_arena_call!(ledger_8, unified, serialize_ledger_snapshot(state_key)) + .map_err(|e| format!("{e:?}")), + Some(5) => bridge_arena_call!(ledger_7, unified, serialize_ledger_snapshot(state_key)) + .map_err(|e| format!("{e:?}")), + other => Err(format!("unsupported ledger-state version {other:?} in StateKey")), } } @@ -207,9 +231,9 @@ impl std::error::Error for SnapshotImportError {} #[cfg(feature = "std")] /// Verify a `Ledger`-rooted warp snapshot `blob` against the on-chain `expected_state_key` and, on /// success, persist it into the already-open arena backend so `get_lazy(StateKey)` resolves (warp -/// ledger-sync verification + import). `unified` selects the DB instantiation, as in -/// [`serialize_ledger_snapshot`]. Uses the latest ledger version (`ledger_9`) — same near-tip -/// assumption noted there. +/// ledger-sync verification + import). `unified` selects the DB instantiation, and dispatch on the +/// `StateKey`'s `ledger-state[vNN]` tag picks the ledger module, as in +/// [`serialize_ledger_snapshot`]. /// /// The caller must hold the authoring/import gate (the arena is single-writer). 
pub fn import_verified_ledger_snapshot( @@ -217,24 +241,33 @@ pub fn import_verified_ledger_snapshot( blob: &[u8], expected_state_key: &[u8], ) -> Result<(), SnapshotImportError> { - type Sig = ledger_9::TransactionSignature; - type DbSeparate = ledger_9::ledger_storage_local::db::ParityDb; - type DbUnified = ledger_9::ledger_storage_local::db::ParityDb< - sha2::Sha256, - ledger_9::ledger_storage_local::db::paritydb::OwnedDb, - { midnight_primitives_ledger::LedgerStorageExt::COLUMN_OFFSET }, - >; - - if unified { - ledger_9::Bridge::::import_verified_ledger_snapshot( - blob, - expected_state_key, - ) - } else { - ledger_9::Bridge::::import_verified_ledger_snapshot( - blob, - expected_state_key, - ) + // Dispatch on the `StateKey`'s ledger-state version (the underlying method returns the shared + // `SnapshotImportError` for every version, so no error mapping is needed). + match ledger_state_tag_version(expected_state_key) { + Some(16) => { + bridge_arena_call!( + ledger_9, + unified, + import_verified_ledger_snapshot(blob, expected_state_key) + ) + }, + Some(13) => { + bridge_arena_call!( + ledger_8, + unified, + import_verified_ledger_snapshot(blob, expected_state_key) + ) + }, + Some(5) => { + bridge_arena_call!( + ledger_7, + unified, + import_verified_ledger_snapshot(blob, expected_state_key) + ) + }, + other => Err(SnapshotImportError::StateKeyDecode(format!( + "unsupported ledger-state version {other:?} in StateKey" + ))), } } diff --git a/node/src/backend/custom_parity_db.rs b/node/src/backend/custom_parity_db.rs index 5cd5ab729..08a628c1b 100644 --- a/node/src/backend/custom_parity_db.rs +++ b/node/src/backend/custom_parity_db.rs @@ -107,20 +107,67 @@ pub fn open>( let db = Arc::new(parity_db::Db::open_or_create(&config)?); + // Dispatch genesis-arena-init on the genesis_state's `ledger-state[vNN]` tag, not this build's + // latest version. A network genesis'd with an older ledger version (e.g. 
a real devnet whose + // genesis arena is still v13/ledger_8) must be initialised with the matching deserializer, or the + // init panics with a tag-version mismatch. (This is also what lets a fresh node warp-sync onto such + // a network: the genesis arena is set up under the right version, then warp recovery overwrites it + // with the verified target arena.) + let genesis_state = &storage_config.genesis_state; + let genesis_version = midnight_node_ledger::ledger_state_tag_version(genesis_state); match storage_config.separation { StorageSeparation::Separate => { - midnight_node_ledger::ledger_9::storage::init_storage_paritydb_separate( - &storage_config.db_path, - &storage_config.genesis_state, - storage_config.cache_size, - ); + let dir = &storage_config.db_path; + let cache = storage_config.cache_size; + match genesis_version { + Some(16) => { + midnight_node_ledger::ledger_9::storage::init_storage_paritydb_separate( + dir, + genesis_state, + cache, + ); + }, + Some(13) => { + midnight_node_ledger::ledger_8::storage::init_storage_paritydb_separate( + dir, + genesis_state, + cache, + ); + }, + Some(5) => { + midnight_node_ledger::ledger_7::storage::init_storage_paritydb_separate( + dir, + genesis_state, + cache, + ); + }, + other => panic!("unsupported genesis ledger-state version {other:?}"), + } Ok((OwnedDb(db), LedgerStorageDb::SeparateDb(storage_config.db_path.clone()))) }, StorageSeparation::Unified => { - midnight_node_ledger::ledger_9::storage::init_storage_paritydb_unified::< - _, - NUM_COLUMNS_POLKADOT, - >(OwnedDb(db.clone()), &storage_config.genesis_state, storage_config.cache_size); + let cache = storage_config.cache_size; + match genesis_version { + Some(16) => { + midnight_node_ledger::ledger_9::storage::init_storage_paritydb_unified::< + _, + NUM_COLUMNS_POLKADOT, + >(OwnedDb(db.clone()), genesis_state, cache); + }, + Some(13) => { + midnight_node_ledger::ledger_8::storage::init_storage_paritydb_unified::< + _, + NUM_COLUMNS_POLKADOT, + 
>(OwnedDb(db.clone()), genesis_state, cache); + }, + Some(5) => { + midnight_node_ledger::ledger_7::storage::init_storage_paritydb_unified::< + _, + NUM_COLUMNS_POLKADOT, + >(OwnedDb(db.clone()), genesis_state, cache); + }, + other => panic!("unsupported genesis ledger-state version {other:?}"), + } Ok((OwnedDb(db.clone()), LedgerStorageDb::UnifiedDb(db.clone()))) }, } diff --git a/node/src/warp_ledger_sync/server.rs b/node/src/warp_ledger_sync/server.rs index d4026b5ee..530cffbde 100644 --- a/node/src/warp_ledger_sync/server.rs +++ b/node/src/warp_ledger_sync/server.rs @@ -140,7 +140,7 @@ where .ok_or(HandleError::NoStateKey)?; let blob = midnight_node_ledger::serialize_ledger_snapshot(self.unified, &state_key) - .map_err(|e| HandleError::Serialize(format!("{e:?}")))?; + .map_err(HandleError::Serialize)?; log::debug!( target: LOG_TARGET, "Serialized ledger snapshot for {target:?}: {} bytes", From 79fea9a0d2032c9dd89f3a644915b52cc2d91237 Mon Sep 17 00:00:00 2001 From: Justin Frevert Date: Sun, 7 Jun 2026 18:36:52 -0700 Subject: [PATCH 06/13] fix(warp-sync): recover from network-connected peers, not sync peers The recovery monitor sourced peers from SyncingService::peers_info(), which chain_sync.restart() empties on benign post-warp UnknownParent announcements, stalling 1000-scale recovery with "no peers" while libp2p connections were still up. Source candidate peers from the network layer (connected + reserved peers, which survive restarts), falling back to sync peers only if none. 
Assisted-by: Claude:claude-opus-4-8 Signed-off-by: Justin Frevert (cherry picked from commit 930aa4ef249541df370ee9e58c5215cea38a5c4a) --- node/src/warp_ledger_sync/monitor.rs | 47 +++++++++++++++++++++++----- 1 file changed, 40 insertions(+), 7 deletions(-) diff --git a/node/src/warp_ledger_sync/monitor.rs b/node/src/warp_ledger_sync/monitor.rs index 2dd3d6e5f..6288b25c0 100644 --- a/node/src/warp_ledger_sync/monitor.rs +++ b/node/src/warp_ledger_sync/monitor.rs @@ -23,7 +23,7 @@ use std::{sync::Arc, time::Duration}; use sc_client_api::{Backend, StorageProvider}; -use sc_network::{NetworkRequest, ProtocolName}; +use sc_network::{NetworkPeers, NetworkRequest, NetworkStatusProvider, PeerId, ProtocolName}; use sc_network_sync::SyncingService; use sp_blockchain::HeaderBackend; use sp_consensus::SyncOracle; @@ -50,7 +50,7 @@ pub async fn run_recovery_monitor( B: BlockT, BE: Backend + 'static, Client: HeaderBackend + StorageProvider + Send + Sync + 'static, - Network: NetworkRequest + Send + Sync + ?Sized + 'static, + Network: NetworkRequest + NetworkStatusProvider + NetworkPeers + Send + Sync + ?Sized + 'static, { // 1. Detect the warp path and wait for warp + state-sync to finish. We check status *before* // sleeping so we observe `warp_sync == Some(..)` early (it stays `Some` throughout the @@ -99,12 +99,9 @@ pub async fn run_recovery_monitor( ); // 3. Recover + verify + import, retrying across the current peer set until one succeeds. 
- let driver = LedgerSyncClient::new(client, network, protocol_name, unified); + let driver = LedgerSyncClient::new(client, network.clone(), protocol_name, unified); loop { - let peers: Vec<_> = match sync_service.peers_info().await { - Ok(info) => info.into_iter().map(|(peer, _)| peer).collect(), - Err(_) => Vec::new(), - }; + let peers = recovery_candidate_peers(&*network, &sync_service).await; match driver.recover(target_hash, &peers).await { Ok(()) => break, Err(e) => { @@ -118,3 +115,39 @@ pub async fn run_recovery_monitor( gate.mark_ledger_verified(); log::info!(target: LOG_TARGET, "Ledger arena recovered + verified; authoring + import gate released"); } + +/// Gather candidate peers to recover the ledger arena from, sourced from the **network** layer +/// (currently-connected libp2p peers + reserved nodes) rather than [`SyncingService::peers_info`]. +/// +/// The sync-peer list is emptied by `chain_sync.restart()`, which benign post-warp `UnknownParent` +/// block announcements trigger repeatedly once the servers are producing every 6s. A monitor that +/// reads sync peers therefore sees "no peers" while the libp2p connections (and any reserved nodes) +/// are still fully up — the cause of the 1000-scale recovery stall. Network-level peers survive that +/// churn. `peers_info()` is kept only as a last-resort fallback if the network layer reports none. +async fn recovery_candidate_peers( + network: &Network, + sync_service: &SyncingService, +) -> Vec +where + B: BlockT, + Network: NetworkStatusProvider + NetworkPeers + ?Sized, +{ + let mut peers = std::collections::HashSet::new(); + + // Currently-connected libp2p peers (the `connected_peers` map is keyed by the base58 PeerId). + if let Ok(state) = network.network_state().await { + peers.extend(state.connected_peers.keys().filter_map(|id| id.parse::().ok())); + } + // Reserved nodes pinned by the operator (e.g. `--reserved-nodes`) also survive sync restarts. 
+ if let Ok(reserved) = network.reserved_peers().await { + peers.extend(reserved); + } + // Fallback: only if the network layer reported nothing usable. + if peers.is_empty() { + if let Ok(info) = sync_service.peers_info().await { + peers.extend(info.into_iter().map(|(peer, _)| peer)); + } + } + + peers.into_iter().collect() +} From 26041882690f06024cc0001c73f52b7f665f80f6 Mon Sep 17 00:00:00 2001 From: Justin Frevert Date: Wed, 10 Jun 2026 07:55:02 -0700 Subject: [PATCH 07/13] fix(warp-sync): hold execute-bearing imports during arena recovery instead of dropping them MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A warp-synced node never progressed past its target on a real network (observed: pinned at the warp target for 9+ hours against a devnet fork, ~8,400 sync req/s). Three root causes, all fixed here: 1. GatedBlockImport returned Ok(ImportResult::MissingState) for gated blocks. substrate silently drops such blocks while chain_sync's best_queued_number stays advanced, so peer ancestor searches can only match genesis (the warp gap has no headers) and "common too far behind" re-arms the search forever; peers stuck in AncestorSearch ignore announcements, starving tip AND gap sync permanently. (Returning Err instead is no better: each restart re-issues an identical block request and BlockRequestHandler reputation-bans the node — "Same block request multiple times".) Fix: defer by awaiting the recovery gate inside import_block. This cannot deadlock under the new gate scope (below): a gated block can only exist after the state-sync target imported, and recovery uses only the client + request-response network, never the import queue. 2. The gate covered every !with_state() block, including gap-sync (block-history) blocks, which import with skip_execution and never touch the arena. Gate now defers only blocks that would execute: StateAction::Execute, or ExecuteIfPossible with parent state present. 3. 
The monitor re-armed the gate and re-downloaded the whole arena on every restart of a post-warp node, because chain_sync reports an active gap sync as warp_sync: Some(DownloadingBlocks). The monitor now decides by arena content at startup via the new midnight_node_ledger::has_ledger_state (cheap root lookup, per-version dispatch): present -> skip, absent -> recover. Validated end-to-end against a local-environment fork of a real devnet ledger_8 snapshot (6 validators, mock authorities): fresh --sync warp node went genesis -> fully synced (state + verified 7.3MB arena + full 554k block history) in ~83s, follows the tip with finality, container restart does not re-gate; 0 import errors, 0 NoLedgerState, 0 peer bans. Assisted-by: Claude:claude-fable-5 Signed-off-by: Justin Frevert (cherry picked from commit 9332cf5ed9f471a1e29a3cfceea480e04f25f17b) --- ledger/src/lib.rs | 18 +++ node/src/service.rs | 1 + node/src/warp_ledger_sync/block_import.rs | 138 ++++++++++++++++------ node/src/warp_ledger_sync/monitor.rs | 96 ++++++++++++++- node/src/warp_ledger_sync/oracle.rs | 8 ++ 5 files changed, 224 insertions(+), 37 deletions(-) diff --git a/ledger/src/lib.rs b/ledger/src/lib.rs index 1f3a96ae4..dd26c7339 100644 --- a/ledger/src/lib.rs +++ b/ledger/src/lib.rs @@ -192,6 +192,24 @@ pub fn serialize_ledger_snapshot(unified: bool, state_key: &[u8]) -> Result bool { + match ledger_state_tag_version(state_key) { + Some(16) => bridge_arena_call!(ledger_9, unified, get_ledger_state_root(state_key)).is_ok(), + Some(13) => bridge_arena_call!(ledger_8, unified, get_ledger_state_root(state_key)).is_ok(), + Some(5) => bridge_arena_call!(ledger_7, unified, get_ledger_state_root(state_key)).is_ok(), + _ => false, + } +} + /// Failure modes of [`import_verified_ledger_snapshot`]. All are non-fatal to the chain: the caller /// discards the data, reports the peer, and retries from another. 
#[cfg(feature = "std")] diff --git a/node/src/service.rs b/node/src/service.rs index 978f99d26..71a5db566 100644 --- a/node/src/service.rs +++ b/node/src/service.rs @@ -430,6 +430,7 @@ pub fn new_partial( let gated_block_import = crate::warp_ledger_sync::block_import::GatedBlockImport::new( grandpa_block_import.clone(), recovery_gate.clone(), + backend.clone(), ); let import_queue = partner_chains_aura_import_queue::import_queue::< diff --git a/node/src/warp_ledger_sync/block_import.rs b/node/src/warp_ledger_sync/block_import.rs index e2740097f..7c3fd80a2 100644 --- a/node/src/warp_ledger_sync/block_import.rs +++ b/node/src/warp_ledger_sync/block_import.rs @@ -23,47 +23,118 @@ //! single-writer). //! //! [`GatedBlockImport`] wraps the import queue's block import and, while -//! [`RecoveryGate::ledger_recovery_in_progress`] is true, **rejects** the import of blocks that -//! would execute against the arena, with a transient [`ConsensusError::ClientImport`] error so the -//! sync engine re-requests them once recovery completes. +//! [`RecoveryGate::ledger_recovery_in_progress`] is true, **holds** the import of exactly those +//! blocks that would *execute* against the arena, by awaiting the gate until recovery completes. //! -//! Two things are critical and were learned the hard way: -//! 1. **Reject, don't block.** The import queue has a single worker; an `import_block` that *awaits* -//! until recovery would occupy that worker and starve the state-sync target-block import below, -//! deadlocking warp (state sync waits on the worker ← held by the gated block ← waits on -//! recovery ← waits on state sync). Returning an error frees the worker immediately. -//! 2. **Never gate the state-sync target block.** `with_state()` is true only for that block (its -//! state is *imported*, not executed — no arena access), and state sync must import it *before* -//! the monitor can recover the arena. Gating it would deadlock for the same reason. +//! 
## What is gated (and what must never be) +//! +//! A block import only touches the arena if it executes the runtime, so the gate keys off +//! [`StateAction`]: +//! - `ApplyChanges` — **pass**. The state-sync target block (its state is *imported*, not +//! executed). State sync must import it *before* recovery can even start; gating it deadlocks +//! warp. (Also locally-authored blocks, but authoring is separately gated by the oracle.) +//! - `Skip` — **pass**. Gap-sync (block-history) blocks are downloaded with `skip_execution: +//! true`: they import headers/bodies only and never execute. Gating them silently broke block +//! history download after warp. +//! - `Execute` — **defer** (would unconditionally execute). +//! - `ExecuteIfPossible` — defer **iff the parent state is present** (that is the exact condition +//! under which the client executes the block; with the parent state pruned/absent it imports +//! without execution and never touches the arena). Post-warp, the only blocks with a present +//! parent state are the descendants of the warp target — precisely the dangerous ones. +//! +//! ## Why defer = `await` the gate (and why the scoping above makes that safe) +//! +//! Every alternative was tried and failed in a distinct, live-observed way: +//! 1. **`Ok(ImportResult::MissingState)` → permanent sync wedge.** substrate treats `MissingState` +//! as "obsolete, not bad": no peer drop, no restart — and *nothing else*. But `chain_sync` +//! already advanced `best_queued_number` when it queued the blocks, so after the silent swallow, +//! sync believes blocks exist that are not in the DB. Peer ancestor searches then probe +//! `best_queued` (miss — never imported) and descend into the warp gap `1..target` (miss — +//! headers absent), resolving the common block to **genesis**; `block_requests()` deems common=0 +//! "too far behind" and immediately restarts the ancestor search, forever. Observed live as a +//! 
~8,000 req/s ancestry hot loop with all peers stuck in `AncestorSearch` (announcements +//! ignored, no block or gap requests, node pinned at the warp target for 9+ hours). +//! 2. **`Err(ClientImport)` → reputation-banned by every peer.** The error maps to +//! `BlockImportError::Other` → `chain_sync.restart()`, which *does* keep `best_queued_number` +//! consistent — but each restart re-issues an **identical** block request (same start, same +//! count), and substrate's `BlockRequestHandler` bans peers that repeat the same request +//! (`Same block request multiple times`, rep = i32::MIN, disconnect). Within a minute all +//! serving peers ban the warp node; with the connections gone, even the arena fetch starves +//! (`Refused`), so recovery itself can wedge. Observed live: 53 restarts, 72 ban/reconnect +//! cycles, 0 sync peers. +//! 3. **`await` until released — correct, *given the `would_execute` scoping*.** Holding the +//! import-queue worker used to deadlock when the gate covered all `!with_state()` blocks (the +//! state-sync target import could end up queued behind a held block). But a block can only be +//! `would_execute` *after* its ancestor — the state-sync target — has already imported (before +//! that, no parent state exists anywhere), and arena recovery itself uses only the client + +//! request-response network, never the import queue. So nothing recovery depends on can sit +//! behind the await. Each block is requested **once** (no duplicate-request bans), queued +//! blocks match sync's bookkeeping (no wedge), download read-ahead is bounded by substrate's +//! `MAX_DOWNLOAD_AHEAD`, and when the gate opens the worker simply drains the backlog in order. //! //! On a full sync the gate is never armed, so this is a pure passthrough. 
-use sc_consensus::{BlockCheckParams, BlockImport, BlockImportParams, ImportResult}; +use sc_client_api::Backend; +use sc_consensus::{BlockCheckParams, BlockImport, BlockImportParams, ImportResult, StateAction}; use sp_consensus::Error as ConsensusError; -use sp_runtime::traits::Block as BlockT; +use sp_runtime::traits::{Block as BlockT, Header as HeaderT, One, Saturating}; -use std::sync::Arc; +use std::{marker::PhantomData, sync::Arc}; use super::oracle::RecoveryGate; -/// Wraps an inner [`BlockImport`], rejecting `import_block` for arena-executing blocks until the -/// warp-recovered ledger arena is verified (see module docs). -#[derive(Clone)] -pub struct GatedBlockImport { +/// Wraps an inner [`BlockImport`], deferring (by awaiting the gate) the import of blocks that +/// would execute against the ledger arena until the warp-recovered arena is verified (see module +/// docs). +pub struct GatedBlockImport { inner: Inner, gate: Arc, + backend: Arc, + _phantom: PhantomData, +} + +impl Clone for GatedBlockImport { + fn clone(&self) -> Self { + Self { + inner: self.inner.clone(), + gate: self.gate.clone(), + backend: self.backend.clone(), + _phantom: PhantomData, + } + } } -impl GatedBlockImport { - pub fn new(inner: Inner, gate: Arc) -> Self { - Self { inner, gate } +impl GatedBlockImport { + pub fn new(inner: Inner, gate: Arc, backend: Arc) -> Self { + Self { inner, gate, backend, _phantom: PhantomData } + } +} + +impl GatedBlockImport +where + B: BlockT, + BE: Backend, +{ + /// Whether importing `block` would execute the runtime (and therefore touch the ledger arena). + /// See module docs for the per-`StateAction` reasoning. 
+ fn would_execute(&self, block: &BlockImportParams) -> bool { + match block.state_action { + StateAction::ApplyChanges(_) | StateAction::Skip => false, + StateAction::Execute => true, + StateAction::ExecuteIfPossible => { + let parent_hash = *block.header.parent_hash(); + let parent_number = block.header.number().clone().saturating_sub(One::one()); + self.backend.have_state_at(parent_hash, parent_number) + }, + } } } #[async_trait::async_trait] -impl BlockImport for GatedBlockImport +impl BlockImport for GatedBlockImport where B: BlockT, + BE: Backend + 'static, Inner: BlockImport + Send + Sync, { type Error = ConsensusError; @@ -73,18 +144,17 @@ where } async fn import_block(&self, block: BlockImportParams) -> Result { - // Defer only execution-bearing blocks (post-warp blocks). The state-sync target block carries - // imported state (`with_state()` true, no runtime execution) and must always be let through — - // recovery can't even start until state sync imports it. - if !block.with_state() && self.gate.ledger_recovery_in_progress() { - // Return MissingState (not an Err): the ledger arena this block needs to execute isn't - // recovered yet. substrate treats MissingState as "obsolete, not bad" — it does NOT drop - // the peer and does NOT restart sync, and the block is re-requested by normal sync once - // recovery completes. Returning an Err instead maps to `BlockImportError::Other`, which - // triggers `chain_sync.restart()` on every deferred block (a restart-storm that churns - // peers); awaiting would instead block the single import-queue worker and deadlock the - // state-sync target import. MissingState avoids both. 
- return Ok(ImportResult::MissingState); + if self.gate.ledger_recovery_in_progress() && self.would_execute(&block) { + // Hold the import (and with it the import-queue worker) until the arena is recovered — + // safe because nothing recovery depends on goes through the import queue once a + // `would_execute` block exists; see module docs ("Why defer = await"). + log::debug!( + target: super::LOG_TARGET, + "Holding import of #{} ({:?}) until ledger arena recovery completes", + block.header.number(), + block.post_hash(), + ); + self.gate.wait_until_released().await; } self.inner.import_block(block).await } diff --git a/node/src/warp_ledger_sync/monitor.rs b/node/src/warp_ledger_sync/monitor.rs index 6288b25c0..57ac0f983 100644 --- a/node/src/warp_ledger_sync/monitor.rs +++ b/node/src/warp_ledger_sync/monitor.rs @@ -27,7 +27,7 @@ use sc_network::{NetworkPeers, NetworkRequest, NetworkStatusProvider, PeerId, Pr use sc_network_sync::SyncingService; use sp_blockchain::HeaderBackend; use sp_consensus::SyncOracle; -use sp_runtime::traits::Block as BlockT; +use sp_runtime::traits::{Block as BlockT, NumberFor}; use super::{LOG_TARGET, client::LedgerSyncClient, oracle::RecoveryGate}; @@ -52,6 +52,61 @@ pub async fn run_recovery_monitor( Client: HeaderBackend + StorageProvider + Send + Sync + 'static, Network: NetworkRequest + NetworkStatusProvider + NetworkPeers + Send + Sync + ?Sized + 'static, { + // 0. Restart / already-synced fast path. If the DB already holds a finalized state at boot, + // this is not a fresh warp: it is either a normally-synced node or a *restart* of a + // previously warp-synced one. Recovery is then needed exactly when the local arena is + // missing the ledger state behind the on-chain `StateKey` at the finalized block (e.g. the + // node was killed mid-recovery). 
Deciding by the arena — not by sync status — also avoids a + // trap: `chain_sync` reports an active *gap* (block-history) sync as `warp_sync: + // Some(DownloadingBlocks)`, so a status-based check re-detects "warp" and needlessly re-gates + // + re-downloads the arena on every restart until the gap is filled. + if let Some((finalized_hash, finalized_number)) = client.info().finalized_state { + match super::read_state_key::(&client, finalized_hash) { + Ok(Some(state_key)) => { + if midnight_node_ledger::has_ledger_state(unified, &state_key) { + log::debug!( + target: LOG_TARGET, + "Ledger arena already holds the state at finalized #{finalized_number}; no warp recovery needed" + ); + return; + } + log::info!( + target: LOG_TARGET, + "Ledger arena is missing the state at finalized #{finalized_number} \ + (restart during an incomplete warp recovery?); recovering (authoring + import gated)" + ); + gate.arm(); + recover_and_release( + client, + sync_service, + network, + gate, + protocol_name, + unified, + finalized_hash, + finalized_number, + ) + .await; + return; + }, + Ok(None) => { + log::debug!( + target: LOG_TARGET, + "No pallet StateKey at finalized #{finalized_number}; ledger recovery not applicable" + ); + return; + }, + Err(e) => { + // Can't read the trie at the finalized block — fall through to the warp-detection + // loop rather than guessing. + log::warn!( + target: LOG_TARGET, + "Failed to read StateKey at finalized #{finalized_number}: {e}; falling back to warp detection" + ); + }, + } + } + // 1. Detect the warp path and wait for warp + state-sync to finish. 
We check status *before* // sleeping so we observe `warp_sync == Some(..)` early (it stays `Some` throughout the // multi-second warp), and arm the gate the moment warp is seen — so AURA is gated through the @@ -93,12 +148,45 @@ pub async fn run_recovery_monitor( tokio::time::sleep(POLL_INTERVAL).await; }; + + recover_and_release( + client, + sync_service, + network, + gate, + protocol_name, + unified, + target_hash, + target_number, + ) + .await; +} + +/// Recover + verify + import the arena at the given target (retrying across the current peer set +/// until one succeeds), then release the gate. Shared by the fresh-warp path and the +/// restarted-mid-recovery path. +#[allow(clippy::too_many_arguments)] +async fn recover_and_release( + client: Arc, + sync_service: Arc>, + network: Arc, + gate: Arc, + protocol_name: ProtocolName, + unified: bool, + target_hash: B::Hash, + target_number: NumberFor, +) where + B: BlockT, + BE: Backend + 'static, + Client: HeaderBackend + StorageProvider + Send + Sync + 'static, + Network: NetworkRequest + NetworkStatusProvider + NetworkPeers + Send + Sync + ?Sized + 'static, +{ log::info!( target: LOG_TARGET, "Recovering ledger arena at warp target #{target_number} ({target_hash:?})" ); - // 3. Recover + verify + import, retrying across the current peer set until one succeeds. + // Recover + verify + import, retrying across the current peer set until one succeeds. let driver = LedgerSyncClient::new(client, network.clone(), protocol_name, unified); loop { let peers = recovery_candidate_peers(&*network, &sync_service).await; @@ -111,7 +199,9 @@ pub async fn run_recovery_monitor( } } - // 4. Release the gate: opens both the authoring oracle and the block-import gate. + // Release the gate: opens both the authoring oracle and the block-import gate. Imports + // held by the gate during recovery resume once it opens, and the import-queue worker + // drains the backlog in order (see `block_import.rs` module docs). 
gate.mark_ledger_verified(); log::info!(target: LOG_TARGET, "Ledger arena recovered + verified; authoring + import gate released"); } diff --git a/node/src/warp_ledger_sync/oracle.rs b/node/src/warp_ledger_sync/oracle.rs index 70c112b9c..c42808435 100644 --- a/node/src/warp_ledger_sync/oracle.rs +++ b/node/src/warp_ledger_sync/oracle.rs @@ -64,6 +64,14 @@ impl RecoveryGate { self.recovery_pending.load(Ordering::Acquire) && !self.ledger_verified.load(Ordering::Acquire) } + + /// Wait until recovery is no longer in progress (poll-based; recovery takes seconds-to-minutes, + /// so a sub-second poll adds no meaningful latency and keeps the gate free of async machinery). + pub async fn wait_until_released(&self) { + while self.ledger_recovery_in_progress() { + tokio::time::sleep(std::time::Duration::from_millis(250)).await; + } + } } /// Wraps the node's inner [`SyncOracle`] (the `SyncingService`) so AURA reports "still syncing" From 3ea1d4bbb50fbe451be3effafe4ef1cdfb76a354 Mon Sep 17 00:00:00 2001 From: Justin Frevert Date: Wed, 10 Jun 2026 21:22:55 -0700 Subject: [PATCH 08/13] change file Signed-off-by: Justin Frevert --- changes/added/ledger-warp-sync.md | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 changes/added/ledger-warp-sync.md diff --git a/changes/added/ledger-warp-sync.md b/changes/added/ledger-warp-sync.md new file mode 100644 index 000000000..dd6fb6d83 --- /dev/null +++ b/changes/added/ledger-warp-sync.md @@ -0,0 +1,6 @@ +# Add Ledger Sync process for Warp Sync + +Adds support for syncing ledger state while warp syncing. This should enable Substrate warp sync. 
+ +PR: https://github.com/midnightntwrk/midnight-node/pull/1650 +Issue: https://github.com/midnightntwrk/midnight-node/issues/1648 \ No newline at end of file From a9df85c54d5710908228d346e548af317f415a50 Mon Sep 17 00:00:00 2001 From: Justin Frevert Date: Thu, 11 Jun 2026 20:56:23 -0700 Subject: [PATCH 09/13] docs: remove spec references Signed-off-by: Justin Frevert --- ledger/src/versions/common/mod.rs | 10 +++++----- node/src/warp_ledger_sync/client.rs | 2 +- node/src/warp_ledger_sync/protocol.rs | 12 ++++++------ node/src/warp_ledger_sync/server.rs | 2 +- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/ledger/src/versions/common/mod.rs b/ledger/src/versions/common/mod.rs index 9b14eece0..027e9ebdc 100644 --- a/ledger/src/versions/common/mod.rs +++ b/ledger/src/versions/common/mod.rs @@ -734,11 +734,11 @@ where /// TopoSortedNodes(Ledger DAG)`. /// /// Mirrors the single-pass technique of the toolkit's `serialize_ledger_state_fast`, but roots - /// at `Ledger` (the `Sp` from `get_ledger` is an `Sp`) rather than `LedgerState` — see - /// `warp-ledger-sync-spec.md` ODD-1. Because the blob is rooted at `Ledger`, its recomputed - /// content-address root key equals the on-chain `pallet_midnight::StateKey`, which is exactly - /// what the client verifies against. The tag prefix is **derived** - /// (`GLOBAL_TAG ‖ ::tag()`), never hardcoded (spec §8 format-lockstep). + /// at `Ledger` (the `Sp` from `get_ledger` is an `Sp`) rather than `LedgerState`. + /// Because the blob is rooted at `Ledger`, its recomputed content-address root key equals the + /// on-chain `pallet_midnight::StateKey`, which is exactly what the client verifies against. The + /// tag prefix is **derived** (`GLOBAL_TAG ‖ ::tag()`), never hardcoded, so it + /// stays in lockstep with the ledger serialization format. 
pub fn serialize_ledger_snapshot(state_key: &[u8]) -> Result, LedgerApiError> { use ledger_storage_local::arena::TopoSortedNodes; use midnight_serialize_local::{GLOBAL_TAG, Serializable}; diff --git a/node/src/warp_ledger_sync/client.rs b/node/src/warp_ledger_sync/client.rs index 371d0868a..8e2384748 100644 --- a/node/src/warp_ledger_sync/client.rs +++ b/node/src/warp_ledger_sync/client.rs @@ -117,7 +117,7 @@ where /// Fetch the full blob from a single peer by paging contiguous byte ranges in order. /// - /// (Parallel / multi-peer range fetch is a permitted optimization — spec ODD-3 — deferred; the + /// (Parallel / multi-peer range fetch is a possible future optimization; the /// `ChunkAssembler` already supports resume by `next_offset`.) async fn fetch_blob_from(&self, peer: PeerId, target: B::Hash) -> Result, ClientError> { // First range establishes `total_len`. diff --git a/node/src/warp_ledger_sync/protocol.rs b/node/src/warp_ledger_sync/protocol.rs index 52893b077..eac01c270 100644 --- a/node/src/warp_ledger_sync/protocol.rs +++ b/node/src/warp_ledger_sync/protocol.rs @@ -17,10 +17,10 @@ //! reassembly logic shared by the server and client. //! //! The transferred payload is the canonical, `Ledger`-rooted arena blob (derived tag prefix ‖ -//! `TopoSortedNodes` of the `Ledger` DAG — see spec ODD-1). Transport pages it by **byte offset** -//! (not by semantic node): the server streams contiguous byte ranges and the client concatenates -//! them in order before deserialize + verify. The children-precede-parents property is intrinsic -//! to the serialized blob, so in-order byte concatenation preserves it automatically (ODD-3). +//! `TopoSortedNodes` of the `Ledger` DAG). Transport pages it by **byte offset** (not by semantic +//! node): the server streams contiguous byte ranges and the client concatenates them in order +//! before deserialize + verify. The children-precede-parents property is intrinsic to the +//! 
serialized blob, so in-order byte concatenation preserves it automatically. use parity_scale_codec::{Decode, Encode}; @@ -128,8 +128,8 @@ pub enum AssembleError { /// Reassembles ordered, contiguous response chunks into the full canonical blob. /// /// In-order contiguous assembly is sufficient and simplest: a chunk is accepted only if its -/// `offset` equals the bytes received so far. Parallel / multi-peer fetches are allowed (spec -/// ODD-3) but the client must reorder chunks by `offset` before feeding them here. The assembled +/// `offset` equals the bytes received so far. Parallel / multi-peer fetches are allowed but the +/// client must reorder chunks by `offset` before feeding them here. The assembled /// blob is verified against the on-chain `StateKey` by the client driver — this type does no /// crypto, only transport-level reassembly. #[derive(Debug)] diff --git a/node/src/warp_ledger_sync/server.rs b/node/src/warp_ledger_sync/server.rs index 530cffbde..2089c87a4 100644 --- a/node/src/warp_ledger_sync/server.rs +++ b/node/src/warp_ledger_sync/server.rs @@ -129,7 +129,7 @@ where } // Only serve finalized blocks whose state we hold: an unknown hash or a block beyond our - // finalized number is rejected (the warp target is always finalized — spec §7). + // finalized number is rejected (the warp target is always finalized). 
let header = self.client.header(target)?.ok_or(HandleError::UnknownBlock)?; if *header.number() > self.client.info().finalized_number { return Err(HandleError::NotFinalized); From 333d0a0aa2ad42d6d64fd4f0fd28d84576d14b06 Mon Sep 17 00:00:00 2001 From: Justin Frevert Date: Fri, 12 Jun 2026 10:48:08 -0700 Subject: [PATCH 10/13] chore: formatting Signed-off-by: Justin Frevert --- node/src/warp_ledger_sync/block_import.rs | 2 +- node/src/warp_ledger_sync/monitor.rs | 33 +++++++++++------------ node/src/warp_ledger_sync/server.rs | 8 +++--- 3 files changed, 21 insertions(+), 22 deletions(-) diff --git a/node/src/warp_ledger_sync/block_import.rs b/node/src/warp_ledger_sync/block_import.rs index 7c3fd80a2..24957482f 100644 --- a/node/src/warp_ledger_sync/block_import.rs +++ b/node/src/warp_ledger_sync/block_import.rs @@ -123,7 +123,7 @@ where StateAction::Execute => true, StateAction::ExecuteIfPossible => { let parent_hash = *block.header.parent_hash(); - let parent_number = block.header.number().clone().saturating_sub(One::one()); + let parent_number = (*block.header.number()).saturating_sub(One::one()); self.backend.have_state_at(parent_hash, parent_number) }, } diff --git a/node/src/warp_ledger_sync/monitor.rs b/node/src/warp_ledger_sync/monitor.rs index 57ac0f983..ddbf2a49f 100644 --- a/node/src/warp_ledger_sync/monitor.rs +++ b/node/src/warp_ledger_sync/monitor.rs @@ -119,23 +119,22 @@ pub async fn run_recovery_monitor( let (target_hash, target_number) = loop { let status = sync_service.status().await.ok(); - if let Some(status) = &status { - if status.warp_sync.is_some() && !saw_warp { - saw_warp = true; - gate.arm(); - log::info!( - target: LOG_TARGET, - "Warp sync detected; ledger arena recovery armed (authoring gated until verified)" - ); - } + if let Some(status) = &status + && status.warp_sync.is_some() + && !saw_warp + { + saw_warp = true; + gate.arm(); + log::info!( + target: LOG_TARGET, + "Warp sync detected; ledger arena recovery armed (authoring gated 
until verified)" + ); } if saw_warp { let state_sync_done = status.as_ref().map(|s| s.state_sync.is_none()).unwrap_or(false); - if state_sync_done { - if let Some(target) = client.info().finalized_state { - break target; - } + if state_sync_done && let Some(target) = client.info().finalized_state { + break target; } } else { // Full-sync path: once the node is no longer major-syncing, ledger recovery is never @@ -233,10 +232,10 @@ where peers.extend(reserved); } // Fallback: only if the network layer reported nothing usable. - if peers.is_empty() { - if let Ok(info) = sync_service.peers_info().await { - peers.extend(info.into_iter().map(|(peer, _)| peer)); - } + if peers.is_empty() + && let Ok(info) = sync_service.peers_info().await + { + peers.extend(info.into_iter().map(|(peer, _)| peer)); } peers.into_iter().collect() diff --git a/node/src/warp_ledger_sync/server.rs b/node/src/warp_ledger_sync/server.rs index 2089c87a4..0d86d3887 100644 --- a/node/src/warp_ledger_sync/server.rs +++ b/node/src/warp_ledger_sync/server.rs @@ -122,10 +122,10 @@ where /// Return the serialized `Ledger`-rooted blob for `target`, building and memoizing it on a /// cache miss. Rejects unknown or not-yet-finalized blocks. 
fn blob_for(&mut self, target: B::Hash) -> Result>, HandleError> { - if let Some((cached, blob)) = &self.cache { - if *cached == target { - return Ok(blob.clone()); - } + if let Some((cached, blob)) = &self.cache + && *cached == target + { + return Ok(blob.clone()); } // Only serve finalized blocks whose state we hold: an unknown hash or a block beyond our From 72b353a0ed4949c0a1c9f6024ce1e7b03c151e73 Mon Sep 17 00:00:00 2001 From: Justin Frevert Date: Wed, 17 Jun 2026 16:43:47 -0700 Subject: [PATCH 11/13] Only serve ledger arena warp data if non-authority node Signed-off-by: Justin Frevert --- node/src/service.rs | 21 +++++++++++++----- node/src/warp_ledger_sync/server.rs | 34 ++++++++++++++++++++++------- 2 files changed, 41 insertions(+), 14 deletions(-) diff --git a/node/src/service.rs b/node/src/service.rs index 60dd2e263..2b319929e 100644 --- a/node/src/service.rs +++ b/node/src/service.rs @@ -576,6 +576,11 @@ pub async fn new_full + 'static, Client: HeaderBackend + StorageProvider + Send + Sync + 'static, { - /// Build the handler and the protocol config to register on `net_config` before - /// `build_network`. Spawn [`run`](Self::run) as a task. + /// Build the protocol config to register on `net_config` before `build_network`, plus — when + /// `serve` is true — the handler to spawn via [`run`](Self::run). + /// + /// `serve` gates the **server** side only. Validators pass `serve = false`: serializing the + /// multi-million-node arena is this protocol's most CPU-expensive operation, and it must never + /// compete with a validator's authoring/finality duties (an easy remote DoS vector). A + /// non-serving node advertises no inbound queue, so the network routes no requests to it — but + /// the protocol is still registered, so the node can act as a warp-sync *client* and recover + /// its own arena. Returns `None` for the handler when not serving. 
pub fn new::Hash>>( genesis_hash: B::Hash, fork_id: Option<&str>, client: Arc, unified: bool, num_peer_hint: usize, - ) -> (Self, N::RequestResponseProtocolConfig) { - // Reserve one in-flight request slot per peer. - let capacity = std::cmp::max(num_peer_hint, 1); - let (tx, request_receiver) = async_channel::bounded(capacity); + serve: bool, + ) -> (Option, N::RequestResponseProtocolConfig) { + // Only advertise an inbound queue (and build a handler) when this node serves. A `None` + // inbound queue means the network layer routes no requests to us; we can still *send* + // requests on the protocol as a warp-sync client. + let (inbound_queue, handler) = if serve { + // Reserve one in-flight request slot per peer. + let capacity = std::cmp::max(num_peer_hint, 1); + let (tx, request_receiver) = async_channel::bounded(capacity); + let handler = + Self { client, unified, request_receiver, cache: None, _phantom: PhantomData }; + (Some(tx), Some(handler)) + } else { + (None, None) + }; let config = N::request_response_config( ledger_sync_protocol_name(genesis_hash, fork_id).into(), @@ -86,10 +104,10 @@ where MAX_REQUEST_SIZE, MAX_RESPONSE_SIZE, REQUEST_TIMEOUT, - Some(tx), + inbound_queue, ); - (Self { client, unified, request_receiver, cache: None, _phantom: PhantomData }, config) + (handler, config) } /// Run the request-handling loop until the inbound queue closes. 
From 524ac650251585c8522956fbf2a63bdc8afe6c10 Mon Sep 17 00:00:00 2001 From: Justin Frevert Date: Wed, 17 Jun 2026 18:59:26 -0700 Subject: [PATCH 12/13] Compress/decompress ledger data throughout warp sync Signed-off-by: Justin Frevert --- node/src/warp_ledger_sync/client.rs | 30 ++- .../src/warp_ledger_sync/integration_tests.rs | 38 ++-- node/src/warp_ledger_sync/protocol.rs | 196 ++++++++++++++---- node/src/warp_ledger_sync/server.rs | 46 ++-- 4 files changed, 236 insertions(+), 74 deletions(-) diff --git a/node/src/warp_ledger_sync/client.rs b/node/src/warp_ledger_sync/client.rs index 8e2384748..15425521b 100644 --- a/node/src/warp_ledger_sync/client.rs +++ b/node/src/warp_ledger_sync/client.rs @@ -17,7 +17,8 @@ //! //! After warp + state-sync complete (target block N captured by the monitor), this drives the //! client side of the protocol: read the on-chain `StateKey` at N (from the warp-recovered trie), -//! fetch the `Ledger`-rooted arena blob in byte ranges from peers, then hand the assembled blob to +//! fetch the Snappy-compressed `Ledger`-rooted arena blob in byte ranges from peers, decompress it +//! to the canonical blob, then hand that blob to //! [`midnight_node_ledger::import_verified_ledger_snapshot`], which verifies its root against the //! `StateKey` and persists it on success. //! @@ -35,7 +36,10 @@ use sp_runtime::traits::Block as BlockT; use super::{ LOG_TARGET, - protocol::{ChunkAssembler, LedgerSyncRequest, LedgerSyncResponse, MAX_LEDGER_SYNC_CHUNK}, + protocol::{ + ChunkAssembler, DecompressError, LedgerSyncRequest, LedgerSyncResponse, + MAX_LEDGER_SYNC_CHUNK, decompress_snapshot, validate_snapshot_lengths, + }, read_state_key, }; @@ -115,18 +119,27 @@ where Err(ClientError::AllPeersFailed) } - /// Fetch the full blob from a single peer by paging contiguous byte ranges in order. + /// Fetch the full compressed blob from a single peer by paging contiguous byte ranges in order, + /// then decompress it to the canonical `Ledger`-rooted blob. 
/// /// (Parallel / multi-peer range fetch is a possible future optimization; the /// `ChunkAssembler` already supports resume by `next_offset`.) async fn fetch_blob_from(&self, peer: PeerId, target: B::Hash) -> Result, ClientError> { - // First range establishes `total_len`. + // First range establishes the compressed transfer length and expected raw size. let first = self.request_range(peer, target, 0).await?; - let mut assembler = ChunkAssembler::new(first.total_len); + let compressed_total_len = first.compressed_total_len; + let raw_total_len = first.raw_total_len; + validate_snapshot_lengths(compressed_total_len, raw_total_len)?; + let mut assembler = ChunkAssembler::new(compressed_total_len); assembler.accept(first.offset, &first.bytes)?; while !assembler.is_complete() { let next = self.request_range(peer, target, assembler.next_offset()).await?; + if next.compressed_total_len != compressed_total_len + || next.raw_total_len != raw_total_len + { + return Err(ClientError::InconsistentResponse); + } if next.bytes.is_empty() { // Server returned an empty range before completion: treat as a truncated transfer. // `into_blob` below will surface `Incomplete`. @@ -135,7 +148,8 @@ where assembler.accept(next.offset, &next.bytes)?; } - Ok(assembler.into_blob()?) + let compressed = assembler.into_blob()?; + Ok(decompress_snapshot(&compressed, raw_total_len)?) 
} async fn request_range( @@ -176,6 +190,10 @@ pub enum ClientError { Decode(#[from] parity_scale_codec::Error), #[error("chunk assembly failed: {0}")] Assemble(#[from] super::protocol::AssembleError), + #[error("peer changed ledger-sync response metadata between chunks")] + InconsistentResponse, + #[error("failed to decompress ledger snapshot: {0}")] + Decompress(#[from] DecompressError), #[error("all peers failed to provide a verifiable snapshot")] AllPeersFailed, } diff --git a/node/src/warp_ledger_sync/integration_tests.rs b/node/src/warp_ledger_sync/integration_tests.rs index 2ccbda2de..3cbca6123 100644 --- a/node/src/warp_ledger_sync/integration_tests.rs +++ b/node/src/warp_ledger_sync/integration_tests.rs @@ -14,43 +14,48 @@ // limitations under the License. //! In-process round-trip test for the warp ledger-sync core, with no networking: -//! init a real arena from genesis → serialize the `Ledger`-rooted snapshot (server) → page it -//! through the transport chunker + reassembler → import + verify against the on-chain +//! init a real arena from genesis → serialize the `Ledger`-rooted snapshot (server) → compress + +//! page it through the transport chunker + reassembler → decompress → import + verify against the on-chain //! `StateKey` (client/import). Also asserts the security property: a tampered blob is //! rejected (`RootMismatch`), never imported. //! //! Run isolated (it touches the process-global `default_storage` singleton): //! `cargo test -p midnight-node ledger_snapshot_roundtrip`. -use midnight_node_res::networks::{MidnightNetwork, UndeployedNetwork}; +use super::protocol::{ChunkAssembler, build_response, compress_snapshot, decompress_snapshot}; -use super::protocol::{ChunkAssembler, build_response}; - -/// Page `blob` end-to-end the way the client would, with a deliberately small chunk size to force -/// multiple ranges, and return the reassembled bytes. 
-fn page_and_reassemble(blob: &[u8], chunk: u32) -> Vec { - let mut assembler = ChunkAssembler::new(blob.len() as u64); +/// Compress + page `blob` end-to-end the way the client would, with a deliberately small chunk size +/// to force multiple ranges, and return the decompressed reassembled bytes. +fn compress_page_reassemble_decompress(blob: &[u8], chunk: u32) -> Vec { + let compressed = compress_snapshot(blob).expect("compress snapshot"); + let raw_total_len = blob.len() as u64; + let mut assembler = ChunkAssembler::new(compressed.len() as u64); loop { - let response = build_response(blob, assembler.next_offset(), chunk); + let response = build_response(&compressed, raw_total_len, assembler.next_offset(), chunk); + assert_eq!(response.raw_total_len, raw_total_len); + assert_eq!(response.compressed_total_len, compressed.len() as u64); if response.bytes.is_empty() { break; } assembler.accept(response.offset, &response.bytes).expect("contiguous chunk"); } - assembler.into_blob().expect("complete blob") + let reassembled = assembler.into_blob().expect("complete compressed blob"); + decompress_snapshot(&reassembled, raw_total_len).expect("decompress snapshot") } #[test] fn ledger_snapshot_roundtrip_serialize_chunk_verify_import() { let dir = tempfile::tempdir().expect("tempdir"); - let genesis_state = UndeployedNetwork.genesis_state().to_vec(); + // Use a bundled v13 fixture because the current warp snapshot dispatch table supports + // ledger-state v5/v13/v16; the local undeployed fixture is already v17. + let genesis_state = include_bytes!("../../../res/genesis/genesis_state_preview.mn"); // Initialize the arena from genesis in Separate mode. This sets the process-global // `default_storage`, persists the genesis ledger, and returns the on-chain `StateKey` bytes // (the tagged `TypedArenaKey`) — exactly what `pallet_midnight::StateKey` would hold. 
- let state_key = midnight_node_ledger::ledger_9::storage::init_storage_paritydb_separate( + let state_key = midnight_node_ledger::ledger_8::storage::init_storage_paritydb_separate( dir.path(), - &genesis_state, + genesis_state, 1024, ); assert!(!state_key.is_empty(), "genesis init must produce a StateKey"); @@ -60,8 +65,9 @@ fn ledger_snapshot_roundtrip_serialize_chunk_verify_import() { .expect("serialize ledger snapshot"); assert!(blob.len() > state_key.len(), "snapshot blob should carry the arena, not just the key"); - // Transport: page into 4 KiB ranges and reassemble; must be byte-identical. - let reassembled = page_and_reassemble(&blob, 4096); + // Transport: compress, page into 4 KiB ranges, reassemble, and decompress; must return the + // canonical blob byte-identically. + let reassembled = compress_page_reassemble_decompress(&blob, 4096); assert_eq!(reassembled, blob, "reassembled blob must be byte-identical to the server's"); // Client/import: verify root == StateKey and persist. Idempotent against the diff --git a/node/src/warp_ledger_sync/protocol.rs b/node/src/warp_ledger_sync/protocol.rs index eac01c270..1d949cb8a 100644 --- a/node/src/warp_ledger_sync/protocol.rs +++ b/node/src/warp_ledger_sync/protocol.rs @@ -16,22 +16,32 @@ //! Ledger-sync protocol message types, codec, naming, and the pure range-serving / //! reassembly logic shared by the server and client. //! -//! The transferred payload is the canonical, `Ledger`-rooted arena blob (derived tag prefix ‖ -//! `TopoSortedNodes` of the `Ledger` DAG). Transport pages it by **byte offset** (not by semantic -//! node): the server streams contiguous byte ranges and the client concatenates them in order -//! before deserialize + verify. The children-precede-parents property is intrinsic to the -//! serialized blob, so in-order byte concatenation preserves it automatically. +//! The transferred payload is the Snappy-compressed canonical, `Ledger`-rooted arena blob (derived +//! 
tag prefix ‖ `TopoSortedNodes` of the `Ledger` DAG). Transport pages the compressed stream by +//! **byte offset** (not by semantic node): the server streams contiguous byte ranges and the client +//! concatenates them in order, decompresses to the canonical blob, then deserializes + verifies it. The +//! children-precede-parents property is intrinsic to the decompressed blob. use parity_scale_codec::{Decode, Encode}; -/// Protocol name suffix; the full name is `/{genesis_hash}[/{fork_id}]/midnight-ledger-sync/1`. -pub const PROTOCOL_NAME_SUFFIX: &str = "midnight-ledger-sync/1"; +/// Protocol name suffix; the full name is `/{genesis_hash}[/{fork_id}]/midnight-ledger-sync/2`. +/// +/// Version 2 range-serves a Snappy-compressed ledger snapshot. Version 1 served raw canonical +/// snapshot bytes. +pub const PROTOCOL_NAME_SUFFIX: &str = "midnight-ledger-sync/2"; /// Maximum number of bytes a single response chunk may carry. The server clamps a peer's /// requested `max_len` to this; the network layer's `max_response_size` must be ≥ this plus codec /// overhead. 1 MiB matches substrate's state-sync chunking. pub const MAX_LEDGER_SYNC_CHUNK: u32 = 1024 * 1024; +/// Maximum decompressed ledger snapshot size accepted from a peer. +/// +/// This is intentionally generous relative to current expected arena sizes, but it prevents a +/// malicious peer from advertising a tiny compressed payload that expands into an unreasonable +/// allocation before ledger-root verification can reject it. +pub const MAX_LEDGER_SYNC_RAW_BYTES: u64 = 1024 * 1024 * 1024; + /// Request a contiguous byte range of the `Ledger`-rooted arena blob at `target_hash`. /// /// `target_hash` must be a finalized block whose state-sync target the server can serve (the @@ -47,17 +57,21 @@ pub struct LedgerSyncRequest { pub max_len: u32, } -/// A contiguous byte range of the canonical `Ledger`-rooted blob. +/// A contiguous byte range of the Snappy-compressed canonical `Ledger`-rooted blob.
/// -/// `total_len` is the full blob size (lets the client learn the size up front and drive parallel / -/// resumable range fetches); `offset`/`bytes` are this chunk. +/// `compressed_total_len` is the full compressed stream size (lets the client learn the size up +/// front and drive parallel / resumable range fetches); `raw_total_len` is the expected +/// decompressed canonical blob size, used to bound and validate decompression; `offset`/`bytes` are +/// this compressed chunk. #[derive(Debug, Clone, PartialEq, Eq, Encode, Decode)] pub struct LedgerSyncResponse { - /// Total length of the full canonical blob at the target block. - pub total_len: u64, - /// Byte offset of this chunk within the full blob. + /// Total length of the full compressed blob at the target block. + pub compressed_total_len: u64, + /// Expected total length after Snappy decompression. + pub raw_total_len: u64, + /// Byte offset of this chunk within the compressed blob. pub offset: u64, - /// The chunk bytes: `blob[offset .. offset + bytes.len()]`. + /// The chunk bytes: `compressed_blob[offset .. offset + bytes.len()]`. pub bytes: Vec, } @@ -79,18 +93,107 @@ pub fn clamp_max_len(requested: u32) -> u32 { requested.min(MAX_LEDGER_SYNC_CHUNK) } -/// Build a response chunk for `[offset, offset + clamp(max_len))` of `blob` (server side). +/// Snappy-compress a canonical snapshot blob for transport. +pub fn compress_snapshot(blob: &[u8]) -> Result, snap::Error> { + snap::raw::Encoder::new().compress_vec(blob) +} + +/// Decompress a complete Snappy-compressed snapshot blob after range reassembly. +pub fn decompress_snapshot( + compressed: &[u8], + expected_raw_len: u64, +) -> Result, DecompressError> { + validate_snapshot_lengths(compressed.len() as u64, expected_raw_len)?; + let advertised = snap::raw::decompress_len(compressed).map_err(DecompressError::Snap)? 
as u64; + if advertised != expected_raw_len { + return Err(DecompressError::LengthMismatch { + expected: expected_raw_len, + actual: advertised, + }); + } + let bytes = snap::raw::Decoder::new() + .decompress_vec(compressed) + .map_err(DecompressError::Snap)?; + if bytes.len() as u64 != expected_raw_len { + return Err(DecompressError::LengthMismatch { + expected: expected_raw_len, + actual: bytes.len() as u64, + }); + } + Ok(bytes) +} + +/// Validate advertised compressed/raw snapshot lengths before allocating or downloading the full +/// compressed stream. +pub fn validate_snapshot_lengths( + compressed_total_len: u64, + raw_total_len: u64, +) -> Result<(), DecompressError> { + if raw_total_len > MAX_LEDGER_SYNC_RAW_BYTES { + return Err(DecompressError::TooLarge { len: raw_total_len }); + } + let max_compressed_len = snap::raw::max_compress_len(raw_total_len as usize) as u64; + if compressed_total_len > max_compressed_len { + return Err(DecompressError::CompressedTooLarge { + len: compressed_total_len, + max: max_compressed_len, + }); + } + Ok(()) +} + +/// Errors from decompressing a complete compressed snapshot. +#[derive(Debug, thiserror::Error)] +pub enum DecompressError { + /// Peer advertised a decompressed length larger than this protocol accepts. + #[error("decompressed snapshot length {len} exceeds limit {MAX_LEDGER_SYNC_RAW_BYTES}")] + TooLarge { + /// Advertised decompressed size. + len: u64, + }, + /// Peer advertised a compressed length larger than Snappy can emit for the raw length. + #[error("compressed snapshot length {len} exceeds snappy limit {max}")] + CompressedTooLarge { + /// Advertised compressed size. + len: u64, + /// Maximum possible Snappy raw compressed size for the advertised raw size. + max: u64, + }, + /// The response metadata and Snappy stream header disagreed on decompressed length. 
+ #[error("decompressed snapshot length mismatch: expected {expected}, got {actual}")] + LengthMismatch { + /// Expected decompressed size from response metadata. + expected: u64, + /// Actual decompressed size from the Snappy header/result. + actual: u64, + }, + /// Snappy rejected the compressed stream. + #[error("snappy decompression failed: {0}")] + Snap(#[from] snap::Error), +} + +/// Build a response chunk for `[offset, offset + clamp(max_len))` of `compressed_blob` (server side). /// /// Clamps `max_len`, never reads past the end of the blob, and yields an empty chunk if `offset` /// is at or past the end (which signals completion to the client). -pub fn build_response(blob: &[u8], offset: u64, max_len: u32) -> LedgerSyncResponse { - let total_len = blob.len() as u64; - let start = offset.min(total_len); - let avail = total_len - start; +pub fn build_response( + compressed_blob: &[u8], + raw_total_len: u64, + offset: u64, + max_len: u32, +) -> LedgerSyncResponse { + let compressed_total_len = compressed_blob.len() as u64; + let start = offset.min(compressed_total_len); + let avail = compressed_total_len - start; let len = (clamp_max_len(max_len) as u64).min(avail); let start = start as usize; let end = start + len as usize; - LedgerSyncResponse { total_len, offset, bytes: blob[start..end].to_vec() } + LedgerSyncResponse { + compressed_total_len, + raw_total_len, + offset, + bytes: compressed_blob[start..end].to_vec(), + } } /// Errors from reassembling response chunks into the full blob (client side). @@ -105,7 +208,7 @@ pub enum AssembleError { /// The offset the chunk actually carried. got: u64, }, - /// A chunk would extend the blob past the advertised `total_len`. + /// A chunk would extend the blob past the advertised compressed `total_len`. #[error("chunk overflows total_len {total}: offset {offset} + len {len}")] Overflow { /// Advertised total length of the blob. 
@@ -125,13 +228,13 @@ pub enum AssembleError { }, } -/// Reassembles ordered, contiguous response chunks into the full canonical blob. +/// Reassembles ordered, contiguous response chunks into the full compressed blob. /// /// In-order contiguous assembly is sufficient and simplest: a chunk is accepted only if its /// `offset` equals the bytes received so far. Parallel / multi-peer fetches are allowed but the /// client must reorder chunks by `offset` before feeding them here. The assembled -/// blob is verified against the on-chain `StateKey` by the client driver — this type does no -/// crypto, only transport-level reassembly. +/// decompressed blob is verified against the on-chain `StateKey` by the client driver — this type +/// does no crypto, only transport-level reassembly. #[derive(Debug)] pub struct ChunkAssembler { total_len: u64, @@ -200,8 +303,12 @@ mod tests { let decoded = LedgerSyncRequest::::decode(&mut &req.encode()[..]).unwrap(); assert_eq!(req, decoded); - let resp = - LedgerSyncResponse { total_len: 9_999, offset: 4096, bytes: vec![1, 2, 3, 4, 5] }; + let resp = LedgerSyncResponse { + compressed_total_len: 9_999, + raw_total_len: 20_000, + offset: 4096, + bytes: vec![1, 2, 3, 4, 5], + }; let decoded = LedgerSyncResponse::decode(&mut &resp.encode()[..]).unwrap(); assert_eq!(resp, decoded); } @@ -212,11 +319,11 @@ mod tests { let hex = hex::encode(genesis.as_ref()); assert_eq!( ledger_sync_protocol_name(genesis, None), - format!("/{hex}/midnight-ledger-sync/1") + format!("/{hex}/midnight-ledger-sync/2") ); assert_eq!( ledger_sync_protocol_name(genesis, Some("forkz")), - format!("/{hex}/forkz/midnight-ledger-sync/1") + format!("/{hex}/forkz/midnight-ledger-sync/2") ); } @@ -230,22 +337,24 @@ mod tests { #[test] fn build_response_clamps_and_bounds() { let blob: Vec = (0..=255u8).cycle().take(5000).collect(); + let raw_total_len = 10_000; // A normal interior range. 
- let r = build_response(&blob, 1000, 500); - assert_eq!(r.total_len, 5000); + let r = build_response(&blob, raw_total_len, 1000, 500); + assert_eq!(r.compressed_total_len, 5000); + assert_eq!(r.raw_total_len, raw_total_len); assert_eq!(r.offset, 1000); assert_eq!(r.bytes, &blob[1000..1500]); // max_len past the end is truncated to the tail. - let r = build_response(&blob, 4800, 1000); + let r = build_response(&blob, raw_total_len, 4800, 1000); assert_eq!(r.bytes, &blob[4800..5000]); // offset at/past the end yields an empty chunk (completion signal). - let r = build_response(&blob, 5000, 100); + let r = build_response(&blob, raw_total_len, 5000, 100); assert!(r.bytes.is_empty()); - assert_eq!(r.total_len, 5000); - let r = build_response(&blob, 9999, 100); + assert_eq!(r.compressed_total_len, 5000); + let r = build_response(&blob, raw_total_len, 9999, 100); assert!(r.bytes.is_empty()); } @@ -257,7 +366,7 @@ mod tests { // `next_offset`, fed into the assembler in order. let mut asm = ChunkAssembler::new(blob.len() as u64); loop { - let chunk = build_response(&blob, asm.next_offset(), 700); + let chunk = build_response(&blob, 10_000, asm.next_offset(), 700); if chunk.bytes.is_empty() { break; } @@ -290,4 +399,21 @@ mod tests { asm.accept(0, &[0u8; 8]).unwrap(); assert_eq!(asm.into_blob(), Err(AssembleError::Incomplete { have: 8, total: 16 })); } + + #[test] + fn snapshot_compression_roundtrip() { + let raw: Vec = (0..=255u8).cycle().take(16_384).collect(); + let compressed = compress_snapshot(&raw).expect("compress"); + let decompressed = decompress_snapshot(&compressed, raw.len() as u64).expect("decompress"); + assert_eq!(decompressed, raw); + } + + #[test] + fn decompression_rejects_length_mismatch() { + let raw = vec![42u8; 128]; + let compressed = compress_snapshot(&raw).expect("compress"); + let err = decompress_snapshot(&compressed, raw.len() as u64 + 1) + .expect_err("wrong expected length must fail"); + assert!(matches!(err, DecompressError::LengthMismatch { 
.. })); + } } diff --git a/node/src/warp_ledger_sync/server.rs b/node/src/warp_ledger_sync/server.rs index 0fa517535..e8ecb0c8e 100644 --- a/node/src/warp_ledger_sync/server.rs +++ b/node/src/warp_ledger_sync/server.rs @@ -16,8 +16,8 @@ //! Ledger-sync server handler. //! //! Answers [`LedgerSyncRequest`]s from warp-syncing peers by serializing this (fully synced) node's -//! `Ledger`-rooted arena snapshot at the requested finalized block and serving the requested byte -//! range. Patterned on substrate's `state_request_handler.rs`. +//! `Ledger`-rooted arena snapshot at the requested finalized block, Snappy-compressing it, and +//! serving the requested compressed byte range. Patterned on substrate's `state_request_handler.rs`. //! //! Verification is the *client's* job: the server is untrusted, so it performs no crypto — //! it only serves bytes whose recomputed root the client checks against the on-chain `StateKey`. @@ -36,7 +36,7 @@ use sp_runtime::traits::{Block as BlockT, Header as HeaderT}; use super::{ LOG_TARGET, - protocol::{LedgerSyncRequest, build_response, ledger_sync_protocol_name}, + protocol::{LedgerSyncRequest, build_response, compress_snapshot, ledger_sync_protocol_name}, read_state_key, }; @@ -56,11 +56,17 @@ pub struct LedgerSyncRequestHandler { /// serializer dispatches to — see [`midnight_node_ledger::serialize_ledger_snapshot`]). unified: bool, request_receiver: async_channel::Receiver, - /// `(target_block, serialized blob)` memo for the last block served. - cache: Option<(B::Hash, Arc>)>, + /// `(target_block, compressed serialized blob)` memo for the last block served. 
+ cache: Option<(B::Hash, CachedSnapshot)>, _phantom: PhantomData, } +#[derive(Clone)] +struct CachedSnapshot { + compressed_blob: Arc>, + raw_len: u64, +} + impl LedgerSyncRequestHandler where B: BlockT, @@ -133,17 +139,18 @@ where fn handle_request(&mut self, payload: &[u8]) -> Result, HandleError> { let req = LedgerSyncRequest::::decode(&mut &payload[..])?; - let blob = self.blob_for(req.target_hash)?; - Ok(build_response(&blob, req.offset, req.max_len).encode()) + let snapshot = self.blob_for(req.target_hash)?; + Ok(build_response(&snapshot.compressed_blob, snapshot.raw_len, req.offset, req.max_len) + .encode()) } - /// Return the serialized `Ledger`-rooted blob for `target`, building and memoizing it on a - /// cache miss. Rejects unknown or not-yet-finalized blocks. - fn blob_for(&mut self, target: B::Hash) -> Result>, HandleError> { - if let Some((cached, blob)) = &self.cache + /// Return the compressed serialized `Ledger`-rooted blob for `target`, building and memoizing it + /// on a cache miss. Rejects unknown or not-yet-finalized blocks. 
+ fn blob_for(&mut self, target: B::Hash) -> Result { + if let Some((cached, snapshot)) = &self.cache && *cached == target { - return Ok(blob.clone()); + return Ok(snapshot.clone()); } // Only serve finalized blocks whose state we hold: an unknown hash or a block beyond our @@ -159,15 +166,18 @@ where let blob = midnight_node_ledger::serialize_ledger_snapshot(self.unified, &state_key) .map_err(HandleError::Serialize)?; + let raw_len = blob.len() as u64; + let compressed_blob = compress_snapshot(&blob).map_err(HandleError::Compress)?; log::debug!( target: LOG_TARGET, - "Serialized ledger snapshot for {target:?}: {} bytes", - blob.len() + "Serialized ledger snapshot for {target:?}: {} bytes raw, {} bytes compressed", + raw_len, + compressed_blob.len() ); - let blob = Arc::new(blob); - self.cache = Some((target, blob.clone())); - Ok(blob) + let snapshot = CachedSnapshot { compressed_blob: Arc::new(compressed_blob), raw_len }; + self.cache = Some((target, snapshot.clone())); + Ok(snapshot) } } @@ -185,4 +195,6 @@ enum HandleError { NoStateKey, #[error("failed to serialize ledger snapshot: {0}")] Serialize(String), + #[error("failed to compress ledger snapshot: {0}")] + Compress(snap::Error), } From 13cbd06164db105049df04688cd33a453616ce44 Mon Sep 17 00:00:00 2001 From: Justin Frevert Date: Thu, 18 Jun 2026 18:26:53 -0700 Subject: [PATCH 13/13] compatibility table update for ledger 9 Signed-off-by: Justin Frevert --- ledger/src/lib.rs | 15 ++++++++++----- node/src/backend/custom_parity_db.rs | 4 ++-- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/ledger/src/lib.rs b/ledger/src/lib.rs index 291105424..a35b4ddc3 100644 --- a/ledger/src/lib.rs +++ b/ledger/src/lib.rs @@ -156,7 +156,8 @@ pub fn drop_all_default_storage() { /// Parse the `vNN` from a `ledger-state[vNN]` tag embedded in a tagged blob (a `StateKey` or a /// genesis_state). 
Used to dispatch warp serialize/import (and genesis-init) to the ledger module -/// whose `LedgerState` serialization matches: **v5 → `ledger_7`, v13 → `ledger_8`, v16 → `ledger_9`**. +/// whose `LedgerState` serialization matches: **v5 → `ledger_7`, v13 → `ledger_8`, +/// v16/v17 → `ledger_9`**. /// A warp-syncing node can target a chain governed by an *older* ledger version than this build's /// latest (e.g. a real devnet whose arena is still v13), so the version is read from the data, not /// assumed to be the tip's. @@ -200,8 +201,10 @@ macro_rules! bridge_arena_call { #[cfg(feature = "std")] pub fn serialize_ledger_snapshot(unified: bool, state_key: &[u8]) -> Result, String> { match ledger_state_tag_version(state_key) { - Some(16) => bridge_arena_call!(ledger_9, unified, serialize_ledger_snapshot(state_key)) - .map_err(|e| format!("{e:?}")), + Some(16 | 17) => { + bridge_arena_call!(ledger_9, unified, serialize_ledger_snapshot(state_key)) + .map_err(|e| format!("{e:?}")) + }, Some(13) => bridge_arena_call!(ledger_8, unified, serialize_ledger_snapshot(state_key)) .map_err(|e| format!("{e:?}")), Some(5) => bridge_arena_call!(ledger_7, unified, serialize_ledger_snapshot(state_key)) @@ -221,7 +224,9 @@ pub fn serialize_ledger_snapshot(unified: bool, state_key: &[u8]) -> Result bool { match ledger_state_tag_version(state_key) { - Some(16) => bridge_arena_call!(ledger_9, unified, get_ledger_state_root(state_key)).is_ok(), + Some(16 | 17) => { + bridge_arena_call!(ledger_9, unified, get_ledger_state_root(state_key)).is_ok() + }, Some(13) => bridge_arena_call!(ledger_8, unified, get_ledger_state_root(state_key)).is_ok(), Some(5) => bridge_arena_call!(ledger_7, unified, get_ledger_state_root(state_key)).is_ok(), _ => false, @@ -280,7 +285,7 @@ pub fn import_verified_ledger_snapshot( // Dispatch on the `StateKey`'s ledger-state version (the underlying method returns the shared // `SnapshotImportError` for every version, so no error mapping is needed). 
match ledger_state_tag_version(expected_state_key) { - Some(16) => { + Some(16 | 17) => { bridge_arena_call!( ledger_9, unified, diff --git a/node/src/backend/custom_parity_db.rs b/node/src/backend/custom_parity_db.rs index 08a628c1b..7bd82b25e 100644 --- a/node/src/backend/custom_parity_db.rs +++ b/node/src/backend/custom_parity_db.rs @@ -120,7 +120,7 @@ pub fn open>( let dir = &storage_config.db_path; let cache = storage_config.cache_size; match genesis_version { - Some(16) => { + Some(16 | 17) => { midnight_node_ledger::ledger_9::storage::init_storage_paritydb_separate( dir, genesis_state, @@ -148,7 +148,7 @@ pub fn open>( StorageSeparation::Unified => { let cache = storage_config.cache_size; match genesis_version { - Some(16) => { + Some(16 | 17) => { midnight_node_ledger::ledger_9::storage::init_storage_paritydb_unified::< _, NUM_COLUMNS_POLKADOT,