From ed4e8331dafac7161cc87417778dac1f5dedc704 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 26 Jun 2026 05:59:48 +0000 Subject: [PATCH 1/2] =?UTF-8?q?M8=20H1:=20owner-runnable=20power-pull=20ri?= =?UTF-8?q?g=20(=C2=A714.8=20H1=20/=20D1=20/=20#18)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Build the H1 power-pull automation — the last open M8 gate and the only one that fundamentally needs physical hardware (a real mains cut on storage that genuinely loses un-synced data, >=50 consecutive PASS with zero acked-LSN loss, D1). No src/ change: this is orchestration + CI + docs around the proven power_pull_workload/power_pull_verify bins, storage-check.sh, and evidence.sh. - scripts/m8/h1-cycle.sh: deploy/calibrate/cycle/run/config. The §3.4 vacuous-pass calibration GATE runs first (un-synced marker must be GONE after a real cut, else abort loudly — no cycle counts). Cycle loop drives the target over ssh, cuts via a pluggable smart-plug local API, restores, and verifies every acked LSN survived. INCONCLUSIVE never counts; a FAIL stops the run; verdict=PASS only when h2_probe proved loss AND fail==0. Emits the §5 ledger. - Smart-plug driver: shelly (Gen2/Gen3/Plus RPC — default, the Shelly Plug S Gen3; aliases shelly-gen2/gen3), shelly-gen1, tasmota. H1_PLUG_DRY_RUN for no-hardware dry runs. - .github/workflows/m8-h1.yml: workflow_dispatch-only, runs-on [self-hosted, h1-rig]; cross-compiles aarch64 bins, deploys, runs the calibration + cycle loop, uploads the §5 evidence artifact, posts to #18 (dispatch-gated sign-off). Loud-skips (OPEN, not green) if the rig is unwired or unreachable. - .github/actionlint.yaml: declare the h1-rig custom runner label. 
- docs: runbook H1 "Automated rig" + "Rig setup" (Pi 3 read-only overlay, dedicated DUT partition, BeagleBone-eMMC + USB-SSD media, controller wiring, smart-plug table); infra-plan §3.2/§3.4 corrected to storage-check.sh probe-write/probe-verify (no separate storage_probe binary); README + CLAUDE.md status. H1 stays OPEN-pending-owner-run: the owner triggers m8-h1.yml on the wired rig, observes >=50 PASS with the H2 probe proven + evidence on #18, and closes #18. The agent never self-certifies H1. Verified: shellcheck + actionlint clean, bash -n, plug-URL/config/evidence dry-runs, the abort path emits a valid ABORTED ledger, cargo build of the bins. Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01VW9DW3Lu7dVmargSY1gbZk --- .github/actionlint.yaml | 7 + .github/workflows/m8-h1.yml | 149 +++++++++++++++ CLAUDE.md | 3 +- docs/m8-infra-plan.md | 9 +- docs/m8-runbook.md | 78 +++++++- scripts/README.md | 14 +- scripts/m8/h1-cycle.sh | 365 ++++++++++++++++++++++++++++++++++++ 7 files changed, 617 insertions(+), 8 deletions(-) create mode 100644 .github/actionlint.yaml create mode 100644 .github/workflows/m8-h1.yml create mode 100644 scripts/m8/h1-cycle.sh diff --git a/.github/actionlint.yaml b/.github/actionlint.yaml new file mode 100644 index 0000000..29b2842 --- /dev/null +++ b/.github/actionlint.yaml @@ -0,0 +1,7 @@ +# actionlint config — declare custom self-hosted runner labels so the H1 power-pull +# gate (.github/workflows/m8-h1.yml, runs-on: [self-hosted, h1-rig]) lints clean. +# The owner's CONTROLLER laptop (never cut) registers a self-hosted runner with the +# "h1-rig" label; see docs/m8-runbook.md → controller setup. 
+self-hosted-runner: + labels: + - h1-rig diff --git a/.github/workflows/m8-h1.yml b/.github/workflows/m8-h1.yml new file mode 100644 index 0000000..720dfa8 --- /dev/null +++ b/.github/workflows/m8-h1.yml @@ -0,0 +1,149 @@ +name: M8 H1 power-pull (owner-run hardware gate) + +# §14.8 H1 — the ONLY true durability gate (D1): committed records survive a REAL +# mains power cut, ≥50 consecutive cycles with zero acked-LSN loss. It cannot be +# self-certified in a sandbox — it needs a wired rig with a cuttable target and a +# smart-plug mains interrupt. This workflow drives the owner's rig end-to-end and +# emits the §5 evidence ledger; the OWNER signs off and closes #18. +# +# HONESTY (M8 ground rules): +# * workflow_dispatch ONLY (human-initiated; a power-cut campaign is never automatic). +# * Runs on the owner's CONTROLLER laptop (self-hosted, never cut) — labelled +# [self-hosted, h1-rig]. The target (Raspberry Pi 3 / BeagleBone / USB-SSD DUT) is +# cut via the smart plug; the controller is not. +# * BEST-EFFORT + LOUD SKIP: if the rig isn't configured/reachable, the job emits a +# loud ::warning:: and exits 0: H1 stays OPEN-pending-owner-run — a green skip is NOT a pass. +# * The §3.4 calibration is the FIRST step of every campaign (inside h1-cycle.sh run): +# a vacuous DUT (un-synced marker survives the cut) ABORTS before any cycle counts. +# * A D1 FAIL or an aborted calibration REDS the build. INCONCLUSIVE never counts +# toward the 50 (h1-cycle.sh enforces all of this). +# * Evidence ledger (§5) uploaded every run; posted to #18 only on this manual +# dispatch (the human sign-off trail). 
+ +on: + workflow_dispatch: + inputs: + dut_medium: + description: "DUT medium under test (recorded in the evidence ledger)" + type: choice + options: + - microSD + - USB-SSD + - eMMC(BeagleBone) + default: microSD + plug_type: + description: "Smart-plug local API (shelly = Gen2/Gen3/Plus RPC)" + type: choice + options: + - shelly + - shelly-gen1 + - tasmota + default: shelly + cycles: + description: "Required CONSECUTIVE PASS cycles" + type: string + default: "50" + +permissions: + contents: read + issues: write + +env: + CARGO_TERM_COLOR: always + M8_EV: ${{ github.workspace }}/m8-evidence + # Rig config comes from repo Variables (Settings → Secrets and variables → Actions → + # Variables). Empty ⇒ the rig isn't wired ⇒ loud-skip. The runner host must have + # passwordless ssh (key auth) to H1_TARGET_SSH and reachability to the smart plug. + H1_TARGET_SSH: ${{ vars.H1_TARGET_SSH }} + H1_WAL_DIR: ${{ vars.H1_WAL_DIR }} + H1_CONTROLLER_IP: ${{ vars.H1_CONTROLLER_IP }} + H1_BIN_DIR: ${{ vars.H1_BIN_DIR }} + H1_PLUG_IP: ${{ vars.H1_PLUG_IP }} + H1_PLUG_ID: ${{ vars.H1_PLUG_ID }} + H1_PLUG_TYPE: ${{ inputs.plug_type }} + H1_DUT_MEDIUM: ${{ inputs.dut_medium }} + H1_CYCLES: ${{ inputs.cycles }} + WAL_M8_EVIDENCE: ${{ github.workspace }}/m8-evidence/evidence-h1.json + +jobs: + h1: + name: H1 power-pull (≥50 cycles, zero acked loss — D1) + runs-on: [self-hosted, h1-rig] + # 50 cycles × (~commit window + off + boot + verify) plus INCONCLUSIVE re-runs. + timeout-minutes: 240 + steps: + - uses: actions/checkout@v4 + - name: Install Rust toolchain (+ aarch64 target) + uses: dtolnay/rust-toolchain@stable + with: + targets: aarch64-unknown-linux-gnu + + # Rig availability: required config present AND the target reachable over ssh. + # Absent ⇒ loud OPEN skip (NOT a pass), mirroring the dm-flakey loud-skip. 
+ - name: H1 rig availability + id: rig + run: | + missing= + for v in H1_TARGET_SSH H1_WAL_DIR H1_CONTROLLER_IP H1_PLUG_IP; do + [ -n "${!v:-}" ] || missing="$missing $v" + done + if [ -n "$missing" ]; then + echo "::warning title=H1 rig not configured::missing repo Variables:$missing — H1 stays OPEN-pending-owner-run (NOT a pass). Set them under Actions → Variables and wire the rig (docs/m8-runbook.md)." + echo "available=false" >> "$GITHUB_OUTPUT"; exit 0 + fi + if ! ssh -o BatchMode=yes -o ConnectTimeout=10 -o StrictHostKeyChecking=accept-new "$H1_TARGET_SSH" true 2>/dev/null; then + echo "::warning title=H1 target unreachable::$H1_TARGET_SSH did not answer ssh — H1 stays OPEN (NOT a pass). Check wiring / passwordless key auth on the runner host." + echo "available=false" >> "$GITHUB_OUTPUT"; exit 0 + fi + echo "available=true" >> "$GITHUB_OUTPUT" + + - name: Cross-compile ARM workload/verify bins + if: steps.rig.outputs.available == 'true' + env: + CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_LINKER: aarch64-linux-gnu-gcc + run: | + if ! command -v aarch64-linux-gnu-gcc >/dev/null 2>&1; then + echo "::error title=cross linker missing::aarch64-linux-gnu-gcc not on the runner. Provision it (Debian/Ubuntu: 'sudo apt-get install -y gcc-aarch64-linux-gnu') or use 'cross'. See docs/m8-runbook.md → controller setup." + exit 1 + fi + cargo build --release --target aarch64-unknown-linux-gnu \ + --bin power_pull_workload --bin power_pull_verify + + - name: Deploy bins + storage-check.sh to the target + if: steps.rig.outputs.available == 'true' + run: scripts/m8/h1-cycle.sh deploy + + # The FULL campaign: §3.4 calibration GATE (abort if vacuous) → the ≥N-consecutive + # PASS cycle loop → emit the §5 ledger. h1-cycle.sh enforces every honesty rail. + # rc 0 = N consecutive PASS with H2 proven; anything else = vacuous calibration, + # a D1 FAIL, or an infra abort → red build (NOT a pass). 
+ - name: H1 calibration + cycle loop + if: steps.rig.outputs.available == 'true' + run: | + set +e; scripts/m8/h1-cycle.sh run; rc=$?; set -e + case "$rc" in + 0) echo "H1: ${H1_CYCLES} consecutive PASS with the §3.4 H2 probe proven (medium: ${H1_DUT_MEDIUM}).";; + *) echo "::error title=H1 FAIL/ABORT::h1-cycle.sh run exited ${rc} — read the evidence ledger: vacuous calibration (marker survived), a D1 FAIL (acked LSN lost — most likely a lying device, §3.6), or an infra abort. NOT a pass."; exit 1;; + esac + + - name: Upload evidence ledger + if: always() + uses: actions/upload-artifact@v4 + with: + name: m8-h1-evidence-${{ github.run_id }} + path: ${{ github.workspace }}/m8-evidence/** + if-no-files-found: ignore + + # SIGN-OFF TRAIL: post the §5 ledger to #18 on this manual dispatch. The agent + # never self-certifies H1 — this is the OWNER's evidence; the OWNER closes #18. + - name: Post evidence to #18 (dispatch sign-off only) + if: always() && github.event_name == 'workflow_dispatch' && steps.rig.outputs.available == 'true' + env: + GH_TOKEN: ${{ github.token }} + run: | + file="${{ github.workspace }}/m8-evidence/evidence-h1.json" + [ -e "$file" ] || { echo "no evidence at $file; skipping #18"; exit 0; } + { printf '**H1 power-pull** — automated M8 evidence (workflow_dispatch sign-off run #%s).\n\n' "$GITHUB_RUN_ID" + printf 'The agent never self-certifies H1; this is the OWNER sign-off trail. A PASS verdict is legitimate only when h2_probe is "PASS(marker gone)" and fail is 0. 
The OWNER reviews and closes #18.\n\n' + printf '```json\n'; cat "$file"; printf '\n```\n'; } > body.md + gh issue comment 18 --repo "$GITHUB_REPOSITORY" --body-file body.md diff --git a/CLAUDE.md b/CLAUDE.md index 86b40f2..d928113 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -109,7 +109,8 @@ The entire value of this component is **correct behavior under crashes and fault ## Project status (keep this updated) -- **LATEST (2026-06-25, PRs #20 + #21 off `main`): dm-flakey CI now RUNS, H3-physical PASSES, §14.4d is three-tier.** PR #20 (`claude/m8-dmflakey-ci-fixes`) fixes the hosted dm-flakey gate: provision `linux-modules-extra-$(uname -r)` + `modprobe dm_flakey` (dm-flakey **is** reachable on hosted Azure runners — no self-hosted runner needed), `cmd_check` queries `dmsetup targets` **as root**, and dm table reloads use `dmsetup suspend --noflush --nolockfs` in **both** `flakey_fault` and `flakey_up` (a default suspend's lockfs **freeze** is a full fs-sync that either EIO'd through the erroring target — misread as a §12 violation — or persisted the un-synced data before the drop, defeating the §14.4d controls). **Result: H3-physical ext4 PASSES** (source-confirmed block-layer EIO → §12 poison; evidence on issue #16). PR #21 (`claude/m8-dirfsync-tiers`, stacked on #20) resolves §14.4d per the designer: **the dir-fsync omission is NOT reproducible on ext4/xfs/btrfs** — those journaling FSes transitively persist a new file's dir entry on the segment's own `fsync` (AFSNCE OSDI '14, §18), masking it; `fsync_dir` is kept as a portable-durability safeguard. 
Three tiers: **Tier-1 (PRIMARY, per-PR, deterministic) = `scripts/m8/dirfsync-presence.sh`** straces the roll path, asserts correct issues the roll-time dir-`fsync` while `inject_no_dir_fsync` does not — **RUN+green here** (`correct=5` vs `inject=1`), wired into `ci.yml`; **Tier-2 = behavioral power-loss via a synchronized mid-run cut** (`src/bin/dirfsync_cut_workload.rs` rolls once, acks a record into the new segment, blocks with the dirent dirty; harness activates `drop_writes` *before* kill/umount, fsck, remount, verify) — **CLOSED as a DOCUMENTED NEGATIVE RESULT (PR #21, owner Fedora 43):** the inject build recovers fully on EVERY config tested — ext4/xfs/btrfs, journal-less ext4 (incl. `ext2`-format), and the last attempt, journaled ext4 `data=writeback` (the ext4 driver's weakest ordering; `data=writeback` weakens data ordering, not the metadata/dirent). The dirent reaches disk transitively via the file's own `fdatasync` everywhere. **Mechanism correction:** the earlier "ext2 block-adjacency" claim is RETRACTED — dmesg shows `ext2`-format is serviced by the **ext4 driver journal-less** on modern kernels (standalone ext2 driver removed in Linux 6.9); mechanism not isolated. No readily-available Linux FS exposes it behaviorally ⇒ honest negative result, not a gap. Tier-1 strace carries the DoD; `fsync_dir` retained as a POSIX-portability safeguard. (Note: `data=writeback` requires a journal — NOT combinable with `-O ^has_journal`.); **Tier-3 = ext4/xfs/btrfs INCONCLUSIVE-by-design** (informational, never red on a masked miss, still red on a correct-build data loss). dm-flakey harness also got `wipefs`/zero-before-mkfs + `udevadm settle` + `dmsetup remove --retry/-f --deferred` (fixes the back-to-back "device busy"). Docs corrected (design §14.4d note + §14.13 row, runbook three-tier, this block). `shellcheck`+`cargo fmt --check` clean; the strace gate is self-verified green. 
**§14.4d behavioral (Tier-2) is now CLOSED as a documented negative result** (Tier-1 satisfies the DoD). **Still owner/CI to observe:** H1 power-pull. +- **LATEST (2026-06-26, branch `claude/gifted-pasteur-gfr9th`): the M8 H1 power-pull RIG is BUILT (owner-runnable; the last open M8 gate).** H1 is the only M8 gate that fundamentally needs physical hardware (a real mains cut on storage that genuinely loses un-synced data, ≥50 consecutive PASS, zero acked-LSN loss = D1). This adds **no `src/` code** — only orchestration + CI + docs around the proven `power_pull_workload`/`power_pull_verify` bins, `storage-check.sh`, and `evidence.sh`. New **`scripts/m8/h1-cycle.sh`** (subcommands `deploy`/`calibrate`/`cycle`/`run`/`config`): cross-built ARM bins + `storage-check.sh` are `scp`'d to the target; the **§3.4 vacuous-pass calibration GATE runs FIRST** (writes an un-synced marker, cuts via the smart plug, asserts the marker is GONE — a *survived* marker ABORTS loudly, no cycle counts); then the cycle loop (fresh WAL → off-box collector → workload over ssh → **mains CUT** → restore → boot-wait → `power_pull_verify` over ssh) repeats until **N consecutive PASS**, with **INCONCLUSIVE never counted** (resets the streak), a **FAIL stopping the run**, and `verdict=PASS` emitted only when `h2_probe` proved loss AND `fail==0`. **Pluggable smart-plug driver** — `H1_PLUG_TYPE`: `shelly` (Gen2/Gen3/Plus RPC `/rpc/Switch.Set?id=&on=`, the **default; the owner's Shelly Plug S Gen3**, aliases `shelly-gen2`/`shelly-gen3`), `shelly-gen1` (`/relay/0?turn=`), `tasmota` (`/cm?cmnd=Power%20`); `H1_PLUG_DRY_RUN=1` for no-hardware dry runs. 
New **`.github/workflows/m8-h1.yml`** — `workflow_dispatch`-ONLY, `runs-on: [self-hosted, h1-rig]` (the owner's never-cut controller laptop), cross-compiles `aarch64-unknown-linux-gnu`, `deploy`→`run`, uploads the §5 evidence artifact, and posts the ledger to **#18** (dispatch-gated sign-off, matching the dm-flakey/macOS pattern); **loud-skips** (OPEN, not green) if the rig Variables are unset or the target is unreachable. Added **`.github/actionlint.yaml`** declaring the `h1-rig` custom runner label. Docs: runbook H1 "Automated rig" + "Rig setup (target/controller)" subsections (Pi 3 read-only overlay rootfs, dedicated DUT partition, BeagleBone-eMMC + USB-SSD media, controller wiring, smart-plug table); infra-plan §3.2/§3.4 corrected (the marker probe is `storage-check.sh probe-write/probe-verify`, **not** a `storage_probe` binary — honors the no-`src/`-change rule); `scripts/README.md` `h1-cycle.sh` row + owner-run examples. **Verified here:** `shellcheck scripts/m8/h1-cycle.sh` clean, `actionlint` clean on all workflows, `bash -n`, plug-URL + config + §5-evidence dry-runs, `cargo build` of the two bins. **CANNOT run here** (no cuttable target / smart plug): the actual cycles — the script + workflow print loud OPEN banners and never fake green. **H1 stays OPEN-pending-owner-run: the OWNER triggers `m8-h1.yml` on the wired rig, observes ≥50 PASS with the H2 probe proven + evidence on #18, and closes #18. 
The agent never self-certifies H1.** +- **(2026-06-25, PRs #20 + #21 off `main`): dm-flakey CI now RUNS, H3-physical PASSES, §14.4d is three-tier.** PR #20 (`claude/m8-dmflakey-ci-fixes`) fixes the hosted dm-flakey gate: provision `linux-modules-extra-$(uname -r)` + `modprobe dm_flakey` (dm-flakey **is** reachable on hosted Azure runners — no self-hosted runner needed), `cmd_check` queries `dmsetup targets` **as root**, and dm table reloads use `dmsetup suspend --noflush --nolockfs` in **both** `flakey_fault` and `flakey_up` (a default suspend's lockfs **freeze** is a full fs-sync that either EIO'd through the erroring target — misread as a §12 violation — or persisted the un-synced data before the drop, defeating the §14.4d controls). **Result: H3-physical ext4 PASSES** (source-confirmed block-layer EIO → §12 poison; evidence on issue #16). PR #21 (`claude/m8-dirfsync-tiers`, stacked on #20) resolves §14.4d per the designer: **the dir-fsync omission is NOT reproducible on ext4/xfs/btrfs** — those journaling FSes transitively persist a new file's dir entry on the segment's own `fsync` (AFSNCE OSDI '14, §18), masking it; `fsync_dir` is kept as a portable-durability safeguard. Three tiers: **Tier-1 (PRIMARY, per-PR, deterministic) = `scripts/m8/dirfsync-presence.sh`** straces the roll path, asserts correct issues the roll-time dir-`fsync` while `inject_no_dir_fsync` does not — **RUN+green here** (`correct=5` vs `inject=1`), wired into `ci.yml`; **Tier-2 = behavioral power-loss via a synchronized mid-run cut** (`src/bin/dirfsync_cut_workload.rs` rolls once, acks a record into the new segment, blocks with the dirent dirty; harness activates `drop_writes` *before* kill/umount, fsck, remount, verify) — **CLOSED as a DOCUMENTED NEGATIVE RESULT (PR #21, owner Fedora 43):** the inject build recovers fully on EVERY config tested — ext4/xfs/btrfs, journal-less ext4 (incl. 
`ext2`-format), and the last attempt, journaled ext4 `data=writeback` (the ext4 driver's weakest ordering; `data=writeback` weakens data ordering, not the metadata/dirent). The dirent reaches disk transitively via the file's own `fdatasync` everywhere. **Mechanism correction:** the earlier "ext2 block-adjacency" claim is RETRACTED — dmesg shows `ext2`-format is serviced by the **ext4 driver journal-less** on modern kernels (standalone ext2 driver removed in Linux 6.9); mechanism not isolated. No readily-available Linux FS exposes it behaviorally ⇒ honest negative result, not a gap. Tier-1 strace carries the DoD; `fsync_dir` retained as a POSIX-portability safeguard. (Note: `data=writeback` requires a journal — NOT combinable with `-O ^has_journal`.); **Tier-3 = ext4/xfs/btrfs INCONCLUSIVE-by-design** (informational, never red on a masked miss, still red on a correct-build data loss). dm-flakey harness also got `wipefs`/zero-before-mkfs + `udevadm settle` + `dmsetup remove --retry/-f --deferred` (fixes the back-to-back "device busy"). Docs corrected (design §14.4d note + §14.13 row, runbook three-tier, this block). `shellcheck`+`cargo fmt --check` clean; the strace gate is self-verified green. **§14.4d behavioral (Tier-2) is now CLOSED as a documented negative result** (Tier-1 satisfies the DoD). **Still owner/CI to observe:** H1 power-pull. 
- **Current milestone:** M8 (hardware/platform durability, §14.8 + the deferred §14.4d) — **harnesses + runbook BUILT; the runnable-here pieces are RUN+green; the physical gates are honestly OPEN-pending-owner-run, never self-certified from this sandbox.** **What RUNS green here:** **H2** the deny-by-default storage durability guard (`scripts/m8/storage-check.sh` — passes on the repo's ext4, FAILs on tmpfs; the vacuous-pass guard H1 depends on, rejecting tmpfs/overlay/unrecognized FS); and the **H3 §12 poison *state machine*** (`scripts/m8/fsync-fault.sh` + `tests/fsync_fault_gate.rs`, 3 tests) — an `LD_PRELOAD` shim (`tests/fault/eio_preload.c`) returns EIO from the commit's libc `fdatasync` and the gate asserts `FsyncFailed`, **no `durable_lsn` advance past the synced segment** (incl. the split-batch **rest-at-seg1-max** partial advance), and handle **poison** (subsequent ops `Poisoned`), with an **anti-vacuous guard** that the injection actually fired (the gate **fails loudly** if run without the shim — demonstrated). Shim interception was **empirically proven the ship/drop gate** (`strace`: 6 `fdatasync` all intercepted, 3 `fsync` — the rustix raw-syscall dir-fsync — none ⇒ the shim bounds to the data-sync poison path; the dir-fsync poison + §14.4d stay dm-flakey-only). 
**What is OPEN-pending-owner-run** (this sandbox's kernel has **no `CONFIG_BLK_DEV_DM`/`/lib/modules`/`/dev/mapper/control`**, no cuttable target, and is Linux not macOS): **H3 physical** (`scripts/m8/dm-flakey.sh h3` — `error_writes` → block-layer EIO → poison, workload exit 7); **§14.4d** (`dm-flakey.sh dirfsync-negative` — correct vs `--features inject_no_dir_fsync` across a `drop_writes` power loss; **now three-tier — see the LATEST bullet at the top: Tier-1 strace presence PASSES per-PR, Tier-2 certifies on ext2 not ext4, ext4/xfs/btrfs are INCONCLUSIVE-by-design**); **H1** power-pull (`src/bin/power_pull_{workload,verify}.rs` + `scripts/m8/power-pull.sh`, ≥50 cycles zero acked loss — off-box **network** side channel, **send-strictly-after-`commit() Ok`** ack-ordering, **contiguous-watermark** conservative verify, **H2-gated**; the full chain was **dry-run green on loopback** and the verifier's falsifiability shown: simulated acked-loss → FAIL/D1, side-channel gap → INCONCLUSIVE); (H4 was the macOS-tier item — **now CLOSED, see next sentence**). Every OPEN gate prints a loud "NOT EXERCISED"/OPEN banner (mirrors the LazyFS stopgap) — **no fake green.** **H4 macOS `F_FULLFSYNC` — VERIFIED & CLOSED (owner-run):** ran on owner macOS hardware (Mac mini, SIP-enabled, 2026-06-25) — `tests/macos_fullfsync.rs` smoke green **and** the `#[ignore]`d `dtruss -t fcntl` proof shows `F_FULLFSYNC` (cmd `0x33`) issued twice on the durable path, both succeeding. A matcher fix (commit `ba2b84d`) reads dtruss's **numeric** fcntl command, since SIP-enabled dtruss does not symbolize `F_FULLFSYNC` by name. Owner procedure in `docs/m8-runbook.md`. No `src/` contract change (harnesses + tests + bins + docs only; two `power_pull` bins added `WAL_SEGMENT_SIZE`/`WAL_MAX_RECORD_SIZE` env overrides for the §14.4d roll forcing). 
**M8 test-automation (Tier 1 + Tier 3):** added `scripts/m8/evidence.sh` (shared §5 evidence-ledger emitter), enhanced `dm-flakey.sh` with the amended anti-vacuous criteria (#16 PASS now ANDs WAL poison with a **source-confirmed block-layer EIO** scraped from `dmesg` in the injection window, + bounded retry; #17 bounded retry budget **plus a `drop_writes` positive control** — if drop_writes is inert the negative control is non-functional ⇒ exit 4 HARNESS, louder than a timing INCONCLUSIVE; INCONCLUSIVE≠PASS; verdict exit codes 0/1/2/3/4) + evidence emission (incl. `block_layer_eio_observed`, `dmesg_readable`, `drop_positive_control`), and added `.github/workflows/m8-dmflakey.yml` (**push-to-main** paths-filtered + nightly + dispatch; **H3-physical #16** + **§14.4d #17** on hosted ubuntu VMs that reach `dm-flakey` — ext4 hard, xfs/btrfs informational, best-effort + loud skip — a green loud-skip is **not** a passed gate) and `m8-macos.yml` (`macos-latest`; **H4 Half A #19** routing/smoke; **per-PR paths-filtered + push-to-main + dispatch**, since a macOS-only `F_FULLFSYNC`-routing regression is invisible to Linux PR CI). Both upload the §5 artifact every run and post to the tracking issue **only on `workflow_dispatch`** (human sign-off; nightly stays artifact-only + red-build). `m8.yml`'s contradictory "physical-availability" job was trimmed to a pointer. The genuine proof is each workflow's first green run; until observed the dm-flakey gates are **contingent** (and loud-skip if a runner lacks the target). `cargo test`, `cargo clippy --all-targets -D warnings`, `cargo fmt --check` green; `shellcheck` + `actionlint` clean on the new scripts/workflows. **Still NOT done:** **H1** power-pull (owner-run, the only gate still needing a cuttable target) + the §14.4d/H3-physical first-green CI observation; **H4 done** (verified on owner macOS; Half A now also in CI, Half B owner-run); fuzzing (F1–F4)/Miri/soak (M9). 
- **M7** (performance: criterion benches + regression gates + zero-alloc, §14.7) — **COMPLETE, with the regression-gate CI *enforcement* tracked OPEN-pending-controlled-runner (honest, not "done")**. New `benches/wal.rs`: four criterion groups over the public API against a **real `fdatasync`** (never mocked) — `throughput` (64 B/256 B/4 KiB/64 KiB, `Throughput::Bytes`), `commit_latency` (batch 1/8/64/512/4096; 8 MiB segment so even batch=4096 never rolls — pure group-commit amortization), `recovery` (`Wal::open` vs log size + segment count), `split_batch` (spanning vs not — quantifies the extra fsync). Fixtures built **outside** the measured closure (`iter_batched*`/`iter_custom` setup; fresh WAL per iter so the log can't grow into a roll). **Tail percentiles:** criterion reports only mean/median (no arbitrary percentiles), so `commit_latency` records per-iter timings into an `hdrhistogram` and emits **p50/p99/p999** itself, persisted to `target/perf/commit_latency_.json`. **Zero-alloc gate hardened** (`tests/zero_alloc.rs`): now *proves* the measured window did not roll (segment-file count + `durable_lsn` advance, both read outside the counted region — no segment accessor needed) and adds a `max_record_size` (256 B) payload variant; kept `SERIAL` + warm-up; 3 tests green. **Regression gate** `scripts/perf-gate.sh` (`baseline`/`compare`/`check`/`inspect`): throughput/**median-time** delta from criterion `estimates.json` (the **median** point estimate — not the outlier-sensitive mean), **p999** delta from the histogram JSON; thresholds **>10%** time / **>20%** p999 (tunable); needs `python3`. **Falsifiability (§14.0.3) demonstrated:** an 800 µs sleep injected into the timed `commit()` window tripped the gate (median-time +160%+ on small batches *and* a p999 breach, `check` exit 1), then reverted. 
**CI tiering (§14.11):** per-PR `ci.yml` gains `cargo bench --no-run` (benches can't bitrot) and already runs the zero-alloc gate via `cargo test` (both enforced); new `.github/workflows/bench.yml` runs the benches + gate **nightly/manual, informational** (`continue-on-error`, uploads `target/criterion`+`target/perf` artifacts) — same stopgap as the LazyFS gate; the >10%/>20% thresholds stay a real gate on a controlled/pinned-governor runner. **No `src/` change** (benches drive the public API; no steady-state alloc or perf bug found to flag). MSRV (1.85) re-verified with the new dev-deps (`criterion` default-features-off, `hdrhistogram`) via `cargo +1.85.0 check --all-targets --locked`. Docs: §14.7 M7 block, §14.11 split, §14.13 zero-alloc row, v6.1 changelog, `scripts/README.md` perf-gate section. `cargo test` / `cargo clippy --all-targets -D warnings` / `cargo fmt --check` all green. **Still NOT done (M8+):** §14.8 hardware durability incl. the §14.4d metadata-fault negative control (M8); fuzzing (F1–F4)/Miri/soak (M9); the perf-gate's enforced CI run on a controlled runner. - **M6 (stateful model/oracle harness, §14.3) — COMPLETE as an in-tree proptest harness; the §14.5 F4 cargo-fuzz variant is DEFERRED to M9** (like F1/F2). New `tests/model_oracle.rs` generates boundary-biased randomized op-scripts (`Op::{Append, Commit, Checkpoint(up_to), CrashAndRecover, Reopen}`; payloads at 0/1/8/`max_record_size`/random, tiny 256/165 config forcing rolls+splits, `u64::MAX` checkpoint clamping to exactly `durable_lsn`, `PROPTEST_CASES`-overridable per §14.11 — default 96, smoke-run at 4000 green) and drives them through the **proptest-free, F4-reusable** executor `tests/model/mod.rs::run(cfg, ops)`. 
The executor checks the §14.3 envelope as a **refinement relation** (NOT equality) against an **independent** in-memory oracle (`committed: BTreeMap>` + `staged` Vec + `oldest_lsn`/`durable_lsn`/`next_lsn`/`max_ckpt_up_to`; the committed map is the source of truth, checked *before* any resync, and only contract-permitted tail watermarks are adopted from the impl): after every reopen it asserts ⊇ committed (D1/D3), dense `oldest..=durable` reaching `durable_lsn` (D2), byte-identical records (D6), no unauthorized loss (a committed record below `oldest` must be `≤ max_ckpt_up_to` — D8/D3), `durable`/`oldest` monotonicity, D7 idempotence on a no-mutation reopen, and the §15.4 below-oldest fatal gap. The crash model is **state-machine only** (§14.0): `CrashAndRecover` drops the handle without committing (lose staged ⇒ D3); `Reopen` commits-then-drops (clean restart ⇒ D7); it does **not** model power loss/torn tails (that is §14.4b/c, already passing). Refinements applied from review: the live post-checkpoint probe uses `reader_from(0)` only (no authoritative live `oldest_lsn`) and a **terminal reopen** anchors the D8 over-delete check; empty-commit is an explicit no-op (`w == prior durable`); beyond-watermark records must preserve density (strict). **Falsifiability (§14.0.3) demonstrated:** a seeded recovery loss bug (`last_lsn = max_lsn-1` on recover) trips `D1/D3: recovered durable_lsn < committed watermark` and a seeded checkpoint over-delete (`deletable_prefix_len` off-by-one) trips `D8: oldest_lsn exceeds max_ckpt_up_to+1`, each shrinking to a minimal op-script; both reverted. No `src/` change (harness drives only the public API). Docs updated: §14.3 M6 note, §14.5/§14.13 F4 deferral, §14.12 D7/D8 rows, v6.1 changelog. `cargo test` (all suites incl. `model_oracle`, 7 LazyFS still `#[ignore]`), `cargo clippy --all-targets -D warnings`, `cargo fmt --check` all green. **Still NOT done (M7+):** §14.7 benches (M7); §14.8 hardware durability incl. 
the §14.4d metadata-fault negative control (M8); fuzzing (F1–F4)/Miri/soak (M9). diff --git a/docs/m8-infra-plan.md b/docs/m8-infra-plan.md index 90c332b..ce6204f 100644 --- a/docs/m8-infra-plan.md +++ b/docs/m8-infra-plan.md @@ -78,7 +78,7 @@ Why this split: the controller hosts the GitHub runner and must never be cut (a - **Dedicated writable WAL partition = the DUT.** The only thing exposed to power-cut writes is the WAL under test, on its own ext4 partition. Provide for **three DUT media**, each a different device class: (a) a partition on the **microSD** (Pi), (b) the **USB-SSD** (Pi), (c) the **onboard eMMC** of the **BeagleBone Black**. Run the gate against each; record which. (eMMC is soldered, managed NAND with its own controller/cache — the most production-realistic embedded medium and the one whose flush honesty is most likely to surprise you, so it broadens the device-honesty coverage the rig produces.) - For the BeagleBone target, boot its **rootfs from microSD with the read-only overlay** (same cut-corruption protection as the Pi) and put the **WAL on a dedicated eMMC partition** as the DUT — so the eMMC is the thing under test and the OS isn't the thing 50 cuts corrupt. The BBB is ARMv7/battery-less with a single 5 V input, so it's a valid smart-plug cut target; it is **not** a controller candidate (32-bit → no GitHub runner), only a DUT. - **DUT media are consumables.** The boards themselves shrug off power cycling (effectively unlimited at these counts). The flash media is the wear surface, and the binding risk is **not** write-endurance — 50 cycles write only single-digit GB, orders of magnitude under any card's lifetime — but **sudden FTL (flash-translation-layer) corruption on a mid-write cut**, which can brick a whole card. The per-cut probability is low and *device-quality-dependent* (a cheap no-name microSD is the most exposed; the read-only overlay already protects the boot/OS card regardless). 
So: keep **one or two spares of each DUT medium**, keep a **pre-imaged OS/boot card** so a brick is a 5-minute re-flash rather than a re-setup, and treat a **mid-campaign card death as a recordable device-honesty finding** (that card is empirically not honest hardware — exactly the verdict the gate exists to render), not a rig failure. Record it in the evidence artifact and continue on a spare. -- **Cross-compiled binaries**, built on the controller (`aarch64-unknown-linux-gnu`, MSRV 1.85, via `cross` or rustup target + linker) and `scp`'d to the Pi: `power_pull_workload`, `power_pull_verify`, and `storage_probe`. Do not build on the Pi (slow, and we want the runner to control versions). +- **Cross-compiled binaries**, built on the controller (`aarch64-unknown-linux-gnu`, MSRV 1.85, via `cross` or rustup target + linker) and `scp`'d to the Pi: `power_pull_workload` and `power_pull_verify`. The marker probe is the existing **`scripts/m8/storage-check.sh`** (`probe-write`/`probe-verify`, shell — no separate `storage_probe` binary needed), deployed alongside them. Do not build on the Pi (slow, and we want the runner to control versions). `scripts/m8/h1-cycle.sh deploy` performs this `scp`. - Passwordless ssh from controller → Pi (key auth) for unattended cycling. ### 3.3 Controller setup @@ -90,10 +90,11 @@ Why this split: the controller hosts the GitHub runner and must never be cut (a ### 3.4 Cut-mechanism calibration + H2 gate — THE FIRST MILESTONE (before any cycle counts) -Do **not** run the 50-cycle loop until this passes. On the exact DUT medium: -1. `storage_probe write-unsynced-marker ` — write a marker **without** `fdatasync` (it must sit in the page cache / device cache, not on stable flash). +Do **not** run the 50-cycle loop until this passes. On the exact DUT medium (this is the +`h1-cycle.sh calibrate` step, run automatically as the first step of every `run`): +1. 
`storage-check.sh probe-write <wal-dir>` — write a marker **without** `fdatasync` (it must sit in the page cache / device cache, not on stable flash). 2. Cut power via the smart plug; restore; wait for boot. -3. `storage_probe verify-marker-gone ` — the marker **MUST be absent**. +3. `storage-check.sh probe-verify <wal-dir>` — the marker **MUST be absent** (exit 0 = gone; exit 1 = survived). - **Gone ⇒ the cut is real** (un-synced data is genuinely lost) ⇒ proceed. - **Survives ⇒ vacuous** (storage didn't lose un-synced data — e.g. mounted `sync`, or the probe accidentally flushed) ⇒ **abort, fail loudly**, do not run cycles. Investigate the mount / probe before continuing. diff --git a/docs/m8-runbook.md b/docs/m8-runbook.md index 481efdd..8c46c2e 100644 --- a/docs/m8-runbook.md +++ b/docs/m8-runbook.md @@ -275,6 +275,80 @@ Each cycle (≥50×): **PASS the gate** only after **≥50 consecutive cycles with zero FAIL.** Record the device, cache mode, and cut mechanism. +### Automated rig (`h1-cycle.sh` + `m8-h1.yml`) + +The manual loop above is fully automated by **`scripts/m8/h1-cycle.sh`** (orchestration +on the controller) and **`.github/workflows/m8-h1.yml`** (`workflow_dispatch`-only, on a +`[self-hosted, h1-rig]` runner). The script reuses the same binaries/scripts — it adds no +`src/` code — and bakes in every honesty rail: the **§3.4 calibration gate runs first**, +INCONCLUSIVE never counts, a FAIL stops the run, and `verdict=PASS` is emitted only when the +H2 probe proved loss **and** `fail==0`. 
+ +```bash +scripts/m8/h1-cycle.sh config # print the resolved config (touches no hardware) +scripts/m8/h1-cycle.sh deploy # scp the aarch64 bins + storage-check.sh to the target +scripts/m8/h1-cycle.sh calibrate # §3.4 vacuous-pass GATE (real cut; marker MUST be gone) +scripts/m8/h1-cycle.sh cycle # the ≥N-consecutive-PASS loop +scripts/m8/h1-cycle.sh run # (default) calibrate → cycle → emit §5 evidence +``` + +Config is via env (see `config` for the full list): `H1_TARGET_SSH`, `H1_WAL_DIR` (the DUT +partition), `H1_DUT_MEDIUM`, `H1_CONTROLLER_IP`, `H1_PORT` (9099), `H1_PLUG_TYPE`/`H1_PLUG_IP`, +`H1_CYCLES` (50), `H1_WORKLOAD_SECS`, `H1_OFF_SECS`, `H1_BOOT_TIMEOUT`, `H1_INFRA_FAIL_MAX`. +`H1_PLUG_DRY_RUN=1` echoes the cut URL instead of curling it (for a no-hardware dry run). + +**Smart-plug driver (pluggable).** The cut is a real mains interrupt via a plug with a +**local** HTTP API (cloud-only plugs are unusable): + +| `H1_PLUG_TYPE` | OFF / ON request | +|---|---| +| `shelly` (Gen2/Gen3/Plus RPC — **default; the Shelly Plug S Gen3**; aliases `shelly-gen2`/`shelly-gen3`) | `GET /rpc/Switch.Set?id=<H1_PLUG_ID>&on=false` / `on=true` | +| `shelly-gen1` (legacy) | `GET /relay/0?turn=off` / `turn=on` | +| `tasmota` | `GET /cm?cmnd=Power%20Off` / `Power%20On` | + +A toggle that silently no-ops is caught downstream: after a cut the target **must** go away +and come back, so the post-cut boot-wait fails the cycle (INCONCLUSIVE) rather than passing +vacuously. + +**CI usage.** Trigger **`m8-h1.yml`** manually (`workflow_dispatch`); it cross-compiles the +ARM bins, `deploy`s, runs the calibration + cycle loop, uploads the §5 evidence artifact, and +posts the ledger to **#18**. Rig config comes from repo **Variables** (`H1_TARGET_SSH`, +`H1_WAL_DIR`, `H1_CONTROLLER_IP`, `H1_PLUG_IP`, …); if they're unset or the target is +unreachable the job **loud-skips** (OPEN, not a pass). 
Green = the configured `cycles` +consecutive PASS with the H2 probe proven; red = a D1 FAIL or an aborted (vacuous) calibration. +**The agent never self-certifies H1 — the OWNER reviews the evidence and closes #18.** + +### Rig setup (target + controller) + +**Target — the thing that gets cut** (Raspberry Pi 3 by default; BeagleBone Black and a +USB-SSD are the other two DUT media): +- 64-bit Pi OS, **wired Ethernet** (Wi-Fi reassociation adds side-channel/ssh latency). +- **Read-only rootfs (overlayfs)** via `raspi-config` → Performance → Overlay FS. Essential: + repeatedly power-cutting a *writable* rootfs corrupts the OS within a handful of cycles; + the overlay lets the OS survive indefinitely. +- A **dedicated writable ext4 partition** is the DUT (`H1_WAL_DIR` lives here) — the only + thing exposed to the cut's writes. Pass the **H2 empirical probe** on it once (the + `calibrate` step does this automatically each run). +- For the **BeagleBone** target: boot its rootfs from a microSD with the same read-only + overlay and put the WAL on a **dedicated eMMC partition** (the eMMC is the DUT, not the OS). +- For the **USB-SSD** target: a cheap USB 3.0 SSD on the Pi; note in the evidence that + consumer USB-SSDs without PLP may lie about flushes (that's a §3.6 *finding*, not a WAL bug). +- Passwordless ssh (key auth) from the controller. DUT media are consumables — keep spares + and a pre-imaged boot card (a mid-write cut can brick a card; treat that as a device-honesty + finding). + +**Controller — the laptop that is NEVER cut:** +- Hosts the GitHub **self-hosted runner** labelled `h1-rig`, the **off-box collector** + (`h1-cycle.sh` starts socat/ncat/nc on `H1_PORT`), and the **smart-plug driver**. +- Cross-compiles the ARM bins: `rustup target add aarch64-unknown-linux-gnu` + a linker + (`gcc-aarch64-linux-gnu`, env `CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_LINKER=aarch64-linux-gnu-gcc`), + or use `cross`. 
Do **not** build on the Pi (slow; the runner must control versions). + +**Power wiring.** A small **power strip** feeds dedicated 5 V supplies for each board (Pi 3: +5.1 V/3 A micro-USB; BeagleBone: 5 V/2 A barrel, center-positive); plug the strip into the +**smart plug**. One toggle cuts both boards. **Cut at mains, never the laptop** — the laptop's +battery means a "cut" isn't one. + ### Cut mechanisms and their fidelity | Mechanism | Fidelity | Notes | @@ -285,7 +359,9 @@ device, cache mode, and cut mechanism. | `reboot` / `shutdown` | **NOT VALID** | graceful; flushes caches. | **Status: OPEN-pending-owner-run** on real (or properly cache-configured virtual) -hardware. +hardware. The automated rig (`scripts/m8/h1-cycle.sh` + `.github/workflows/m8-h1.yml`) +is **built and lint-clean**; it stays OPEN until the owner triggers it on a wired rig, +observes ≥50 PASS with the H2 probe proven, and closes #18. The agent never self-certifies H1. --- diff --git a/scripts/README.md b/scripts/README.md index 75a62b1..6ed5c30 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -148,7 +148,8 @@ fakes green — the owner-run gates print loud "NOT EXERCISED"/OPEN banners. 
| `m8/storage-check.sh` | **H2** vacuous-pass guard (deny-by-default FS/cache classification + empirical loss probe) | static part **yes** | | `m8/fsync-fault.sh` | **H3 §12 poison state machine** (LD_PRELOAD EIO shim) | **yes — green** | | `m8/dm-flakey.sh` | **H3 physical** + **§14.4d** dir-fsync negative control | **nightly CI** (hosted ubuntu VMs reach dm-flakey; `m8-dmflakey.yml`); the build sandbox lacked it | -| `m8/power-pull.sh` | **H1** power-pull (≥50 cycles, zero acked loss) | no (needs a cuttable target) | +| `m8/power-pull.sh` | **H1** power-pull primitives (workload/receiver/verify; manual cut) | no (needs a cuttable target) | +| `m8/h1-cycle.sh` | **H1** power-pull AUTOMATION (deploy → §3.4 calibrate → ≥50-PASS cycle loop → §5 evidence; pluggable smart-plug cut) | no (needs a wired rig + smart plug) | | `m8/evidence.sh` | shared **§5 evidence-ledger** JSON emitter (reused by the gates above) | n/a (helper) | ### Runs here (CI-safe) @@ -173,9 +174,18 @@ rustix directory fsync needs the block-layer gate). scripts/m8/dm-flakey.sh check # detect device-mapper; loud OPEN banner if absent sudo scripts/m8/dm-flakey.sh h3 ext4 # physical fsync-failure → poison sudo scripts/m8/dm-flakey.sh dirfsync-negative ext4 # §14.4d (certify on ext4; FS-/timing-sensitive) -scripts/m8/power-pull.sh cycle # prints the ≥50-cycle power-pull procedure +scripts/m8/power-pull.sh cycle # prints the ≥50-cycle power-pull procedure (manual cut) +scripts/m8/h1-cycle.sh config # H1 automation: show resolved rig config (no hardware) +scripts/m8/h1-cycle.sh run # H1 automation: calibrate + ≥50-PASS loop + §5 evidence ``` +The H1 automation (`h1-cycle.sh`, driven in CI by `.github/workflows/m8-h1.yml`, +`workflow_dispatch`-only on a `[self-hosted, h1-rig]` runner) cuts via a pluggable +smart-plug local API (`H1_PLUG_TYPE`: `shelly` = Gen2/Gen3/Plus RPC, `shelly-gen1`, +`tasmota`) and runs the §3.4 calibration vacuous-pass gate first. 
**H1 is owner-run and +never self-certified** — see `docs/m8-runbook.md` → "H1 — power-pull" for the rig setup, +config vars, and the #18 sign-off flow. + See `docs/m8-runbook.md` for cut mechanisms (and why `sysrq-b`/`reboot` are **not** valid cuts), the network side-channel topology, the FS matrix, and the §14.4d filesystem-dependence caveat. diff --git a/scripts/m8/h1-cycle.sh b/scripts/m8/h1-cycle.sh new file mode 100644 index 0000000..9c8608a --- /dev/null +++ b/scripts/m8/h1-cycle.sh @@ -0,0 +1,365 @@ +#!/usr/bin/env bash +# +# h1-cycle.sh — M8 / §14.8 H1 power-pull AUTOMATION (OWNER-RUN on a wired rig). +# +# H1 is the only TRUE durability test and CANNOT be self-certified in a sandbox: +# it needs a genuine HARD power cut (mains interrupt) on storage that actually +# loses un-synced data, ≥50 consecutive cycles with zero acked-LSN loss (D1). +# This orchestrator drives that loop end-to-end on the OWNER's physical rig; it +# reuses the proven binaries/scripts and adds NO src/ code. +# +# Topology (docs/m8-infra-plan.md §3.1, docs/m8-runbook.md): +# [CONTROLLER laptop — NEVER cut] this script + collector + smart-plug driver +# | ssh / scp (wired Ethernet) ^ TCP seq,watermark (durable off-box) +# v | +# [TARGET — gets cut] power_pull_workload on the DUT medium (microSD/USB-SSD/eMMC) +# | (mains) -> power strip -> Pi/BBB PSUs +# [SMART PLUG local HTTP API] <-- this script toggles to CUT/RESTORE +# +# The cut is a REAL mains interrupt via the smart plug. sysrq-b / reboot / shutdown +# are warm/graceful and DO NOT model power loss — they are NOT valid cuts. +# +# Subcommands: +# deploy cross-built ARM bins + storage-check.sh -> target ($H1_BIN_DIR) +# calibrate §3.4 vacuous-pass GATE: prove the DUT loses un-synced data across a +# REAL cut (un-synced marker must be GONE). Vacuous => abort, no cycles. +# cycle the ≥50-consecutive-PASS loop (§3.5). FAIL stops the run. +# run (default) config-check -> calibrate -> cycle -> emit §5 evidence. 
+# config print the resolved config and exit (no hardware touched). +# +# Honesty rails (M8 ground rules): the calibration GATE runs first and aborts loudly +# if the DUT didn't lose un-synced data; INCONCLUSIVE never counts toward 50; a FAIL +# stops the run; verdict=PASS is emitted ONLY when the H2 probe proved loss AND +# fail==0 AND zero counted INCONCLUSIVE. Nothing here fakes green. +set -euo pipefail + +REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" + +# --- Config (env, with defaults) -------------------------------------------- +# Target / DUT +H1_TARGET_SSH="${H1_TARGET_SSH:-}" # e.g. pi@10.0.0.3 (passwordless key auth) +H1_WAL_DIR="${H1_WAL_DIR:-}" # DUT WAL dir on the target (own ext4 partition) +H1_DUT_MEDIUM="${H1_DUT_MEDIUM:-unspecified}" # microSD | USB-SSD | eMMC(BeagleBone) +H1_BIN_DIR="${H1_BIN_DIR:-/home/${H1_TARGET_USER:-pi}/m8}" # where deploy puts bins on the target +H1_LOCAL_BIN_DIR="${H1_LOCAL_BIN_DIR:-${REPO_ROOT}/target/aarch64-unknown-linux-gnu/release}" +# Side channel +H1_CONTROLLER_IP="${H1_CONTROLLER_IP:-}" # IP the target streams seq,watermark to +H1_PORT="${H1_PORT:-9099}" +# Smart plug (pluggable). shelly = Gen2/Gen3/Plus RPC (the owner's Shelly Plug S Gen3). 
+H1_PLUG_TYPE="${H1_PLUG_TYPE:-shelly}" # shelly | shelly-gen2 | shelly-gen3 | shelly-gen1 | tasmota +H1_PLUG_IP="${H1_PLUG_IP:-}" +H1_PLUG_ID="${H1_PLUG_ID:-0}" # Shelly switch id +H1_PLUG_DRY_RUN="${H1_PLUG_DRY_RUN:-0}" # 1 = echo the URL instead of curling (no hardware) +# Loop tuning +H1_CYCLES="${H1_CYCLES:-50}" # required CONSECUTIVE PASS +H1_WORKLOAD_SECS="${H1_WORKLOAD_SECS:-5}" # commit window before the cut +H1_OFF_SECS="${H1_OFF_SECS:-4}" # power-off duration +H1_BOOT_TIMEOUT="${H1_BOOT_TIMEOUT:-90}" # seconds to wait for ssh after restore +H1_INFRA_FAIL_MAX="${H1_INFRA_FAIL_MAX:-5}" # consecutive infra failures => abort +# Evidence +WAL_M8_EVIDENCE="${WAL_M8_EVIDENCE:-${REPO_ROOT}/m8-evidence/evidence-h1.json}" + +log() { printf '\033[1;34m[m8/h1]\033[0m %s\n' "$*" >&2; } +warn() { printf '\033[1;33m[m8/h1] WARN:\033[0m %s\n' "$*" >&2; } +pass() { printf '\033[1;32m[m8/h1] PASS:\033[0m %s\n' "$*" >&2; } +die() { printf '\033[1;31m[m8/h1] ERROR:\033[0m %s\n' "$*" >&2; exit 1; } + +# Loud OPEN banner — H1 is owner-run; this never fakes green. +banner_open() { + printf '\033[1;31m' >&2 + cat >&2 <&2 +} + +require() { [ -n "${!1:-}" ] || die "config $1 is required (see 'h1-cycle.sh config')"; } + +ssh_target() { + ssh -o BatchMode=yes -o ConnectTimeout=10 -o StrictHostKeyChecking=accept-new \ + "$H1_TARGET_SSH" "$@" +} + +# --- Smart-plug driver (pluggable by endpoint) ------------------------------ +# shelly (Gen2/Gen3/Plus) share the RPC API; gen1 is the legacy /relay/0; tasmota +# is /cm?cmnd=Power. A switch toggle that silently no-ops is caught downstream by +# the post-cut boot-wait (the target must actually go away and come back). 
+plug_url() { # plug_url on|off + local state="$1" + case "$H1_PLUG_TYPE" in + shelly|shelly-gen2|shelly-gen3|shelly-plus) + local on=false; [ "$state" = on ] && on=true + printf 'http://%s/rpc/Switch.Set?id=%s&on=%s' "$H1_PLUG_IP" "$H1_PLUG_ID" "$on" ;; + shelly-gen1) + local turn=off; [ "$state" = on ] && turn=on + printf 'http://%s/relay/0?turn=%s' "$H1_PLUG_IP" "$turn" ;; + tasmota) + local p=Off; [ "$state" = on ] && p=On + printf 'http://%s/cm?cmnd=Power%%20%s' "$H1_PLUG_IP" "$p" ;; + *) + die "unknown H1_PLUG_TYPE='$H1_PLUG_TYPE' (shelly|shelly-gen1|tasmota)" ;; + esac +} + +plug_set() { # plug_set on|off + local state="$1" url + url="$(plug_url "$state")" + if [ "$H1_PLUG_DRY_RUN" = 1 ]; then + log "DRY-RUN plug $state: GET $url" + return 0 + fi + require H1_PLUG_IP + # -fsS: fail on HTTP error, silent, show errors. A failed toggle is fatal — a + # missed cut would make the cycle vacuous. + curl -fsS --max-time 10 "$url" >/dev/null \ + || die "smart-plug $state failed (GET $url) — cannot trust the cut. Aborting." + log "plug $state ($H1_PLUG_TYPE @ ${H1_PLUG_IP:-dry})" +} + +plug_off() { plug_set off; } +plug_on() { plug_set on; } + +# Wait for the target's ssh to come back after a restore. Returns 0 if reachable +# within H1_BOOT_TIMEOUT, 1 otherwise (caller treats as INCONCLUSIVE/infra). +wait_ssh() { + local deadline=$(( SECONDS + H1_BOOT_TIMEOUT )) + log "waiting for target ssh (timeout ${H1_BOOT_TIMEOUT}s)…" + while [ "$SECONDS" -lt "$deadline" ]; do + if ssh_target true 2>/dev/null; then + log "target is up." + return 0 + fi + sleep 3 + done + warn "target did not return within ${H1_BOOT_TIMEOUT}s." 
+ return 1 +} + +# --- Collector (off-box side channel; runs on the CONTROLLER) --------------- +COLLECTOR_PID="" +start_collector() { # start_collector + local cap="$1" + : > "$cap" # fresh per-cycle capture + if command -v socat >/dev/null 2>&1; then + socat -u "TCP-LISTEN:${H1_PORT},reuseaddr,fork" "OPEN:${cap},creat,append" & + elif command -v ncat >/dev/null 2>&1; then + ncat -lk "$H1_PORT" >> "$cap" & + elif command -v nc >/dev/null 2>&1; then + nc -lk "$H1_PORT" >> "$cap" & + else + die "no socat/ncat/nc on the controller — install one for the off-box side channel." + fi + COLLECTOR_PID=$! + log "collector listening on tcp/:${H1_PORT} -> $cap (pid $COLLECTOR_PID)" +} +stop_collector() { + [ -n "$COLLECTOR_PID" ] || return 0 + kill "$COLLECTOR_PID" 2>/dev/null || true + wait "$COLLECTOR_PID" 2>/dev/null || true + COLLECTOR_PID="" +} + +# --- deploy ----------------------------------------------------------------- +cmd_deploy() { + require H1_TARGET_SSH + local wbin="${H1_LOCAL_BIN_DIR}/power_pull_workload" + local vbin="${H1_LOCAL_BIN_DIR}/power_pull_verify" + if [ ! -x "$wbin" ] || [ ! -x "$vbin" ]; then + die "cross-built bins not found in $H1_LOCAL_BIN_DIR (build for aarch64 first; see the runbook)." + fi + log "deploying bins + storage-check.sh to ${H1_TARGET_SSH}:${H1_BIN_DIR}" + ssh_target "mkdir -p '$H1_BIN_DIR'" + scp -q "$wbin" "$vbin" "${REPO_ROOT}/scripts/m8/storage-check.sh" \ + "${H1_TARGET_SSH}:${H1_BIN_DIR}/" + ssh_target "chmod +x '$H1_BIN_DIR/power_pull_workload' '$H1_BIN_DIR/power_pull_verify' '$H1_BIN_DIR/storage-check.sh'" + pass "deployed to ${H1_BIN_DIR}" +} + +# --- calibration: the §3.4 vacuous-pass GATE -------------------------------- +# Prove the DUT medium genuinely loses un-synced data across a REAL cut, BEFORE any +# cycle counts. Marker survives => storage didn't lose it => abort (vacuous H1). +# Sets H2_PROBE (PASS(marker gone) | FAIL(survived)) for the evidence ledger. 
+H2_PROBE="not-run" +cmd_calibrate() { + require H1_TARGET_SSH; require H1_WAL_DIR + log "§3.4 calibration: static H2 classification of the DUT…" + ssh_target "'$H1_BIN_DIR/storage-check.sh' classify '$H1_WAL_DIR'" \ + || die "H2 static guard FAILED — the DUT is not a recognised durable block FS. Refusing a vacuous H1." + + log "§3.4 calibration: writing an UN-SYNCED marker, then a REAL cut…" + ssh_target "mkdir -p '$H1_WAL_DIR' && '$H1_BIN_DIR/storage-check.sh' probe-write '$H1_WAL_DIR'" + # Give the marker no chance to be flushed by an unrelated sync, then cut hard. + plug_off + sleep "$H1_OFF_SECS" + plug_on + if ! wait_ssh; then + H2_PROBE="FAIL(target did not return after calibration cut)" + die "calibration cut: target did not come back — fix the rig before running cycles." + fi + # Marker MUST be gone. probe-verify exits 0 (gone => real cut) / 1 (survived => vacuous). + if ssh_target "'$H1_BIN_DIR/storage-check.sh' probe-verify '$H1_WAL_DIR'"; then + H2_PROBE="PASS(marker gone)" + pass "§3.4 calibration PASSED — the DUT genuinely loses un-synced data. Cycles are meaningful." + else + H2_PROBE="FAIL(survived)" + die "§3.4 calibration FAILED — the un-synced marker SURVIVED the cut. Storage did NOT lose un-synced data ⇒ any H1 here is VACUOUS. Do NOT run cycles. (Check mount opts / cut fidelity.)" + fi +} + +# --- the cycle loop (§3.5) -------------------------------------------------- +# Globals populated for the evidence ledger. +CYCLES_PASS=0 +FAIL_COUNT=0 +INCONCLUSIVE_RERUN=0 +PER_CYCLE="" # JSON array body, e.g. "0,0,2,0" +append_per_cycle() { PER_CYCLE="${PER_CYCLE:+$PER_CYCLE,}$1"; } + +# Run ONE trial. Echoes nothing; returns: 0 PASS, 1 FAIL, 2 INCONCLUSIVE/infra. +run_one_cycle() { + local cap="$1" + # 1. target up + wait_ssh || return 2 + # 2. fresh WAL dir (independent durability trial; LSN space from 1) + ssh_target "rm -rf '$H1_WAL_DIR' && mkdir -p '$H1_WAL_DIR'" || return 2 + # 3. fresh collector + start_collector "$cap" + # 4. 
launch the committing workload (network sink; unbounded; batch 64; 64B payload) + if ! ssh_target "nohup '$H1_BIN_DIR/power_pull_workload' '$H1_WAL_DIR' 'tcp:${H1_CONTROLLER_IP}:${H1_PORT}' 0 64 64 >/dev/null 2>&1 & echo started"; then + stop_collector; return 2 + fi + sleep "$H1_WORKLOAD_SECS" # let it commit + stream thousands of acked lines + # 5. CUT (real mains interrupt) — the workload dies with the board + plug_off + # 6. wait, then RESTORE + sleep "$H1_OFF_SECS" + plug_on + stop_collector + # 7. wait for boot + wait_ssh || return 2 + # 8. ship the off-box capture TO the target and verify against the recovered WAL + local remote_cap="/tmp/h1_capture_$$.txt" + scp -q "$cap" "${H1_TARGET_SSH}:${remote_cap}" || return 2 + local rc=0 + ssh_target "'$H1_BIN_DIR/power_pull_verify' '$H1_WAL_DIR' '$remote_cap'" || rc=$? + ssh_target "rm -f '$remote_cap'" 2>/dev/null || true + return "$rc" +} + +cmd_cycle() { + require H1_TARGET_SSH; require H1_WAL_DIR; require H1_CONTROLLER_IP + local capdir; capdir="$(mktemp -d)" + local infra_fail=0 n=0 + log "starting H1 cycle loop — need ${H1_CYCLES} CONSECUTIVE PASS (medium: ${H1_DUT_MEDIUM})." + while [ "$CYCLES_PASS" -lt "$H1_CYCLES" ]; do + n=$(( n + 1 )) + local cap="${capdir}/capture_${n}.txt" + log "cycle attempt #${n} (consecutive PASS so far: ${CYCLES_PASS}/${H1_CYCLES})" + local rc=0 + run_one_cycle "$cap" || rc=$? + case "$rc" in + 0) + CYCLES_PASS=$(( CYCLES_PASS + 1 )); infra_fail=0 + append_per_cycle 0 + pass "cycle #${n}: PASS (${CYCLES_PASS}/${H1_CYCLES})" ;; + 1) + FAIL_COUNT=$(( FAIL_COUNT + 1 )); infra_fail=0 + append_per_cycle 1 + die "cycle #${n}: FAIL — an ACKED LSN was absent after the cut (D1 violation). STOPPING the run. Investigate per §3.6 (most likely a lying device on medium '${H1_DUT_MEDIUM}'; the evidence records which LSN and medium)." ;; + 2) + # INCONCLUSIVE / infra — never counts toward 50; reset the consecutive streak. 
+ INCONCLUSIVE_RERUN=$(( INCONCLUSIVE_RERUN + 1 )) + append_per_cycle 2 + CYCLES_PASS=0 + infra_fail=$(( infra_fail + 1 )) + warn "cycle #${n}: INCONCLUSIVE/infra (side-channel gap or target didn't return) — not counted; consecutive streak reset." + [ "$infra_fail" -lt "$H1_INFRA_FAIL_MAX" ] \ + || die "${H1_INFRA_FAIL_MAX} consecutive infra failures — aborting (likely SD/OS corruption; re-flash, check the read-only overlay & wiring)." ;; + *) + die "cycle #${n}: power_pull_verify exited ${rc} (unexpected) — aborting." ;; + esac + done + rm -rf "$capdir" + pass "H1 cycle loop COMPLETE: ${CYCLES_PASS} consecutive PASS, ${FAIL_COUNT} FAIL, ${INCONCLUSIVE_RERUN} INCONCLUSIVE re-runs." +} + +# --- evidence (§5) ---------------------------------------------------------- +emit_evidence() { + local verdict="$1" + mkdir -p "$(dirname "$WAL_M8_EVIDENCE")" + # Gather target identity (best-effort; never fatal here). + local uname kernel host fstype src + uname="$(ssh_target 'uname -sr' 2>/dev/null || echo unknown)" + kernel="$(ssh_target 'uname -r' 2>/dev/null || echo unknown)" + host="$(ssh_target 'hostname' 2>/dev/null || echo unknown)" + fstype="$(ssh_target "df --output=fstype '$H1_WAL_DIR' 2>/dev/null | tail -1 | tr -d ' '" 2>/dev/null || echo unknown)" + src="$(ssh_target "df --output=source '$H1_WAL_DIR' 2>/dev/null | tail -1 | tr -d ' '" 2>/dev/null || echo unknown)" + WAL_M8_EVIDENCE="$WAL_M8_EVIDENCE" "${REPO_ROOT}/scripts/m8/evidence.sh" emit \ + gate=H1 \ + "target.uname=${uname}" "target.kernel=${kernel}" "target.host=${host}" \ + "storage.fs=${fstype}" "storage.block_device=${src}" \ + "storage.dut_medium=${H1_DUT_MEDIUM}" "storage.h2_probe=${H2_PROBE}" \ + "cut.mechanism=smart-plug mains interrupt (${H1_PLUG_TYPE}@${H1_PLUG_IP:-n/a})" \ + cut.valid=true \ + "run.cycles_required=${H1_CYCLES}" "run.cycles_pass=${CYCLES_PASS}" \ + "run.fail=${FAIL_COUNT}" "run.inconclusive_rerun=${INCONCLUSIVE_RERUN}" \ + "run.per_cycle=@[${PER_CYCLE}]" \ + "verdict=${verdict}" + 
log "evidence written: $WAL_M8_EVIDENCE" +} + +# --- config dump ------------------------------------------------------------ +cmd_config() { + cat >&2 <} + DUT WAL dir H1_WAL_DIR = ${H1_WAL_DIR:-} + DUT medium H1_DUT_MEDIUM = ${H1_DUT_MEDIUM} + target bin dir H1_BIN_DIR = ${H1_BIN_DIR} + local bin dir H1_LOCAL_BIN_DIR = ${H1_LOCAL_BIN_DIR} + controller IP H1_CONTROLLER_IP = ${H1_CONTROLLER_IP:-} + side-channel port H1_PORT = ${H1_PORT} + plug type H1_PLUG_TYPE = ${H1_PLUG_TYPE} + plug ip H1_PLUG_IP = ${H1_PLUG_IP:-} + plug switch id H1_PLUG_ID = ${H1_PLUG_ID} + plug dry-run H1_PLUG_DRY_RUN = ${H1_PLUG_DRY_RUN} + required PASS H1_CYCLES = ${H1_CYCLES} + workload window H1_WORKLOAD_SECS = ${H1_WORKLOAD_SECS}s + off duration H1_OFF_SECS = ${H1_OFF_SECS}s + boot timeout H1_BOOT_TIMEOUT = ${H1_BOOT_TIMEOUT}s + infra-fail max H1_INFRA_FAIL_MAX= ${H1_INFRA_FAIL_MAX} + evidence out WAL_M8_EVIDENCE = ${WAL_M8_EVIDENCE} + cut URLs: off=$(plug_url off) on=$(plug_url on) +EOF +} + +# --- run (default): calibrate -> cycle -> evidence -------------------------- +# Emit an ABORTED/OPEN ledger on ANY early exit (calibration vacuous, FAIL stop, +# infra abort) — die() uses exit, so this is an EXIT trap, not ERR. A clean PASS +# clears the guard so the trap is a no-op. +H1_DONE=0 +on_exit() { [ "$H1_DONE" = 1 ] && return 0; emit_evidence "ABORTED" || true; } +cmd_run() { + require H1_TARGET_SSH; require H1_WAL_DIR; require H1_CONTROLLER_IP + banner_open "Starting an OWNER-RUN H1 campaign now." + trap on_exit EXIT + # The §3.4 calibration GATE is the FIRST step of every run; a vacuous DUT aborts + # before any cycle counts. + cmd_calibrate + cmd_cycle + emit_evidence "PASS" + H1_DONE=1 + pass "H1 PASSED locally for medium '${H1_DUT_MEDIUM}': ${CYCLES_PASS} consecutive cycles, H2 probe ${H2_PROBE}. The OWNER signs off on #18 — the agent never self-certifies H1." 
+} + +case "${1:-run}" in + deploy) shift; cmd_deploy ;; + calibrate) shift; cmd_calibrate ;; + cycle) shift; cmd_cycle ;; + config) shift; cmd_config ;; + run) shift; cmd_run ;; + *) die "usage: $0 {deploy|calibrate|cycle|run|config}" ;; +esac From 6204a755664d08a8f4f803bcfe9e5026f2949f68 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 26 Jun 2026 08:10:47 +0000 Subject: [PATCH 2/2] =?UTF-8?q?M8=20H1:=20storage=5Fprobe=20calibration=20?= =?UTF-8?q?bin=20+=20per-path=20verdicts=20(=C2=A714.8=20H1=20/=20D1=20/?= =?UTF-8?q?=20#18)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Apply the designer's approve-with-conditions feedback on the H1 rig. Delta A — build src/bin/storage_probe.rs as the §3.4 calibration instrument (reverse the shell-probe choice). The "no src/ change" rule guards against durability-logic drift, not against the measurement instrument being real code. The calibration must prove the DUT loses un-synced data on the SAME kernel write path the WAL uses, so the marker is written via a plain write(2) with NO fdatasync (the WAL's data path minus the durability step); a shell echo could differ subtly and mis-measure the very thing this gate exists to catch. It does NOT use the WAL append (pure in-memory until commit — would never reach the cache). Test-only, auto publish-excluded by the existing exclude=["src/bin"], like crash_child. Subcommands write-unsynced-marker (exit 0) and verify-marker-gone (0 = gone/honest, 1 = survived/vacuous). h1-cycle.sh calibrate now uses it for the loss-probe; storage-check.sh classify stays as the deny-by-default FS/cache check. deploy + the workflow cross-compile + scp it alongside power_pull_{workload,verify}. infra-plan §3.2/§3.4 keep the binary as originally written. Delta B — per-path §5 verdict + hard, distinct-code calibration abort. The prior EXIT trap emitted ABORTED for every non-clean exit, mislabeling a D1 FAIL. 
Replaced with finish()/VERDICT emitting the ledger exactly once on every terminal path, with distinct exit codes: 0=PASS, 1=FAIL (verdict=FAIL), 2=INCONCLUSIVE/ infra (verdict=INCONCLUSIVE), 3=VACUOUS calibration (verdict=OPEN, marker survived — a HARD abort before any cycle counts). The workflow maps 1/2/3 to distinct error annotations. Delta C — runbook makes the calibration-instrument choice and pin-downs 1-4 explicit (verify-on-target-over-ssh + scp transport; per-cycle fresh capture; three-way outcome; evidence-on-every-path), plus an H2-section pointer so the manual shell probe and the storage_probe binary don't read as contradictory. Verified: storage_probe build + cargo +1.85.0 MSRV + fmt/clippy clean and its exit semantics (0/1/2); all four h1-cycle.sh verdict paths driven end-to-end with stubbed ssh/scp/socat (PASS=0, FAIL=1, vacuous=3/verdict=OPEN, unreachable loud-skip), each emitting a valid §5 ledger; shellcheck + actionlint clean; cargo test green (LazyFS/hardware suites remain #[ignore]). H1 stays OPEN-pending-owner-run; the owner triggers m8-h1.yml on the wired rig, observes >=50 PASS with the H2 probe proven + evidence on #18, and closes #18. The agent never self-certifies H1. 
Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01VW9DW3Lu7dVmargSY1gbZk --- .github/workflows/m8-h1.yml | 19 +++-- CLAUDE.md | 2 +- docs/m8-infra-plan.md | 6 +- docs/m8-runbook.md | 37 ++++++++-- scripts/m8/h1-cycle.sh | 84 ++++++++++++++------- src/bin/storage_probe.rs | 143 ++++++++++++++++++++++++++++++++++++ 6 files changed, 250 insertions(+), 41 deletions(-) create mode 100644 src/bin/storage_probe.rs diff --git a/.github/workflows/m8-h1.yml b/.github/workflows/m8-h1.yml index 720dfa8..429f9e3 100644 --- a/.github/workflows/m8-h1.yml +++ b/.github/workflows/m8-h1.yml @@ -97,7 +97,7 @@ jobs: fi echo "available=true" >> "$GITHUB_OUTPUT" - - name: Cross-compile ARM workload/verify bins + - name: Cross-compile ARM bins (workload / verify / storage_probe) if: steps.rig.outputs.available == 'true' env: CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_LINKER: aarch64-linux-gnu-gcc @@ -107,23 +107,30 @@ jobs: exit 1 fi cargo build --release --target aarch64-unknown-linux-gnu \ - --bin power_pull_workload --bin power_pull_verify + --bin power_pull_workload --bin power_pull_verify --bin storage_probe - name: Deploy bins + storage-check.sh to the target if: steps.rig.outputs.available == 'true' run: scripts/m8/h1-cycle.sh deploy # The FULL campaign: §3.4 calibration GATE (abort if vacuous) → the ≥N-consecutive - # PASS cycle loop → emit the §5 ledger. h1-cycle.sh enforces every honesty rail. - # rc 0 = N consecutive PASS with H2 proven; anything else = vacuous calibration, - # a D1 FAIL, or an infra abort → red build (NOT a pass). + # PASS cycle loop → emit the §5 ledger. 
h1-cycle.sh enforces every honesty rail and + # uses DISTINCT exit codes so the cause is unmistakable in CI: + # 0 = N consecutive PASS with H2 proven (verdict=PASS, green) + # 1 = D1 FAIL — an acked LSN was lost (verdict=FAIL; most likely a lying device, §3.6) + # 2 = INCONCLUSIVE / infra abort (verdict=INCONCLUSIVE) + # 3 = VACUOUS calibration — storage didn't lose un-synced data (verdict=OPEN; HARD abort) + # Anything non-zero reds the build (NOT a pass). The verdict is in the ledger. - name: H1 calibration + cycle loop if: steps.rig.outputs.available == 'true' run: | set +e; scripts/m8/h1-cycle.sh run; rc=$?; set -e case "$rc" in 0) echo "H1: ${H1_CYCLES} consecutive PASS with the §3.4 H2 probe proven (medium: ${H1_DUT_MEDIUM}).";; - *) echo "::error title=H1 FAIL/ABORT::h1-cycle.sh run exited ${rc} — read the evidence ledger: vacuous calibration (marker survived), a D1 FAIL (acked LSN lost — most likely a lying device, §3.6), or an infra abort. NOT a pass."; exit 1;; + 1) echo "::error title=H1 D1 FAIL::an ACKED LSN was absent after the cut (verdict=FAIL). Per §3.6 the most likely cause is a lying device on medium '${H1_DUT_MEDIUM}' — the ledger records which LSN. NOT a pass."; exit 1;; + 2) echo "::error title=H1 INCONCLUSIVE::infra abort (target didn't return / side-channel gap, verdict=INCONCLUSIVE). NOT a pass — re-run after fixing the rig."; exit 1;; + 3) echo "::error title=H1 VACUOUS::§3.4 calibration FAILED — storage did NOT lose un-synced data (verdict=OPEN). Any H1 here tests nothing; no cycles ran. Fix the DUT/mount/cut before retrying. NOT a pass."; exit 1;; + *) echo "::error title=H1 unexpected::h1-cycle.sh run exited ${rc} — read the evidence ledger. 
NOT a pass."; exit 1;; esac - name: Upload evidence ledger diff --git a/CLAUDE.md b/CLAUDE.md index d928113..0221d63 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -109,7 +109,7 @@ The entire value of this component is **correct behavior under crashes and fault ## Project status (keep this updated) -- **LATEST (2026-06-26, branch `claude/gifted-pasteur-gfr9th`): the M8 H1 power-pull RIG is BUILT (owner-runnable; the last open M8 gate).** H1 is the only M8 gate that fundamentally needs physical hardware (a real mains cut on storage that genuinely loses un-synced data, ≥50 consecutive PASS, zero acked-LSN loss = D1). This adds **no `src/` code** — only orchestration + CI + docs around the proven `power_pull_workload`/`power_pull_verify` bins, `storage-check.sh`, and `evidence.sh`. New **`scripts/m8/h1-cycle.sh`** (subcommands `deploy`/`calibrate`/`cycle`/`run`/`config`): cross-built ARM bins + `storage-check.sh` are `scp`'d to the target; the **§3.4 vacuous-pass calibration GATE runs FIRST** (writes an un-synced marker, cuts via the smart plug, asserts the marker is GONE — a *survived* marker ABORTS loudly, no cycle counts); then the cycle loop (fresh WAL → off-box collector → workload over ssh → **mains CUT** → restore → boot-wait → `power_pull_verify` over ssh) repeats until **N consecutive PASS**, with **INCONCLUSIVE never counted** (resets the streak), a **FAIL stopping the run**, and `verdict=PASS` emitted only when `h2_probe` proved loss AND `fail==0`. **Pluggable smart-plug driver** — `H1_PLUG_TYPE`: `shelly` (Gen2/Gen3/Plus RPC `/rpc/Switch.Set?id=&on=`, the **default; the owner's Shelly Plug S Gen3**, aliases `shelly-gen2`/`shelly-gen3`), `shelly-gen1` (`/relay/0?turn=`), `tasmota` (`/cm?cmnd=Power%20`); `H1_PLUG_DRY_RUN=1` for no-hardware dry runs. 
New **`.github/workflows/m8-h1.yml`** — `workflow_dispatch`-ONLY, `runs-on: [self-hosted, h1-rig]` (the owner's never-cut controller laptop), cross-compiles `aarch64-unknown-linux-gnu`, `deploy`→`run`, uploads the §5 evidence artifact, and posts the ledger to **#18** (dispatch-gated sign-off, matching the dm-flakey/macOS pattern); **loud-skips** (OPEN, not green) if the rig Variables are unset or the target is unreachable. Added **`.github/actionlint.yaml`** declaring the `h1-rig` custom runner label. Docs: runbook H1 "Automated rig" + "Rig setup (target/controller)" subsections (Pi 3 read-only overlay rootfs, dedicated DUT partition, BeagleBone-eMMC + USB-SSD media, controller wiring, smart-plug table); infra-plan §3.2/§3.4 corrected (the marker probe is `storage-check.sh probe-write/probe-verify`, **not** a `storage_probe` binary — honors the no-`src/`-change rule); `scripts/README.md` `h1-cycle.sh` row + owner-run examples. **Verified here:** `shellcheck scripts/m8/h1-cycle.sh` clean, `actionlint` clean on all workflows, `bash -n`, plug-URL + config + §5-evidence dry-runs, `cargo build` of the two bins. **CANNOT run here** (no cuttable target / smart plug): the actual cycles — the script + workflow print loud OPEN banners and never fake green. **H1 stays OPEN-pending-owner-run: the OWNER triggers `m8-h1.yml` on the wired rig, observes ≥50 PASS with the H2 probe proven + evidence on #18, and closes #18. The agent never self-certifies H1.** +- **LATEST (2026-06-26, branch `claude/gifted-pasteur-gfr9th`): the M8 H1 power-pull RIG is BUILT (owner-runnable; the last open M8 gate).** H1 is the only M8 gate that fundamentally needs physical hardware (a real mains cut on storage that genuinely loses un-synced data, ≥50 consecutive PASS, zero acked-LSN loss = D1). 
It adds **no durability `src/` code** — one **test-only bin** (`src/bin/storage_probe.rs`, the §3.4 calibration instrument; publish-excluded by the existing `exclude=["src/bin"]`, like `crash_child`) + orchestration + CI + docs around the proven `power_pull_workload`/`power_pull_verify` bins, `storage-check.sh`, and `evidence.sh`. **`storage_probe`** (`write-unsynced-marker`/`verify-marker-gone`) is a deliberate `write(2)`-**without-`fdatasync`** — the WAL's data path minus the sync — so the calibration measures loss on the **same kernel write path the WAL uses** (a shell `echo` could mis-measure; this was the designer's key correction, reversing the first cut's shell-probe choice). New **`scripts/m8/h1-cycle.sh`** (subcommands `deploy`/`calibrate`/`cycle`/`run`/`config`): cross-built ARM bins (incl. `storage_probe`) + `storage-check.sh` are `scp`'d to the target; the **§3.4 vacuous-pass calibration GATE runs FIRST** (`storage-check.sh classify` FS/cache deny-check, then `storage_probe` writes an un-synced marker, smart-plug CUT, asserts the marker is GONE — a *survived* marker is a **HARD abort**, exit 3, no cycle counts); then the cycle loop (fresh WAL → fresh per-cycle off-box collector capture → workload over ssh → **mains CUT** → restore → boot-wait → `scp` that cycle's capture to the target → `power_pull_verify` over ssh) repeats until **N consecutive PASS**, with **INCONCLUSIVE never counted** (resets the streak), a **FAIL stopping the run**. **Per-path §5 verdict on EVERY exit** (fixes a first-cut bug where the EXIT trap mislabeled a D1 FAIL as ABORTED): distinct exit codes **0=PASS / 1=FAIL / 2=INCONCLUSIVE / 3=VACUOUS(verdict=OPEN)**; `verdict=PASS` only when `h2_probe` proved loss AND `fail==0`. 
**Pluggable smart-plug driver** — `H1_PLUG_TYPE`: `shelly` (Gen2/Gen3/Plus RPC `/rpc/Switch.Set?id=&on=`, the **default; the owner's Shelly Plug S Gen3**, aliases `shelly-gen2`/`shelly-gen3`), `shelly-gen1` (`/relay/0?turn=`), `tasmota` (`/cm?cmnd=Power%20`); `H1_PLUG_DRY_RUN=1` for no-hardware dry runs. New **`.github/workflows/m8-h1.yml`** — `workflow_dispatch`-ONLY, `runs-on: [self-hosted, h1-rig]` (the owner's never-cut controller laptop), cross-compiles `aarch64-unknown-linux-gnu` (all three bins), `deploy`→`run` (maps exit 1/2/3 to distinct errors), uploads the §5 evidence artifact, and posts the ledger to **#18** (dispatch-gated sign-off, matching the dm-flakey/macOS pattern); **loud-skips** (OPEN, not green) if the rig Variables are unset or the target is unreachable. Added **`.github/actionlint.yaml`** declaring the `h1-rig` custom runner label. Docs: runbook H1 "Automated rig" + "Rig setup (target/controller)" subsections (Pi 3 read-only overlay rootfs, dedicated DUT partition, BeagleBone-eMMC + USB-SSD media, controller wiring, smart-plug table, calibration-instrument note, pin-downs); infra-plan §3.2/§3.4 keep the `storage_probe` **binary** (the calibration instrument, *because* it shares the WAL write path); `scripts/README.md` `h1-cycle.sh` row + owner-run examples. **Verified here:** `storage_probe` build + `cargo +1.85.0` MSRV + `fmt`/`clippy` clean and its write/verify exit semantics (0 gone / 1 survived / 2 usage); `shellcheck`/`bash -n` clean; `actionlint` clean on all workflows; **all four verdict paths driven end-to-end with stubbed ssh/scp/socat — PASS(0)/FAIL(1)/vacuous(3,verdict=OPEN) + the unreachable-rig loud-skip — each emitting a valid §5 ledger**; plug-URL + config dry-runs. **CANNOT run here** (no cuttable target / smart plug): the actual cycles — the script + workflow print loud OPEN banners and never fake green. 
**H1 stays OPEN-pending-owner-run: the OWNER triggers `m8-h1.yml` on the wired rig, observes ≥50 PASS with the H2 probe proven + evidence on #18, and closes #18. The agent never self-certifies H1.** - **(2026-06-25, PRs #20 + #21 off `main`): dm-flakey CI now RUNS, H3-physical PASSES, §14.4d is three-tier.** PR #20 (`claude/m8-dmflakey-ci-fixes`) fixes the hosted dm-flakey gate: provision `linux-modules-extra-$(uname -r)` + `modprobe dm_flakey` (dm-flakey **is** reachable on hosted Azure runners — no self-hosted runner needed), `cmd_check` queries `dmsetup targets` **as root**, and dm table reloads use `dmsetup suspend --noflush --nolockfs` in **both** `flakey_fault` and `flakey_up` (a default suspend's lockfs **freeze** is a full fs-sync that either EIO'd through the erroring target — misread as a §12 violation — or persisted the un-synced data before the drop, defeating the §14.4d controls). **Result: H3-physical ext4 PASSES** (source-confirmed block-layer EIO → §12 poison; evidence on issue #16). PR #21 (`claude/m8-dirfsync-tiers`, stacked on #20) resolves §14.4d per the designer: **the dir-fsync omission is NOT reproducible on ext4/xfs/btrfs** — those journaling FSes transitively persist a new file's dir entry on the segment's own `fsync` (AFSNCE OSDI '14, §18), masking it; `fsync_dir` is kept as a portable-durability safeguard. 
Three tiers: **Tier-1 (PRIMARY, per-PR, deterministic) = `scripts/m8/dirfsync-presence.sh`** straces the roll path, asserts correct issues the roll-time dir-`fsync` while `inject_no_dir_fsync` does not — **RUN+green here** (`correct=5` vs `inject=1`), wired into `ci.yml`; **Tier-2 = behavioral power-loss via a synchronized mid-run cut** (`src/bin/dirfsync_cut_workload.rs` rolls once, acks a record into the new segment, blocks with the dirent dirty; harness activates `drop_writes` *before* kill/umount, fsck, remount, verify) — **CLOSED as a DOCUMENTED NEGATIVE RESULT (PR #21, owner Fedora 43):** the inject build recovers fully on EVERY config tested — ext4/xfs/btrfs, journal-less ext4 (incl. `ext2`-format), and the last attempt, journaled ext4 `data=writeback` (the ext4 driver's weakest ordering; `data=writeback` weakens data ordering, not the metadata/dirent). The dirent reaches disk transitively via the file's own `fdatasync` everywhere. **Mechanism correction:** the earlier "ext2 block-adjacency" claim is RETRACTED — dmesg shows `ext2`-format is serviced by the **ext4 driver journal-less** on modern kernels (standalone ext2 driver removed in Linux 6.9); mechanism not isolated. No readily-available Linux FS exposes it behaviorally ⇒ honest negative result, not a gap. Tier-1 strace carries the DoD; `fsync_dir` retained as a POSIX-portability safeguard. (Note: `data=writeback` requires a journal — NOT combinable with `-O ^has_journal`.); **Tier-3 = ext4/xfs/btrfs INCONCLUSIVE-by-design** (informational, never red on a masked miss, still red on a correct-build data loss). dm-flakey harness also got `wipefs`/zero-before-mkfs + `udevadm settle` + `dmsetup remove --retry/-f --deferred` (fixes the back-to-back "device busy"). Docs corrected (design §14.4d note + §14.13 row, runbook three-tier, this block). `shellcheck`+`cargo fmt --check` clean; the strace gate is self-verified green. 
**§14.4d behavioral (Tier-2) is now CLOSED as a documented negative result** (Tier-1 satisfies the DoD). **Still owner/CI to observe:** H1 power-pull. - **Current milestone:** M8 (hardware/platform durability, §14.8 + the deferred §14.4d) — **harnesses + runbook BUILT; the runnable-here pieces are RUN+green; the physical gates are honestly OPEN-pending-owner-run, never self-certified from this sandbox.** **What RUNS green here:** **H2** the deny-by-default storage durability guard (`scripts/m8/storage-check.sh` — passes on the repo's ext4, FAILs on tmpfs; the vacuous-pass guard H1 depends on, rejecting tmpfs/overlay/unrecognized FS); and the **H3 §12 poison *state machine*** (`scripts/m8/fsync-fault.sh` + `tests/fsync_fault_gate.rs`, 3 tests) — an `LD_PRELOAD` shim (`tests/fault/eio_preload.c`) returns EIO from the commit's libc `fdatasync` and the gate asserts `FsyncFailed`, **no `durable_lsn` advance past the synced segment** (incl. the split-batch **rest-at-seg1-max** partial advance), and handle **poison** (subsequent ops `Poisoned`), with an **anti-vacuous guard** that the injection actually fired (the gate **fails loudly** if run without the shim — demonstrated). Shim interception was **empirically proven the ship/drop gate** (`strace`: 6 `fdatasync` all intercepted, 3 `fsync` — the rustix raw-syscall dir-fsync — none ⇒ the shim bounds to the data-sync poison path; the dir-fsync poison + §14.4d stay dm-flakey-only). 
**What is OPEN-pending-owner-run** (this sandbox's kernel has **no `CONFIG_BLK_DEV_DM`/`/lib/modules`/`/dev/mapper/control`**, no cuttable target, and is Linux not macOS): **H3 physical** (`scripts/m8/dm-flakey.sh h3` — `error_writes` → block-layer EIO → poison, workload exit 7); **§14.4d** (`dm-flakey.sh dirfsync-negative` — correct vs `--features inject_no_dir_fsync` across a `drop_writes` power loss; **now three-tier — see the LATEST bullet at the top: Tier-1 strace presence PASSES per-PR, Tier-2 certifies on ext2 not ext4, ext4/xfs/btrfs are INCONCLUSIVE-by-design**); **H1** power-pull (`src/bin/power_pull_{workload,verify}.rs` + `scripts/m8/power-pull.sh`, ≥50 cycles zero acked loss — off-box **network** side channel, **send-strictly-after-`commit() Ok`** ack-ordering, **contiguous-watermark** conservative verify, **H2-gated**; the full chain was **dry-run green on loopback** and the verifier's falsifiability shown: simulated acked-loss → FAIL/D1, side-channel gap → INCONCLUSIVE); (H4 was the macOS-tier item — **now CLOSED, see next sentence**). Every OPEN gate prints a loud "NOT EXERCISED"/OPEN banner (mirrors the LazyFS stopgap) — **no fake green.** **H4 macOS `F_FULLFSYNC` — VERIFIED & CLOSED (owner-run):** ran on owner macOS hardware (Mac mini, SIP-enabled, 2026-06-25) — `tests/macos_fullfsync.rs` smoke green **and** the `#[ignore]`d `dtruss -t fcntl` proof shows `F_FULLFSYNC` (cmd `0x33`) issued twice on the durable path, both succeeding. A matcher fix (commit `ba2b84d`) reads dtruss's **numeric** fcntl command, since SIP-enabled dtruss does not symbolize `F_FULLFSYNC` by name. Owner procedure in `docs/m8-runbook.md`. No `src/` contract change (harnesses + tests + bins + docs only; two `power_pull` bins added `WAL_SEGMENT_SIZE`/`WAL_MAX_RECORD_SIZE` env overrides for the §14.4d roll forcing). 
**M8 test-automation (Tier 1 + Tier 3):** added `scripts/m8/evidence.sh` (shared §5 evidence-ledger emitter), enhanced `dm-flakey.sh` with the amended anti-vacuous criteria (#16 PASS now ANDs WAL poison with a **source-confirmed block-layer EIO** scraped from `dmesg` in the injection window, + bounded retry; #17 bounded retry budget **plus a `drop_writes` positive control** — if drop_writes is inert the negative control is non-functional ⇒ exit 4 HARNESS, louder than a timing INCONCLUSIVE; INCONCLUSIVE≠PASS; verdict exit codes 0/1/2/3/4) + evidence emission (incl. `block_layer_eio_observed`, `dmesg_readable`, `drop_positive_control`), and added `.github/workflows/m8-dmflakey.yml` (**push-to-main** paths-filtered + nightly + dispatch; **H3-physical #16** + **§14.4d #17** on hosted ubuntu VMs that reach `dm-flakey` — ext4 hard, xfs/btrfs informational, best-effort + loud skip — a green loud-skip is **not** a passed gate) and `m8-macos.yml` (`macos-latest`; **H4 Half A #19** routing/smoke; **per-PR paths-filtered + push-to-main + dispatch**, since a macOS-only `F_FULLFSYNC`-routing regression is invisible to Linux PR CI). Both upload the §5 artifact every run and post to the tracking issue **only on `workflow_dispatch`** (human sign-off; nightly stays artifact-only + red-build). `m8.yml`'s contradictory "physical-availability" job was trimmed to a pointer. The genuine proof is each workflow's first green run; until observed the dm-flakey gates are **contingent** (and loud-skip if a runner lacks the target). `cargo test`, `cargo clippy --all-targets -D warnings`, `cargo fmt --check` green; `shellcheck` + `actionlint` clean on the new scripts/workflows. **Still NOT done:** **H1** power-pull (owner-run, the only gate still needing a cuttable target) + the §14.4d/H3-physical first-green CI observation; **H4 done** (verified on owner macOS; Half A now also in CI, Half B owner-run); fuzzing (F1–F4)/Miri/soak (M9). 
- **M7** (performance: criterion benches + regression gates + zero-alloc, §14.7) — **COMPLETE, with the regression-gate CI *enforcement* tracked OPEN-pending-controlled-runner (honest, not "done")**. New `benches/wal.rs`: four criterion groups over the public API against a **real `fdatasync`** (never mocked) — `throughput` (64 B/256 B/4 KiB/64 KiB, `Throughput::Bytes`), `commit_latency` (batch 1/8/64/512/4096; 8 MiB segment so even batch=4096 never rolls — pure group-commit amortization), `recovery` (`Wal::open` vs log size + segment count), `split_batch` (spanning vs not — quantifies the extra fsync). Fixtures built **outside** the measured closure (`iter_batched*`/`iter_custom` setup; fresh WAL per iter so the log can't grow into a roll). **Tail percentiles:** criterion reports only mean/median (no arbitrary percentiles), so `commit_latency` records per-iter timings into an `hdrhistogram` and emits **p50/p99/p999** itself, persisted to `target/perf/commit_latency_.json`. **Zero-alloc gate hardened** (`tests/zero_alloc.rs`): now *proves* the measured window did not roll (segment-file count + `durable_lsn` advance, both read outside the counted region — no segment accessor needed) and adds a `max_record_size` (256 B) payload variant; kept `SERIAL` + warm-up; 3 tests green. **Regression gate** `scripts/perf-gate.sh` (`baseline`/`compare`/`check`/`inspect`): throughput/**median-time** delta from criterion `estimates.json` (the **median** point estimate — not the outlier-sensitive mean), **p999** delta from the histogram JSON; thresholds **>10%** time / **>20%** p999 (tunable); needs `python3`. **Falsifiability (§14.0.3) demonstrated:** an 800 µs sleep injected into the timed `commit()` window tripped the gate (median-time +160%+ on small batches *and* a p999 breach, `check` exit 1), then reverted. 
**CI tiering (§14.11):** per-PR `ci.yml` gains `cargo bench --no-run` (benches can't bitrot) and already runs the zero-alloc gate via `cargo test` (both enforced); new `.github/workflows/bench.yml` runs the benches + gate **nightly/manual, informational** (`continue-on-error`, uploads `target/criterion`+`target/perf` artifacts) — same stopgap as the LazyFS gate; the >10%/>20% thresholds stay a real gate on a controlled/pinned-governor runner. **No `src/` change** (benches drive the public API; no steady-state alloc or perf bug found to flag). MSRV (1.85) re-verified with the new dev-deps (`criterion` default-features-off, `hdrhistogram`) via `cargo +1.85.0 check --all-targets --locked`. Docs: §14.7 M7 block, §14.11 split, §14.13 zero-alloc row, v6.1 changelog, `scripts/README.md` perf-gate section. `cargo test` / `cargo clippy --all-targets -D warnings` / `cargo fmt --check` all green. **Still NOT done (M8+):** §14.8 hardware durability incl. the §14.4d metadata-fault negative control (M8); fuzzing (F1–F4)/Miri/soak (M9); the perf-gate's enforced CI run on a controlled runner. diff --git a/docs/m8-infra-plan.md b/docs/m8-infra-plan.md index ce6204f..6db8239 100644 --- a/docs/m8-infra-plan.md +++ b/docs/m8-infra-plan.md @@ -78,7 +78,7 @@ Why this split: the controller hosts the GitHub runner and must never be cut (a - **Dedicated writable WAL partition = the DUT.** The only thing exposed to power-cut writes is the WAL under test, on its own ext4 partition. Provide for **three DUT media**, each a different device class: (a) a partition on the **microSD** (Pi), (b) the **USB-SSD** (Pi), (c) the **onboard eMMC** of the **BeagleBone Black**. Run the gate against each; record which. (eMMC is soldered, managed NAND with its own controller/cache — the most production-realistic embedded medium and the one whose flush honesty is most likely to surprise you, so it broadens the device-honesty coverage the rig produces.) 
- For the BeagleBone target, boot its **rootfs from microSD with the read-only overlay** (same cut-corruption protection as the Pi) and put the **WAL on a dedicated eMMC partition** as the DUT — so the eMMC is the thing under test and the OS isn't the thing 50 cuts corrupt. The BBB is ARMv7/battery-less with a single 5 V input, so it's a valid smart-plug cut target; it is **not** a controller candidate (32-bit → no GitHub runner), only a DUT. - **DUT media are consumables.** The boards themselves shrug off power cycling (effectively unlimited at these counts). The flash media is the wear surface, and the binding risk is **not** write-endurance — 50 cycles write only single-digit GB, orders of magnitude under any card's lifetime — but **sudden FTL (flash-translation-layer) corruption on a mid-write cut**, which can brick a whole card. The per-cut probability is low and *device-quality-dependent* (a cheap no-name microSD is the most exposed; the read-only overlay already protects the boot/OS card regardless). So: keep **one or two spares of each DUT medium**, keep a **pre-imaged OS/boot card** so a brick is a 5-minute re-flash rather than a re-setup, and treat a **mid-campaign card death as a recordable device-honesty finding** (that card is empirically not honest hardware — exactly the verdict the gate exists to render), not a rig failure. Record it in the evidence artifact and continue on a spare. -- **Cross-compiled binaries**, built on the controller (`aarch64-unknown-linux-gnu`, MSRV 1.85, via `cross` or rustup target + linker) and `scp`'d to the Pi: `power_pull_workload` and `power_pull_verify`. The marker probe is the existing **`scripts/m8/storage-check.sh`** (`probe-write`/`probe-verify`, shell — no separate `storage_probe` binary needed), deployed alongside them. Do not build on the Pi (slow, and we want the runner to control versions). `scripts/m8/h1-cycle.sh deploy` performs this `scp`. 
+- **Cross-compiled binaries**, built on the controller (`aarch64-unknown-linux-gnu`, MSRV 1.85, via `cross` or rustup target + linker) and `scp`'d to the Pi: `power_pull_workload`, `power_pull_verify`, and `storage_probe`. Do not build on the Pi (slow, and we want the runner to control versions). `scripts/m8/h1-cycle.sh deploy` performs this `scp` (plus `storage-check.sh` for the static FS/cache classification). - Passwordless ssh from controller → Pi (key auth) for unattended cycling. ### 3.3 Controller setup @@ -92,9 +92,9 @@ Why this split: the controller hosts the GitHub runner and must never be cut (a Do **not** run the 50-cycle loop until this passes. On the exact DUT medium (this is the `h1-cycle.sh calibrate` step, run automatically as the first step of every `run`): -1. `storage-check.sh probe-write ` — write a marker **without** `fdatasync` (it must sit in the page cache / device cache, not on stable flash). +1. `storage_probe write-unsynced-marker <wal-dir>` — write a marker **without** `fdatasync` (it must sit in the page cache / device cache, not on stable flash). `storage_probe` writes via the **same `write(2)` path the WAL uses** (minus the sync), so "un-synced data lost here" predicts "un-acked WAL record lost here" — a shell `echo` could differ subtly and mis-measure. 2. Cut power via the smart plug; restore; wait for boot. -3. `storage-check.sh probe-verify ` — the marker **MUST be absent** (exit 0 = gone; exit 1 = survived). +3. `storage_probe verify-marker-gone <wal-dir>` — the marker **MUST be absent** (exit 0 = gone; exit 1 = survived ⇒ vacuous, hard abort). - **Gone ⇒ the cut is real** (un-synced data is genuinely lost) ⇒ proceed. - **Survives ⇒ vacuous** (storage didn't lose un-synced data — e.g. mounted `sync`, or the probe accidentally flushed) ⇒ **abort, fail loudly**, do not run cycles. Investigate the mount / probe before continuing.
diff --git a/docs/m8-runbook.md b/docs/m8-runbook.md index 8c46c2e..9537a74 100644 --- a/docs/m8-runbook.md +++ b/docs/m8-runbook.md @@ -101,6 +101,12 @@ If the marker **survives**, the storage does not lose un-synced data — **STOP* H1 result on it would be vacuous. Label such targets "PLP-cache only" and do not make a durability claim for honest power loss on them. +> The **automated** H1 rig runs this same probe as `storage_probe write-unsynced-marker` +> / `verify-marker-gone` (a test-only binary that writes via the **WAL's own `write(2)` +> path**, minus the sync — see "Automated rig" under H1). That is the authoritative +> calibration instrument; the shell `storage-check.sh probe-*` above is the quick manual +> equivalent. `storage-check.sh classify` remains the static FS/cache deny-by-default check. + --- ## H3 — fsync-failure → poison (§12) @@ -279,19 +285,40 @@ device, cache mode, and cut mechanism. The manual loop above is fully automated by **`scripts/m8/h1-cycle.sh`** (orchestration on the controller) and **`.github/workflows/m8-h1.yml`** (`workflow_dispatch`-only, on a -`[self-hosted, h1-rig]` runner). The script reuses the same binaries/scripts — it adds no -`src/` code — and bakes in every honesty rail: the **§3.4 calibration gate runs first**, -INCONCLUSIVE never counts, a FAIL stops the run, and `verdict=PASS` is emitted only when the -H2 probe proved loss **and** `fail==0`. +`[self-hosted, h1-rig]` runner). It reuses the proven `power_pull_*` binaries and adds one +test-only bin (`storage_probe`, below); it touches **no durability `src/` code**. Every +honesty rail is baked in: the **§3.4 calibration gate runs first**, INCONCLUSIVE never counts, +a FAIL stops the run, and `verdict=PASS` is emitted only when the H2 probe proved loss **and** +`fail==0`. 
```bash scripts/m8/h1-cycle.sh config # print the resolved config (touches no hardware) -scripts/m8/h1-cycle.sh deploy # scp the aarch64 bins + storage-check.sh to the target +scripts/m8/h1-cycle.sh deploy # scp the aarch64 bins (incl. storage_probe) + storage-check.sh scripts/m8/h1-cycle.sh calibrate # §3.4 vacuous-pass GATE (real cut; marker MUST be gone) scripts/m8/h1-cycle.sh cycle # the ≥N-consecutive-PASS loop scripts/m8/h1-cycle.sh run # (default) calibrate → cycle → emit §5 evidence ``` +**The calibration instrument is the `storage_probe` binary, not a shell `echo`.** §3.4 must +prove the DUT loses un-synced data **on the same kernel write path the WAL uses**, so the +marker is written by `storage_probe write-unsynced-marker` — a plain `write(2)` with **no +`fdatasync`** (the WAL's data path minus the durability step). "Un-synced data lost here" then +predicts "un-acked WAL record lost here." `storage-check.sh classify` still runs first as the +deny-by-default FS/cache check, but the loss-probe is the binary. `verify-marker-gone` exits +0 = gone (honest cut) / 1 = survived (vacuous). A surviving marker is a **HARD abort** (exit 3, +`verdict=OPEN`, `h2_probe=FAIL(survived)`) — no cycle counts. + +**Distinct exit codes / verdicts** (each terminal cause is unmistakable, and the §5 ledger is +written on **every** path): `0`=PASS, `1`=D1 FAIL (`verdict=FAIL`), `2`=INCONCLUSIVE/infra +abort (`verdict=INCONCLUSIVE`), `3`=vacuous calibration (`verdict=OPEN`). + +**Loop guarantees (pin-downs).** `power_pull_verify` runs **on the target over ssh** against the +recovered WAL; the controller `scp`s **that cycle's fresh capture** to the target first, after the +post-reboot ssh-readiness wait. The collector capture is **per-cycle** (a fresh file), so a stale +capture can't bleed into a later cycle. 
Each cycle resolves to exactly one of PASS (counts) / +INCONCLUSIVE-or-infra (re-run, never counted; `H1_INFRA_FAIL_MAX` consecutive infra fails abort +loudly — likely SD/OS corruption) / FAIL (stops the run). + Config is via env (see `config` for the full list): `H1_TARGET_SSH`, `H1_WAL_DIR` (the DUT partition), `H1_DUT_MEDIUM`, `H1_CONTROLLER_IP`, `H1_PORT` (9099), `H1_PLUG_TYPE`/`H1_PLUG_IP`, `H1_CYCLES` (50), `H1_WORKLOAD_SECS`, `H1_OFF_SECS`, `H1_BOOT_TIMEOUT`, `H1_INFRA_FAIL_MAX`. diff --git a/scripts/m8/h1-cycle.sh b/scripts/m8/h1-cycle.sh index 9c8608a..9dfa4d3 100644 --- a/scripts/m8/h1-cycle.sh +++ b/scripts/m8/h1-cycle.sh @@ -62,7 +62,25 @@ WAL_M8_EVIDENCE="${WAL_M8_EVIDENCE:-${REPO_ROOT}/m8-evidence/evidence-h1.json}" log() { printf '\033[1;34m[m8/h1]\033[0m %s\n' "$*" >&2; } warn() { printf '\033[1;33m[m8/h1] WARN:\033[0m %s\n' "$*" >&2; } pass() { printf '\033[1;32m[m8/h1] PASS:\033[0m %s\n' "$*" >&2; } -die() { printf '\033[1;31m[m8/h1] ERROR:\033[0m %s\n' "$*" >&2; exit 1; } +# die_code <code> <message…>: distinct exit codes make each terminal cause +# unmistakable in CI. 1 = D1 FAIL (acked loss), 2 = INCONCLUSIVE/infra, 3 = VACUOUS +# calibration (the loudest — storage did not lose un-synced data). die() = generic 1 — NOTE(review): the same code as D1 FAIL, so exit status alone cannot tell a generic die() from acked loss. +die_code() { local c="$1"; shift; printf '\033[1;31m[m8/h1] ERROR:\033[0m %s\n' "$*" >&2; exit "$c"; } +die() { die_code 1 "$@"; } + +# --- §5 verdict (one ledger per run, on EVERY terminal path) ----------------- +# The §5 schema verdict is PASS | FAIL | INCONCLUSIVE | OPEN. We set VERDICT at each +# terminal path and emit exactly once via finish(); the EXIT trap (cmd_run) is only a +# safety net that emits the current VERDICT (default OPEN ⇒ "gate did not complete").
+VERDICT="OPEN" +EVIDENCE_EMITTED=0 +finish() { # finish : emit the §5 ledger exactly once + VERDICT="$1" + [ "$EVIDENCE_EMITTED" = 1 ] && return 0 + EVIDENCE_EMITTED=1 + emit_evidence "$VERDICT" +} +on_exit() { finish "$VERDICT"; } # safety net for any unhandled exit ⇒ OPEN # Loud OPEN banner — H1 is owner-run; this never fakes green. banner_open() { @@ -168,14 +186,15 @@ cmd_deploy() { require H1_TARGET_SSH local wbin="${H1_LOCAL_BIN_DIR}/power_pull_workload" local vbin="${H1_LOCAL_BIN_DIR}/power_pull_verify" - if [ ! -x "$wbin" ] || [ ! -x "$vbin" ]; then - die "cross-built bins not found in $H1_LOCAL_BIN_DIR (build for aarch64 first; see the runbook)." + local pbin="${H1_LOCAL_BIN_DIR}/storage_probe" + if [ ! -x "$wbin" ] || [ ! -x "$vbin" ] || [ ! -x "$pbin" ]; then + die "cross-built bins not found in $H1_LOCAL_BIN_DIR (build power_pull_workload/power_pull_verify/storage_probe for aarch64 first; see the runbook)." fi log "deploying bins + storage-check.sh to ${H1_TARGET_SSH}:${H1_BIN_DIR}" ssh_target "mkdir -p '$H1_BIN_DIR'" - scp -q "$wbin" "$vbin" "${REPO_ROOT}/scripts/m8/storage-check.sh" \ + scp -q "$wbin" "$vbin" "$pbin" "${REPO_ROOT}/scripts/m8/storage-check.sh" \ "${H1_TARGET_SSH}:${H1_BIN_DIR}/" - ssh_target "chmod +x '$H1_BIN_DIR/power_pull_workload' '$H1_BIN_DIR/power_pull_verify' '$H1_BIN_DIR/storage-check.sh'" + ssh_target "chmod +x '$H1_BIN_DIR/power_pull_workload' '$H1_BIN_DIR/power_pull_verify' '$H1_BIN_DIR/storage_probe' '$H1_BIN_DIR/storage-check.sh'" pass "deployed to ${H1_BIN_DIR}" } @@ -186,27 +205,39 @@ cmd_deploy() { H2_PROBE="not-run" cmd_calibrate() { require H1_TARGET_SSH; require H1_WAL_DIR + # Static deny-by-default FS/cache check (storage-check.sh). A non-durable FS is a + # vacuous-class abort (exit 3) just like a surviving marker. 
log "§3.4 calibration: static H2 classification of the DUT…" - ssh_target "'$H1_BIN_DIR/storage-check.sh' classify '$H1_WAL_DIR'" \ - || die "H2 static guard FAILED — the DUT is not a recognised durable block FS. Refusing a vacuous H1." + if ! ssh_target "'$H1_BIN_DIR/storage-check.sh' classify '$H1_WAL_DIR'"; then + H2_PROBE="FAIL(non-durable FS)" + finish "OPEN" + die_code 3 "H2 static guard FAILED — the DUT is not a recognised durable block FS. Refusing a vacuous H1." + fi - log "§3.4 calibration: writing an UN-SYNCED marker, then a REAL cut…" - ssh_target "mkdir -p '$H1_WAL_DIR' && '$H1_BIN_DIR/storage-check.sh' probe-write '$H1_WAL_DIR'" - # Give the marker no chance to be flushed by an unrelated sync, then cut hard. + # Empirical loss probe via storage_probe — shares the WAL write(2) path (the reason + # it's a binary, not the shell echo): un-synced data that is lost here predicts an + # un-acked WAL record lost here. + log "§3.4 calibration: writing an UN-SYNCED marker (WAL write path), then a REAL cut…" + ssh_target "mkdir -p '$H1_WAL_DIR' && '$H1_BIN_DIR/storage_probe' write-unsynced-marker '$H1_WAL_DIR'" + # Cut immediately (no chance for an unrelated writeback to flush the marker). plug_off sleep "$H1_OFF_SECS" plug_on if ! wait_ssh; then H2_PROBE="FAIL(target did not return after calibration cut)" - die "calibration cut: target did not come back — fix the rig before running cycles." + finish "INCONCLUSIVE" + die_code 2 "calibration cut: target did not come back — fix the rig before running cycles." fi - # Marker MUST be gone. probe-verify exits 0 (gone => real cut) / 1 (survived => vacuous). - if ssh_target "'$H1_BIN_DIR/storage-check.sh' probe-verify '$H1_WAL_DIR'"; then + # Marker MUST be gone. storage_probe exits 0 (gone ⇒ honest cut) / 1 (survived ⇒ vacuous). + if ssh_target "'$H1_BIN_DIR/storage_probe' verify-marker-gone '$H1_WAL_DIR'"; then H2_PROBE="PASS(marker gone)" pass "§3.4 calibration PASSED — the DUT genuinely loses un-synced data. 
Cycles are meaningful." else + # HARD abort — the single most important thing this gate can discover. Distinct + # exit 3, evidence emitted with h2_probe=FAIL(survived) and verdict=OPEN. H2_PROBE="FAIL(survived)" - die "§3.4 calibration FAILED — the un-synced marker SURVIVED the cut. Storage did NOT lose un-synced data ⇒ any H1 here is VACUOUS. Do NOT run cycles. (Check mount opts / cut fidelity.)" + finish "OPEN" + die_code 3 "§3.4 calibration FAILED (VACUOUS) — the un-synced marker SURVIVED the cut. Storage did NOT lose un-synced data ⇒ any H1 here tests nothing. NO cycles run. (Check mount opts / cut fidelity.)" fi } @@ -268,7 +299,8 @@ cmd_cycle() { 1) FAIL_COUNT=$(( FAIL_COUNT + 1 )); infra_fail=0 append_per_cycle 1 - die "cycle #${n}: FAIL — an ACKED LSN was absent after the cut (D1 violation). STOPPING the run. Investigate per §3.6 (most likely a lying device on medium '${H1_DUT_MEDIUM}'; the evidence records which LSN and medium)." ;; + finish "FAIL" + die_code 1 "cycle #${n}: FAIL — an ACKED LSN was absent after the cut (D1 violation). STOPPING the run. Investigate per §3.6 (most likely a lying device on medium '${H1_DUT_MEDIUM}'; the evidence records which LSN and medium)." ;; 2) # INCONCLUSIVE / infra — never counts toward 50; reset the consecutive streak. INCONCLUSIVE_RERUN=$(( INCONCLUSIVE_RERUN + 1 )) @@ -276,10 +308,13 @@ cmd_cycle() { CYCLES_PASS=0 infra_fail=$(( infra_fail + 1 )) warn "cycle #${n}: INCONCLUSIVE/infra (side-channel gap or target didn't return) — not counted; consecutive streak reset." - [ "$infra_fail" -lt "$H1_INFRA_FAIL_MAX" ] \ - || die "${H1_INFRA_FAIL_MAX} consecutive infra failures — aborting (likely SD/OS corruption; re-flash, check the read-only overlay & wiring)." ;; + if [ "$infra_fail" -ge "$H1_INFRA_FAIL_MAX" ]; then + finish "INCONCLUSIVE" + die_code 2 "${H1_INFRA_FAIL_MAX} consecutive infra failures — aborting (likely SD/OS corruption; re-flash, check the read-only overlay & wiring)." 
+ fi ;; *) - die "cycle #${n}: power_pull_verify exited ${rc} (unexpected) — aborting." ;; + finish "INCONCLUSIVE" + die_code 2 "cycle #${n}: power_pull_verify exited ${rc} (unexpected) — aborting." ;; esac done rm -rf "$capdir" @@ -337,21 +372,18 @@ EOF } # --- run (default): calibrate -> cycle -> evidence -------------------------- -# Emit an ABORTED/OPEN ledger on ANY early exit (calibration vacuous, FAIL stop, -# infra abort) — die() uses exit, so this is an EXIT trap, not ERR. A clean PASS -# clears the guard so the trap is a no-op. -H1_DONE=0 -on_exit() { [ "$H1_DONE" = 1 ] && return 0; emit_evidence "ABORTED" || true; } +# finish() emits the §5 ledger exactly once on every terminal path; the EXIT trap is +# only a safety net (emits the current VERDICT, default OPEN, on an unhandled exit). +# The terminal failures inside calibrate/cycle already set the right verdict + exit code. cmd_run() { require H1_TARGET_SSH; require H1_WAL_DIR; require H1_CONTROLLER_IP banner_open "Starting an OWNER-RUN H1 campaign now." trap on_exit EXIT # The §3.4 calibration GATE is the FIRST step of every run; a vacuous DUT aborts - # before any cycle counts. + # (exit 3) before any cycle counts. cmd_calibrate cmd_cycle - emit_evidence "PASS" - H1_DONE=1 + finish "PASS" pass "H1 PASSED locally for medium '${H1_DUT_MEDIUM}': ${CYCLES_PASS} consecutive cycles, H2 probe ${H2_PROBE}. The OWNER signs off on #18 — the agent never self-certifies H1." } diff --git a/src/bin/storage_probe.rs b/src/bin/storage_probe.rs new file mode 100644 index 0000000..4f381a7 --- /dev/null +++ b/src/bin/storage_probe.rs @@ -0,0 +1,143 @@ +//! Storage probe — M8 / §14.8 H1, §3.4 cut-mechanism calibration (owner-run). +//! +//! The §3.4 vacuous-pass GATE: before any H1 cycle is counted, prove the DUT +//! medium **genuinely loses un-synced data** across a real mains cut. If it does +//! not (mounted `sync`, a lying-cache that's actually battery-backed, the WAL dir +//! 
landing on an overlay instead of the DUT partition, …), every H1 result on it +//! would be VACUOUS — the data was never at risk — which is the single worst +//! outcome this milestone can produce. So we measure loss, hard. +//! +//! WHY A BINARY, NOT THE SHELL `storage-check.sh probe-*`. The calibration must +//! exercise the **same kernel write path the WAL uses**, so that "un-synced data +//! is lost here" actually predicts "an un-acked WAL record is lost here." This +//! binary writes the marker with a plain `write(2)` and **deliberately omits the +//! `fdatasync`** — exactly the WAL's data path (`File` positioned write + +//! `segment::sync_data_fully`) minus the durability step. A shell `echo` could +//! differ subtly (an implicit flush, an `O_SYNC`/mount interaction) and mis-measure +//! the very thing the gate exists to catch. It is **test-only** (excluded from the +//! published crate by `exclude = ["src/bin"]`, like `crash_child`/`power_pull_*`). +//! +//! It is NOT modelled with the WAL `append`: `append` is pure in-memory until +//! `commit`, so the bytes would never reach the page/device cache — the wrong model +//! for "data that hit the device but wasn't fsync'd." +//! +//! Usage (run on the DUT, over ssh from the controller's `h1-cycle.sh calibrate`): +//! storage_probe write-unsynced-marker <dir> # write marker, NO fdatasync; exit 0 +//! # --- real mains power cut + reboot --- +//! storage_probe verify-marker-gone <dir> # exit 0 = GONE (honest), 1 = SURVIVED (vacuous) +//! +//! The verify check is one-directional and fail-safe: a *surviving* marker (exit 1) +//! aborts H1, and a stale already-synced marker can only ever cause a (safe) vacuous +//! abort, never a false PASS. + +use std::io::Write; +use std::path::Path; +use std::process::ExitCode; + +/// Marker filename, written into the DUT WAL dir. Distinct from any `*.wal` segment +/// so it cannot collide with a WAL; the calibrate step uses a dir with no live WAL.
const MARKER_NAME: &str = ".storage_probe_marker";

/// Build the help text printed when the command line is incomplete or unknown.
///
/// Both sub-commands take the directory that holds the marker as their second
/// argument, so the synopsis names `<dir>` explicitly.
fn usage_text(arg0: &str) -> String {
    format!(
        "usage: {arg0} <write-unsynced-marker|verify-marker-gone> <dir>\n\
         \n\
         §3.4 calibration loss-probe (owner-run across a REAL power cut):\n  \
         write-unsynced-marker <dir>  write a marker via write(2) with NO fdatasync\n  \
         verify-marker-gone <dir>     exit 0 = marker GONE (storage lost it; honest cut)\n\
         {pad:31}exit 1 = marker SURVIVED (vacuous; abort H1)",
        pad = ""
    )
}

/// Print the usage text to stderr and return exit code 2 (usage/environment error).
fn usage(arg0: &str) -> ExitCode {
    eprintln!("{}", usage_text(arg0));
    ExitCode::from(2)
}

/// Create `<dir>/.storage_probe_marker`, write a short payload, and return WITHOUT
/// calling `sync_data`/`sync_all`.
///
/// Returns `ExitCode::SUCCESS` once the payload has been handed to the OS, or exit
/// code 2 if the file cannot be opened or written (each failure is reported on
/// stderr). `dir` must already exist; it is not created here.
fn write_unsynced_marker(dir: &Path) -> ExitCode {
    let marker = dir.join(MARKER_NAME);
    // create + truncate: a leftover marker from an earlier run is overwritten in
    // place rather than treated as an error.
    let mut f = match std::fs::OpenOptions::new()
        .create(true)
        .write(true)
        .truncate(true)
        .open(&marker)
    {
        Ok(f) => f,
        Err(e) => {
            eprintln!("storage_probe: cannot open {}: {e}", marker.display());
            return ExitCode::from(2);
        }
    };
    // A non-empty payload so this is a real data write, not a metadata-only create.
    // The verify step only checks existence; the pid is there for human debugging.
    let body = format!(
        "m8-storage-probe unsynced marker; pid={}; do NOT fsync; a REAL cut must erase this\n",
        std::process::id()
    );
    if let Err(e) = f.write_all(body.as_bytes()) {
        eprintln!("storage_probe: write failed on {}: {e}", marker.display());
        return ExitCode::from(2);
    }
    // `File` has no user-space buffer, so `write_all` above already issued the
    // write(2). `flush` is kept only so that a std-layer error would be surfaced;
    // it is NOT a durability barrier. Deliberately no `sync_data`/`sync_all`.
    if let Err(e) = f.flush() {
        eprintln!("storage_probe: flush failed on {}: {e}", marker.display());
        return ExitCode::from(2);
    }
    drop(f); // closing a `File` does not fsync it
    eprintln!(
        "storage_probe: wrote UN-SYNCED marker {} — now cut power HARD (mains), then verify-marker-gone.",
        marker.display()
    );
    ExitCode::SUCCESS
}

/// Check, after the cut and reboot, that `<dir>/.storage_probe_marker` no longer exists.
///
/// Exit codes:
/// * `0` — the marker is absent (`NotFound`).
/// * `1` — the marker is still present, i.e. the cut did not lose un-synced data.
/// * `2` — the marker could not be stat'ed for any other reason; never reported as a pass.
fn verify_marker_gone(dir: &Path) -> ExitCode {
    let marker = dir.join(MARKER_NAME);
    // lstat: a pure existence check that neither follows symlinks nor reads content,
    // so an empty-but-present marker still counts as "survived".
    // NOTE(review): `NotFound` is also what the OS reports when `dir` itself is
    // missing (e.g. the DUT partition did not mount after the reboot). That case is
    // currently reported as "marker gone" — confirm the caller guarantees `dir`
    // exists before trusting exit 0.
    match std::fs::symlink_metadata(&marker) {
        Ok(_) => {
            eprintln!(
                "storage_probe: the un-synced marker SURVIVED the cut ({}). Storage did NOT lose \
                 un-synced data ⇒ a power-pull/H1 result here would be VACUOUS. Do NOT certify H1.",
                marker.display()
            );
            ExitCode::from(1)
        }
        Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
            eprintln!(
                "storage_probe: marker is GONE after the cut — storage genuinely loses un-synced \
                 data. H1 on this DUT is meaningful (proceed to the acked-LSN cycle loop)."
            );
            ExitCode::SUCCESS
        }
        Err(e) => {
            // Undetermined ⇒ do not claim the cut was honest.
            eprintln!(
                "storage_probe: cannot stat {} ({e}) — cannot confirm loss; NOT a pass.",
                marker.display()
            );
            ExitCode::from(2)
        }
    }
}

/// Entry point: `storage_probe <write-unsynced-marker|verify-marker-gone> <dir>`.
///
/// A missing argument or an unknown sub-command prints the usage text and exits 2.
/// Arguments after `<dir>` are ignored.
fn main() -> ExitCode {
    let args: Vec<String> = std::env::args().collect();
    let arg0 = args.first().map(String::as_str).unwrap_or("storage_probe");
    // Checked lookups instead of indexing: no panic path on a short command line.
    match (args.get(1).map(String::as_str), args.get(2)) {
        (Some("write-unsynced-marker"), Some(dir)) => write_unsynced_marker(Path::new(dir)),
        (Some("verify-marker-gone"), Some(dir)) => verify_marker_gone(Path::new(dir)),
        _ => usage(arg0),
    }
}