From 345d8531313955f74ad4a32724dbd74e4fb09ea5 Mon Sep 17 00:00:00 2001
From: Dario <zeugmaster@protonmail.com>
Date: Fri, 22 May 2026 01:50:53 +0200
Subject: [PATCH] Re-anchor benchmark to the NUT-00 BLS12-381 (v3) spec
 (nuts#371)

Track the actual spec (cashubtc/nuts#371) instead of the nutshell PR #999
implementation draft. The core BDHKE already matched; this adds what the
spec mandates and regenerates all numbers on the ESP32-C3.

esp32c3-bench-blst:
- Mandatory point validation (NUT-00 Point Validation): every received
  B_/C_/C (G1) and K (G2) is uncompressed from canonical bytes and rejected
  unless on-curve, non-identity, and in the prime-order subgroup. Timed ops
  take the wire bytes so validation is in the measured cost; each keyset key
  is validated once.
- Deterministic Fiat-Shamir batch weights: BLS_BATCH_DST transcript ->
  SHA-256 challenge -> per-proof rejection sampling in Fr* against
  BLS_FR_ORDER, replacing arbitrary scalars.
- Spec-conformance gate against the NUT-00 test vectors (tests/00-tests.md):
  single-proof Y/K/B_/C_/C plus the batch challenge and both weights, all
  byte-exact on-chip; MPI mul_mont stays bit-exact.
- New point-validation primitive timings.

Docs (README, RESULTS, Cargo.toml, legacy/*): re-anchor PR #999 -> nuts#371,
K2 -> K, DLEQ scoped to secp256k1 (NUT-12). Numbers regenerated for portable-C
(State 1) and MPI-all (State 3): ~4.5x across the board; typical 10-proof
token ~0.93 s (MPI), at parity with the secp256k1+DLEQ wallet.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 Cargo.toml                             |   7 +-
 README.md                              |  68 +--
 RESULTS.md                             | 260 +++++++-----
 esp32c3-bench-blst/src/main.rs         | 560 ++++++++++++++++++-------
 legacy/README.md                       |  12 +-
 legacy/crypto/src/lib.rs               |  16 +-
 legacy/esp32c3-bench/src/main.rs       |   4 +-
 legacy/host-bench/benches/cashu_bls.rs |   6 +-
 8 files changed, 625 insertions(+), 308 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 990cde7..c316cf1 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,12 +1,13 @@
 [workspace]
 resolver = "2"
 members = [
-    # The bench that matters — ports cashubtc/nutshell PR #999 (multiplicative
-    # blinding, blst + MPI peripheral). See README.md / RESULTS.md.
+    # The bench that matters — implements NUT-00's BLS12-381 (v3) protocol
+    # (cashubtc/nuts#371): multiplicative blinding, mandatory point validation,
+    # Fiat-Shamir batch weights, blst + MPI peripheral. See README.md / RESULTS.md.
     "esp32c3-bench-blst",
     # Original-ESP32 (Xtensa) MPI hardware microbench.
     "esp32-bench-mpi",
-    # SUPERSEDED: an early mock with *additive* blinding (not PR #999) on the
+    # SUPERSEDED: an early mock with *additive* blinding (not NUT-00 v3) on the
     # pure-Rust zkcrypto backend. Kept only for the historical pure-Rust-vs-blst
     # comparison; see legacy/README.md and legacy/crypto/src/lib.rs.
     "legacy/crypto",
diff --git a/README.md b/README.md
index 0b6777f..a68ebd5 100644
--- a/README.md
+++ b/README.md
@@ -1,42 +1,54 @@
 # bls-bench
 
 Benchmarks the BLS12-381 cryptography that Cashu would need if it migrates
-BDHKE from secp256k1 to BLS12-381 — tracking [cashubtc/nutshell PR #999][pr]
-(`feat(crypto): migrate BDHKE to BLS12-381 (v3 keysets)`) — running on an
-**ESP32-C3**, including a path that offloads the field arithmetic to the chip's
-RSA/MPI peripheral.
+BDHKE from secp256k1 to BLS12-381 — implementing [NUT-00's BLS12-381 (v3)
+protocol][pr] (cashubtc/nuts PR #371, keysets with version byte `02`) — running
+on an **ESP32-C3**, including a path that offloads the field arithmetic to the
+chip's RSA/MPI peripheral.
 
-[pr]: https://github.com/cashubtc/nutshell/pull/999
+[pr]: https://github.com/cashubtc/nuts/pull/371
 
 ## The bench that matters: `esp32c3-bench-blst/`
 
-This is the one to look at. It ports PR #999's BLS scheme faithfully (see
-`esp32c3-bench-blst/src/main.rs` and the comparison notes below):
+This is the one to look at. It implements NUT-00's BLS12-381 (v3) protocol
+faithfully (see `esp32c3-bench-blst/src/main.rs`), and a startup gate checks it
+against the spec's test vectors (`tests/00-tests.md`) byte-for-byte:
 
-- **Multiplicative blinding** — `B' = r·Y`, `C' = a·B'`, `C = r⁻¹·C' = a·Y`.
+- **Multiplicative blinding** — `B_ = r·Y`, `C_ = a·B_`, `C = r⁻¹·C_ = a·Y`.
   No point additions in the BDHKE steps, no `− r·K` unblind.
-- **Mint pubkey `K2 = a·G2` on G2 only** — 96-byte keyset keys; no G1 mint
-  key. (Additive blinding would need the key on *both* G1 and G2 = 144 bytes —
-  that's why the PR went multiplicative.)
+- **Mint pubkey `K = a·G2` on G2 only** — 96-byte keyset keys; no G1 mint key.
+  (Additive blinding would need the key on *both* G1 and G2 = 144 bytes —
+  that's why the spec is multiplicative.)
 - **Hash-to-G1 via RFC 9380 SSWU**, DST `CASHU_BLS12_381_G1_XMD:SHA-256_SSWU_RO_`.
-- **Wallet verify** `e(C, G2) == e(Y, K2)`; **batch verify** collapses N proofs
-  into `1 + U` Miller loops (U = unique keysets) via random linear combinations.
-- **DLEQ removed** — pairings make it redundant.
+- **Mandatory point validation** (NUT-00 §Point Validation, flagged CRITICAL):
+  every received `B_`/`C_`/`C`/`K` is decompressed from canonical bytes and
+  rejected unless on-curve, non-identity, and in the prime-order subgroup
+  (`uncompress` + `is_inf` + `in_g1`/`in_g2`). The mint validates `B_` before signing
+  and `C` in `keyed_verification`; the wallet validates `K`, `C_`, and `C`.
+- **Wallet verify** `e(C, G2) == e(Y, K)`; **batch verify** collapses N proofs
+  into `1 + U` Miller loops (U = unique keysets), with the random-linear-
+  combination weights derived deterministically via a Fiat-Shamir SHA-256
+  transcript + per-proof rejection sampling in `Fr*` (`BLS_BATCH_DST`).
+- **No DLEQ for v3** — NUT-12 scopes DLEQ to secp256k1; the pairing check
+  replaces it.
 
 **Backend:** `blst` 0.3.16, vendored + patched (`esp32c3-bench-blst/vendor/blst/`,
 wired via `[patch.crates-io]`). On RV32IMC blst has no asm path, so it falls back
 to portable C — and the patch routes blst's Montgomery multiply *and* squaring
 (`mul_mont_n`, `mul_mont_nonred_n`, `sqr_mont_382x`) through the C3's RSA/MPI
 peripheral via `mpi_mul_mont_n` in `esp32c3-bench-blst/src/mpi.rs`. The bench
-prints a correctness gate (`keyed_verification` / `pairing_verification` must
-round-trip) and a bit-exact MPI-vs-software `mul_mont` diagnostic.
+prints a spec-conformance gate (the NUT-00 test vectors — `Y/K/B_/C_/C`, the
+batch challenge, and the rejection-sampled weights — must match byte-for-byte)
+and a bit-exact MPI-vs-software `mul_mont` diagnostic.
 
 **Headline numbers** (ESP32-C3 rev v0.4 @ 160 MHz; full table in
-[`RESULTS.md`](RESULTS.md)): portable-C blst does `bdhke_full_round` in 421 ms
-and `pairing_verification` in 1.21 s; with the MPI peripheral those drop to
-**71.5 ms** and **278 ms** — ~4-5× across the board. A 10-proof token verifies
-in **~0.8-1.1 s** on the bare chip — under today's secp256k1+DLEQ wallet
-(~1.5 s), no coprocessor.
+[`RESULTS.md`](RESULTS.md)): portable-C blst does `bdhke_full_round` in 459 ms
+and `pairing_verification` in 1.31 s; with the MPI peripheral those drop to
+**104 ms** and **304 ms** — ~4.5× across the board. A typical 10-proof token
+(all one keyset) batch-verifies in **~0.9 s** on the bare chip, a realistic
+3–4-keyset mix in **~1.1-1.2 s** — at parity with today's secp256k1+DLEQ wallet
+(~1.5 s for 10 proofs), no coprocessor. All figures now include the spec's
+mandatory point validation on every received point.
 
 ## `esp32-bench-mpi/`
 
@@ -49,26 +61,26 @@ with `cargo +esp run --release` from that crate.
 ## ⚠️ Superseded: `legacy/`
 
 The `legacy/crypto/`, `legacy/esp32c3-bench/`, `legacy/host-bench/` crates
-predate PR #999 as a reference. They mock an **additive**-blinding BDHKE
-(`B' = Y + r·G`, `C = C' − r·K`) — which is **not** what PR #999 does, and would
-force the mint key onto both G1 (for the `− r·K` unblind) and G2 (for the
+predate the spec. They mock an **additive**-blinding BDHKE
+(`B' = Y + r·G`, `C = C' − r·K`) — which is **not** what NUT-00 (v3) does, and
+would force the mint key onto both G1 (for the `− r·K` unblind) and G2 (for the
 pairing check) = 144-byte keyset keys. They also still hash with a placeholder
 DST and use the pure-Rust `bls12_381` (zkcrypto) backend rather than `blst`.
 
 They live under [`legacy/`](legacy/) and are **kept only for the historical
 pure-Rust-vs-`blst` per-primitive comparison** (the ~9-40×-per-op gap is
 interesting; see `RESULTS.md`). Do not use them for protocol-accurate numbers —
-use `esp32c3-bench-blst/` for anything that needs to match PR #999.
+use `esp32c3-bench-blst/` for anything that needs to match the NUT-00 spec.
 
 ## Layout
 
 ```
 bls-bench/
-├── esp32c3-bench-blst/     ← the bench that matters: blst + MPI, matches PR #999
+├── esp32c3-bench-blst/     ← the bench that matters: blst + MPI, matches NUT-00 v3
 │   ├── src/mpi.rs            MPI peripheral driver + mpi_mul_mont_n
 │   └── vendor/blst/          vendored+patched blst 0.3.16
 ├── esp32-bench-mpi/        ← original-ESP32 (Xtensa) MPI hardware microbench
-└── legacy/                 ← ⚠️ SUPERSEDED — additive-blinding mock, not PR #999
+└── legacy/                 ← ⚠️ SUPERSEDED — additive-blinding mock, not NUT-00 v3
     ├── crypto/               additive-blinding BDHKE mock (zkcrypto backend)
     ├── esp32c3-bench/        runs the mock on ESP32-C3
     └── host-bench/           criterion host baseline for the mock
@@ -82,7 +94,7 @@ serial. (`esp32-bench-mpi` targets the original ESP32, Xtensa LX6 @ 240 MHz.)
 ## Running
 
 ```bash
-# The PR-#999-accurate bench (board connected, espflash installed)
+# The NUT-00-v3-accurate bench (board connected, espflash installed)
 cd esp32c3-bench-blst && cargo run --release
 
 # Original-ESP32 MPI hardware microbench (also needs `espup install`)
diff --git a/RESULTS.md b/RESULTS.md
index 323d52a..b379934 100644
--- a/RESULTS.md
+++ b/RESULTS.md
@@ -1,35 +1,59 @@
-# Results — Cashu-on-BLS on ESP32-C3, MPI-accelerated
+# Results — Cashu-on-BLS (NUT-00 v3) on ESP32-C3, MPI-accelerated
 
-All numbers below are from **`esp32c3-bench-blst/`** — the bench that ports
-[cashubtc/nutshell PR #999][pr] faithfully: multiplicative blinding
-(`B' = r·Y`, `C = r⁻¹·C' = a·Y`), mint pubkey `K2 = a·G2` on G2 only
+All numbers below are from **`esp32c3-bench-blst/`** — the bench that implements
+[NUT-00's BLS12-381 (v3) protocol][pr] (cashubtc/nuts PR #371): multiplicative
+blinding (`B_ = r·Y`, `C = r⁻¹·C_ = a·Y`), mint pubkey `K = a·G2` on G2 only
 (96-byte keyset keys), RFC 9380 SSWU hash-to-G1 with DST
-`CASHU_BLS12_381_G1_XMD:SHA-256_SSWU_RO_`, wallet-side `e(C, G2) == e(Y, K2)`,
-batch verify via random linear combinations grouped by unique keyset, DLEQ
-removed. Backend: `blst` 0.3.16, vendored + patched, portable C on RV32IMC,
-with the Montgomery multiply/square offloaded to the chip's RSA/MPI peripheral.
-
-The `legacy/crypto/` + `legacy/esp32c3-bench/` + `legacy/host-bench/` crates are a *superseded*
-mock that used additive blinding and the pure-Rust `bls12_381` backend — see
-the bottom of this file for the (still-interesting) pure-Rust-vs-`blst` gap,
-and `legacy/crypto/src/lib.rs` for why those are no longer the reference.
-
-[pr]: https://github.com/cashubtc/nutshell/pull/999
+`CASHU_BLS12_381_G1_XMD:SHA-256_SSWU_RO_`, wallet-side `e(C, G2) == e(Y, K)`,
+**mandatory point validation** on every received point, and **deterministic
+Fiat-Shamir batch verification** (SHA-256 transcript + rejection-sampled weights
+in `Fr*`). No DLEQ — NUT-12 scopes that to secp256k1. Backend: `blst` 0.3.16,
+vendored + patched, portable C on RV32IMC, with the Montgomery multiply/square
+offloaded to the chip's RSA/MPI peripheral.
+
+A startup **spec-conformance gate** checks the implementation against the NUT-00
+test vectors (`tests/00-tests.md`) byte-for-byte; every run below printed
+`match=true` for `Y/K/B_/C_/C`, the batch `challenge`, and both rejection-sampled
+weights, plus a bit-exact MPI-vs-software `mul_mont` diagnostic.
+
+The `legacy/` crates are a *superseded* mock (additive blinding, pure-Rust
+`bls12_381` backend) — see the appendix for the (still-interesting)
+pure-Rust-vs-`blst` gap, and `legacy/crypto/src/lib.rs` for why they're no longer
+the reference.
+
+[pr]: https://github.com/cashubtc/nuts/pull/371
+
+## What changed vs the pre-spec numbers
+
+This revision tracks the *spec* (nuts#371) rather than an implementation draft,
+and the timings rose accordingly — they now pay for work the spec mandates that
+the earlier bench skipped:
+
+- **Point validation (NUT-00 §Point Validation, flagged CRITICAL).** Every
+  received `B_`/`C_`/`C` (G1) and `K` (G2) is decompressed from canonical bytes
+  and checked for on-curve, non-identity, and prime-order-subgroup membership
+  (`uncompress` + `in_g1`/`in_g2`). That adds one G1 validation (~12 ms MPI) to
+  each per-proof op that receives a point (`step1_alice` receives none) and to
+  every batched proof, and one G2 validation (~21 ms MPI) per distinct keyset.
+- **Fiat-Shamir batch weights.** Batch verify now derives its random-linear-
+  combination weights from a SHA-256 transcript over the proofs with per-proof
+  rejection sampling against `BLS_FR_ORDER` (≈ 2.2 hashes/proof), instead of
+  using arbitrary scalars.
 
 ## TL;DR
 
-- **blst portable C on the bare RV32:** `bdhke_full_round` 421 ms,
-  `pairing_verification` 1.21 s, batch-verifying a 10-proof token ≈ 3.7 s.
+- **blst portable C on the bare RV32:** `bdhke_full_round` 459 ms,
+  `pairing_verification` 1.31 s, a typical 10-proof token ≈ 4.2 s.
   Interactive-ish but slow.
 - **Routing blst's Montgomery mul/sqr through the C3's RSA/MPI peripheral**
   (a vendored `no_asm.h` patch — `mul_mont_n`, `mul_mont_nonred_n`,
-  `sqr_mont_382x` all call the hardware): **~4-5× across the board**.
-  `bdhke_full_round` **71.5 ms**, `pairing_verification` **278 ms**, a
-  10-proof token **≈ 0.8-1.1 s** — under today's secp256k1+DLEQ wallet
-  (~1.5 s for 10 proofs), on the same chip, no coprocessor.
-- Correctness gate green throughout (`keyed_verification` / `pairing_verification`
-  round-trip the issued proof), and an in-bench diagnostic confirms the MPI
-  `mul_mont` is bit-exact vs blst's software version.
+  `sqr_mont_382x` all call the hardware): **~4.5× across the board.**
+  `bdhke_full_round` **104 ms**, `pairing_verification` **304 ms**, a typical
+  10-proof token (one keyset) **≈ 0.93 s**, a realistic 3–4-keyset mix
+  **≈ 1.1-1.2 s** — at parity with today's secp256k1+DLEQ wallet (~1.5 s for
+  10 proofs), on the same chip, no coprocessor.
+- Spec-conformance gate green throughout (the NUT-00 test vectors round-trip
+  byte-for-byte), and the MPI `mul_mont` is bit-exact vs blst's software.
 
 ## Setup
 
@@ -42,84 +66,88 @@ and `legacy/crypto/src/lib.rs` for why those are no longer the reference.
   RSA peripheral.
 - **Profile:** release, `lto = "fat"`, `opt-level = 3`, `codegen-units = 1`,
   `-O3` for the C side. C cross-compiled with Apple Clang via the `cc` crate.
-- Each line below: 3-32 timed reps after one warm-up; min / median / max in µs.
-
-## ESP32-C3 — Cashu-on-BLS operations, three states
-
-State 1 = portable-C blst (no MPI). State 2 = MPI for `mul_mont_n` only.
-State 3 = MPI for everything in the Fp/Fp²/Fp⁶/Fp¹² tower (`mul_mont_n` +
-`mul_mont_nonred_n` + `sqr_mont_382x` rewritten to reduced-input form).
-"speedup" is State 3 vs State 1.
-
-| Operation | portable C | MPI (`mul_mont_n`) | **MPI (all)** | speedup |
-|---|---:|---:|---:|---:|
-| `step1_alice` (blind, `B' = r·Y`) | 195 ms | 47.8 ms | **32.8 ms** | 5.9× |
-| `step2_bob` (mint sign, `C' = a·B'`) | 132 ms | 18.5 ms | **18.5 ms** | 7.1× |
-| `step3_alice` (unblind, `C = r⁻¹·C'`) | 93 ms | 19.8 ms | **20.0 ms** | 4.6× |
-| `keyed_verification` (`C == a·Y`) | 199 ms | 51.0 ms | **36.0 ms** | 5.5× |
-| `bdhke_full_round` (step1+step2+step3) | 421 ms | 86.5 ms | **71.5 ms** | 5.9× |
-| `pairing_verification` (1 proof) | 1205 ms | 292.6 ms | **278.7 ms** | 4.3× |
-| `batch_verify_same_key`  N=1 | 1471 ms | 333 ms | **319 ms** | 4.6× |
-| ⋯ N=2 | 1722 ms | 402 ms | **373 ms** | 4.6× |
-| ⋯ N=4 | 2223 ms | 539 ms | **481 ms** | 4.6× |
-| ⋯ N=8 | 3226 ms | 815 ms | **697 ms** | 4.6× |
-| ⋯ N=16 | 5231 ms | 1367 ms | **1128 ms** | 4.6× |
-| ⋯ N=32 | 9241 ms | 2470 ms | **1991 ms** | 4.6× |
-| `batch_verify_distinct`  N=1 | 1471 ms | 333 ms | **319 ms** | 4.6× |
-| ⋯ N=2 | 2096 ms | 482 ms | **453 ms** | 4.6× |
-| ⋯ N=4 | 3348 ms | 779 ms | **721 ms** | 4.6× |
-| ⋯ N=8 | 5851 ms | 1372 ms | **1257 ms** | 4.7× |
-| ⋯ N=16 | 10858 ms | 2560 ms | **2328 ms** | 4.7× |
-
-`step2_bob` and `step3_alice` don't move between State 2 and State 3 — they're
-pure G1 scalar multiplications, already all `mul_mont_n`. The State-3 wins land
-where `hash_to_g1` (Fp square roots via `sqr_n_mul_mont_383`) and `final_exp`
-(Fp¹² cyclotomic squaring via `sqr_mont_382x`) are: `step1_alice` −31%,
-`keyed_verification` −29%, and the batch per-proof slope dropped 69 → 54 ms.
+- Each line below: 3-32 timed reps after one warm-up; the tables show medians.
+
+## ESP32-C3 — Cashu-on-BLS (v3) operations
+
+**State 1** = portable-C blst (no MPI). **State 3** = MPI for the whole
+Fp/Fp²/Fp⁶/Fp¹² tower (`mul_mont_n` + `mul_mont_nonred_n` + `sqr_mont_382x`).
+"speedup" is State 1 / State 3. (The original write-up also profiled an
+intermediate *State 2* — MPI for `mul_mont_n` only; this revision re-measures the
+two endpoints against the spec-accurate code and omits it.)
+
+| Operation | portable C | **MPI (all)** | speedup |
+|---|---:|---:|---:|
+| `step1_alice` (blind, `B_ = r·Y`) | 152 ms | **34.5 ms** | 4.4× |
+| `step2_bob` (validate `B_` + sign `C_ = a·B_`) | 153 ms | **34.0 ms** | 4.5× |
+| `step3_alice` (validate `C_` + unblind `C = r⁻¹·C_`) | 154 ms | **35.1 ms** | 4.4× |
+| `keyed_verification` (validate `C`, check `C == a·Y`) | 214 ms | **47.4 ms** | 4.5× |
+| `bdhke_full_round` (step1+step2+step3) | 459 ms | **104 ms** | 4.4× |
+| `pairing_verification` (1 proof) | 1311 ms | **304 ms** | 4.3× |
+| `batch_verify_same_key`  N=1 | 1503 ms | **347 ms** | 4.3× |
+| ⋯ N=2 | 1806 ms | **412 ms** | 4.4× |
+| ⋯ N=4 | 2411 ms | **542 ms** | 4.4× |
+| ⋯ N=8 | 3621 ms | **802 ms** | 4.5× |
+| ⋯ N=16 | 6041 ms | **1323 ms** | 4.6× |
+| ⋯ N=32 | 10881 ms | **2365 ms** | 4.6× |
+| `batch_verify_distinct`  N=1 | 1503 ms | **347 ms** | 4.3× |
+| ⋯ N=2 | 2256 ms | **510 ms** | 4.4× |
+| ⋯ N=4 | 3762 ms | **838 ms** | 4.5× |
+| ⋯ N=8 | 6773 ms | **1494 ms** | 4.5× |
+| ⋯ N=16 | 12795 ms | **2806 ms** | 4.6× |
+
+`same_key` keeps all N proofs under one keyset (`K` validated once); `distinct`
+gives each proof its own keyset, so the RHS gets one Miller loop + one G2
+validation per key — the batch worst case. The MPI speedup is uniform (~4.3-4.6×)
+because every operation bottoms out in the same Fp Montgomery kernel.
 
 ### Primitives (measured directly)
 
 | Primitive | portable C blst | **MPI (all)** | pure-Rust `bls12_381` |
 |---|---:|---:|---:|
-| `hash_to_g1` — RFC 9380 SSWU, `expand_message_xmd<SHA-256>`, Cashu DST | 60.7 ms | **13.4 ms** | 884 ms |
-| `g1_scalar_mul` — one mul of an arbitrary G1 point | 128 ms | **18.5 ms** | 3.81 s |
+| `hash_to_g1` — RFC 9380 SSWU, `expand_message_xmd<SHA-256>`, Cashu DST | 60.6 ms | **13.4 ms** | 884 ms |
+| `g1_scalar_mul` — one mul of an arbitrary G1 point | 88.6 ms | **18.4 ms** | 3.81 s |
+| `point_validate_g1` — `uncompress` + `in_g1` (per received G1: `B_`/`C_`/`C`) | 61.1 ms | **12.2 ms** | — |
+| `point_validate_g2` — `uncompress` + `in_g2` (per keyset key `K`) | 87.8 ms | **21.0 ms** | — |
 
-(`hash_to_g1` does: `expand_message_xmd` → two Fp elements → SSWU map each →
-add → clear cofactor. `step1_alice` ≈ `hash_to_g1` + one `g1_scalar_mul`:
-13.4 + 18.5 ≈ 32 ms, matching the 32.8 ms measured. The pure-Rust column is the
-legacy `bls12_381` mock — different DST, treat as order-of-magnitude.)
+(`step1_alice` ≈ `hash_to_g1` + `g1_scalar_mul` + serialisation: 13.4 + 18.4 ≈
+32 ms before serialisation, in line with the 34.5 ms measured. The pure-Rust
+column is the legacy `bls12_381` mock — different DST, no validation — treat as order-of-magnitude.)
 
 ### F_p Montgomery multiply — the kernel
 
 | | portable C blst | MPI peripheral |
 |---|---:|---:|
-| one `blst_fp_mul` (single call, incl. operand I/O) | 44 µs | 7-8 µs |
-| `blst_fp_mul` ×100 chained (per-call, steady state) | 43.4 µs | 6.3 µs |
+| one `blst_fp_mul` (single call, incl. operand I/O) | 44 µs | 8 µs |
+| `blst_fp_mul` ×100 chained (per-call, steady state) | 43.6 µs | 6.4 µs |
 
-**≈ 6.9× per Montgomery multiplication.** Of the 6.3 µs MPI per-call cost,
+**≈ 6.8× per Montgomery multiplication.** Of the 6.4 µs MPI per-call cost,
 roughly 1.5 µs is the actual peripheral multiply and ~5 µs is the operand RAM
 I/O — which is why the integration caches the modulus / `n0` / zero exponent
 resident in the peripheral and skips re-writing operand A on `t = t·x` chains.
+(In State 3 the bench's own `blst_fp_mul` reads ~7 µs because it too routes
+through the peripheral; the 44 µs above is the State-1 portable-C measurement.)
 
-## Linear models & wallet-flow projections (State 3)
+## Linear models & wallet-flow projections (State 3, MPI)
 
-- same-keyset batch verify: **≈ 265 ms + 54 ms·N**
-- distinct-keyset batch verify: **≈ 185 ms + 134 ms·N**
-  (the ~134 ms/proof ≈ 54 ms per-proof work + ~80 ms for the extra RHS Miller
-  loop the new keyset introduces)
+- same-keyset batch verify: **≈ 282 ms + 65 ms·N**
+- distinct-keyset batch verify: **≈ 183 ms + 164 ms·N**
+  (each extra unique keyset adds ≈ 99 ms — one RHS Miller loop + its `fp12_mul`
+  + one G2 validation)
 
 For a **10-proof token** (Cashu denominations are powers of two, so a token's
 proof count = popcount of the amount):
 
 | Scenario | portable C | MPI (all) |
 |---|---:|---:|
-| all proofs under one keyset (typical) | 3.72 s | **≈ 805 ms** |
-| every proof a distinct keyset (worst case) | 7.15 s | **≈ 1.53 s** |
-| realistic mix (3-4 unique keysets) | ~4 s | **≈ 1.0-1.1 s** |
+| all proofs under one keyset (typical) | 4.23 s | **≈ 0.93 s** |
+| realistic mix (3–4 unique keysets) | ~5.1-5.6 s | **≈ 1.1-1.2 s** |
+| every proof a distinct keyset (worst case) | 8.28 s | **≈ 1.82 s** |
 
 For comparison, today's secp256k1 wallet with NUT-12 DLEQ verification handles a
 10-proof token in roughly 1.5 s on the same chip — so **MPI-accelerated
-Cashu-on-BLS is at parity or faster**, with no extra hardware.
+Cashu-on-BLS is at parity** for typical and realistic loads (~0.9-1.2 s), with
+the all-distinct-keyset worst case (~1.8 s) modestly above, and no extra hardware.
 
 ## How the MPI integration works
 
@@ -144,36 +172,54 @@ and exits — recovering the ~50% the C3's standard "double mul" modmult wastes,
 [per the ESP32 forum][forum]). For the rare 256-bit Fr `mul_mont` it falls back
 to a portable Rust schoolbook Montgomery multiply.
 
-A pre-existing bug in the bench's BDHKE was also fixed along the way: scalars
-were handed to `blst_p1_mult` / `blst_p2_mult` in *big-endian*, but those take
-*little-endian*, so `step1` used a byte-reversed `r` and `step3` a byte-reversed
-`r⁻¹` — `reverse(r⁻¹)·reverse(r) ≠ 1`, the blinding never cancelled, and
-verification silently failed. Fixed (`scalar_to_be` → `scalar_to_le`) and a
-correctness gate added so it can't regress quietly again.
+A pre-existing bug in the bench's BDHKE was fixed earlier: scalars were handed to
+`blst_p1_mult` / `blst_p2_mult` in *big-endian*, but those take *little-endian*,
+so `step1` used a byte-reversed `r` and `step3` a byte-reversed `r⁻¹` —
+`reverse(r⁻¹)·reverse(r) ≠ 1`, the blinding never cancelled, and verification
+silently failed. Fixed (`scalar_to_le`), and the spec-conformance gate now makes
+any such regression loud.
 
 [forum]: https://www.esp32.com/viewtopic.php?t=23830
 
+## Spec conformance
+
+The startup gate reproduces the NUT-00 test vectors (`tests/00-tests.md`) on-chip:
+
+- **Single-proof round-trip** (secret `"test_message"`, `r = 3`, `a = 2`):
+  `Y = hash_to_curve_G1(secret)`, `K = a·G2`, `B_ = r·Y`, `C_ = a·B_`,
+  `C = r⁻¹·C_` — all five compressed encodings match the vector byte-for-byte,
+  and `e(C, G2) == e(Y, K)`.
+- **Batch** (two proofs under `K = 2·G2`): the SHA-256 `challenge` and both
+  rejection-sampled weights match the vector — `weight_1` is accepted at
+  `ctr = 4`, `weight_2` at `ctr = 0`, exercising the reject-and-resample path —
+  and the multi-pairing check returns `true`.
+
+One subtlety the `weight_1` vector pins down: the weight derivation must compare
+each candidate hash against `BLS_FR_ORDER` *directly*. `blst_scalar_from_be_bytes`
+reduces an out-of-range input modulo the order (and reports success) rather than
+rejecting it, so using its boolean as the `< order` test would silently accept a
+reduced value at `ctr = 0` instead of resampling to `ctr = 4`.
+
 ## Caveats
 
-- **The verification timings measure the crypto functions, not the
-  proof-deserialization around them.** PR #999's `pairing_verification` /
-  `batch_pairing_verification` take already-parsed `PublicKey` objects, so the
-  bench matches the functions exactly — but a wallet verifying a *received*
-  token first `uncompress`es each proof's 48-byte G1 `C` (≈ 1 Fp square root,
-  ~3 ms with MPI). For a 10-proof token that's ≈ +30 ms on the ~800 ms total
-  → the bench under-counts the end-to-end cost by ~4%. (Keyset `K2`s are
-  uncompressed once at keyset load and cached.)
-- **PR #999 has no subgroup-membership checks** on received points
-  (`BlsPublicKey(bytes.fromhex(...))` just `uncompress`es). A hardened version
-  would add `blst_p1_in_g1(C)` / `blst_p2_in_g2(K2)` (≈ +1 endomorphism check
-  per point) to defend against subgroup-confusion attacks on the pairing.
-  Not modelled here either.
-- `keyed_verification` compares points by compressing both to 48 bytes; the PR
-  uses native point equality — a slight *over*-estimate of that mint-side op.
-- Test secrets are short ASCII; real Cashu secrets are longer JSON-ish strings
-  → at most one extra SHA-256 block in `expand_message_xmd` → negligible.
-- `mul_mont_sparse_256` (256-bit Fr) is still software — rare in this workload
-  (`blst_fr` conversions in `step3`'s inversion path), a few calls per round.
+- **Mint keys are validated once per keyset, not per proof.** A wallet loads a
+  keyset's `K`, validates it once (`uncompress` + `in_g2`, ~21 ms MPI) and caches
+  it; `batch_pairing_verification` mirrors this (each distinct `K` validated on
+  first use). The per-proof received points (`B_`, `C_`, `C`) are validated
+  inline every time, since they arrive fresh. (The standalone
+  `pairing_verification` row validates its `K` on the call — a cold verify; a
+  warm wallet with the keyset already cached saves that ~21 ms.)
+- **The verification timings measure the crypto, not proof JSON/CBOR parsing.**
+  The bench feeds already-deserialised secrets and the compressed point bytes; a
+  real wallet also parses the surrounding token structure.
+- `keyed_verification` compares `C` to `a·Y` by compressing both to 48 bytes; a
+  mint using native point equality would shave a little off that mint-side op.
+- Test secrets are short ASCII; real Cashu secrets are longer (a 64-char hex
+  string is recommended) → at most one extra SHA-256 block in
+  `expand_message_xmd` → negligible.
+- `mul_mont_sparse_256` (256-bit `Fr`) is still software — rare in this workload
+  (the `blst_fr` inversion in `step3` plus the batch-weight scalars), a few calls
+  per round.
 - Single-task on a quiet chip. With Wi-Fi + a UI competing, expect 1.5-2×
   degradation.
 - `blst`'s portable C is constant-time but has not been side-channel-audited on
@@ -186,14 +232,14 @@ cd esp32c3-bench-blst && cargo run --release
 ```
 
 Flashes the C3 and streams the timings over UART. The run prints the F_p kernel
-microbench, the correctness gate, the MPI-vs-software bit-exact diagnostic, and
-then the full Cashu-on-BLS operation sweep.
+microbench, the spec-conformance gate, the MPI-vs-software bit-exact diagnostic,
+and then the full Cashu-on-BLS operation sweep.
 
 ## Appendix — pure-Rust (`bls12_381`) vs `blst`, on RV32
 
-From the *superseded* `legacy/crypto/` + `legacy/esp32c3-bench/` mock (additive blinding, a
-placeholder DST — so **not** an op-for-op match with the table above; treat as
-order-of-magnitude only). It's why this project uses `blst`:
+From the *superseded* `legacy/crypto/` + `legacy/esp32c3-bench/` mock (additive
+blinding, a placeholder DST — so **not** an op-for-op match with the table above;
+treat as order-of-magnitude only). It's why this project uses `blst`:
 
 | Operation | `bls12_381` 0.8 (pure Rust) | `blst` portable C | gap |
 |---|---:|---:|---:|
@@ -206,4 +252,4 @@ order-of-magnitude only). It's why this project uses `blst`:
 The win is algorithmic, not assembly — dedicated squaring, lazy reduction in
 F_p¹², optimal addition chains, GLV decomposition, and Clang generating better
 code from blst's carry-aware C than rustc does from trait-heavy Rust. On top of
-that, the MPI-peripheral patch above adds another ~4-5× over portable-C blst.
+that, the MPI-peripheral patch above adds another ~4.5× over portable-C blst.
diff --git a/esp32c3-bench-blst/src/main.rs b/esp32c3-bench-blst/src/main.rs
index 45351a4..99c5da1 100644
--- a/esp32c3-bench-blst/src/main.rs
+++ b/esp32c3-bench-blst/src/main.rs
@@ -1,18 +1,29 @@
-//! Cashu-on-BLS bench mirroring the nutshell fork at
-//! a1denvalu3/nutshell@feature/bls12-381-crypto.
+//! Cashu-on-BLS bench implementing **NUT-00's BLS12-381 (v3) protocol** — the
+//! pairing-based BDHKE specified in cashubtc/nuts PR #371, for keysets with
+//! version byte `02`.
 //!
-//! Differences from the previous bench (which mocked an additive-blinding
-//! BDHKE plus optional Schnorr-style DLEQ):
+//! What the v3 protocol does (and this bench measures):
 //!
-//! - **Multiplicative blinding.** `step1_alice` does `B' = r·Y`; `step3_alice`
-//!   does `C = r⁻¹ · C'`. No point additions. The `r⁻¹` uses `blst_fr_inverse`.
-//! - **Mint pubkey lives in G2.** `K2 = sk·G2`. Wallet verifies by pairing.
-//! - **DLEQ is removed.** No NUT-12 proof is generated or checked. The wallet
-//!   verifies issuance directly via `e(C, G2) == e(Y, K2)`.
-//! - **Batch verify.** The fork's `batch_pairing_verification` collapses N
-//!   pairings into 2 miller loops + 1 final_exp via random linear combinations.
-//!   We bench it across a range of N so the linear-in-N preprocessing and the
-//!   constant pairing tail can be separated.
+//! - **Multiplicative blinding.** `step1_alice` does `B_ = r·Y`; `step3_alice`
+//!   does `C = r⁻¹·C_`. No point additions. The `r⁻¹` uses `blst_fr_inverse`.
+//! - **Mint pubkey lives in G2.** `K = a·G2` (the spec calls it `K`; the code
+//!   names the local `k2` to signal the group). Wallet verifies by pairing
+//!   `e(C, G2) == e(Y, K)`. No DLEQ — NUT-12 scopes DLEQ to secp256k1 only.
+//! - **Mandatory point validation.** Per NUT-00 §Point Validation (flagged
+//!   CRITICAL), every *received* `B_`, `C_`, `C`, `K` is deserialised from
+//!   canonical compressed bytes and rejected if non-canonical, identity,
+//!   off-curve, or not in the prime-order subgroup (`uncompress` + `is_inf` +
+//!   `in_g1`/`in_g2`). The timed ops take the wire (compressed) bytes and do
+//!   this inside the timed region, matching a mint/wallet validating on receipt.
+//! - **Deterministic batch verify.** `batch_pairing_verification` collapses N
+//!   pairings into a single multi-pairing via a Fiat-Shamir random linear
+//!   combination: weights are derived from a SHA-256 transcript over the proofs
+//!   (`BLS_BATCH_DST`) with per-proof rejection sampling in `Fr*`, not chosen
+//!   arbitrarily. We bench it across a range of N so the linear-in-N
+//!   preprocessing and the constant pairing tail can be separated.
+//! - **Spec-conformance gate.** Startup checks the NUT-00 test vectors
+//!   (`tests/00-tests.md`) byte-for-byte: the single-proof round-trip
+//!   (`Y/K/B_/C_/C`) and the batch challenge + rejection-sampled weights.
 //!
 //! Terminology note: a Cashu wallet operation handles many *proofs* (each
 //! proof is one ecash coin of one binary denomination). A user-facing
@@ -30,7 +41,7 @@ mod mpi;
 
 use alloc::vec::Vec;
 use core::hint::black_box;
-use core::mem::{size_of, MaybeUninit};
+use core::mem::MaybeUninit;
 use esp_hal::clock::CpuClock;
 use esp_hal::time::Instant;
 use esp_println::println;
@@ -39,13 +50,44 @@ use blst::{
     blst_bendian_from_scalar, blst_fp, blst_fp12, blst_fp12_finalverify, blst_fp12_mul,
     blst_fp12_one, blst_fp_from_uint32, blst_fp_mul, blst_fp_sqr, blst_fr, blst_fr_from_scalar,
     blst_fr_inverse, blst_hash_to_g1, blst_lendian_from_scalar, blst_miller_loop, blst_p1,
-    blst_p1_add, blst_p1_affine, blst_p1_compress, blst_p1_mult, blst_p1_to_affine, blst_p2,
-    blst_p2_affine, blst_p2_generator, blst_p2_mult, blst_p2_to_affine, blst_scalar,
-    blst_scalar_from_be_bytes, blst_scalar_from_fr,
+    blst_p1_add, blst_p1_affine, blst_p1_affine_in_g1, blst_p1_affine_is_inf, blst_p1_compress,
+    blst_p1_from_affine, blst_p1_mult, blst_p1_to_affine, blst_p1_uncompress, blst_p2,
+    blst_p2_affine, blst_p2_affine_in_g2, blst_p2_affine_is_inf, blst_p2_compress,
+    blst_p2_generator, blst_p2_mult, blst_p2_to_affine, blst_p2_uncompress,
+    blst_scalar, blst_scalar_from_be_bytes, blst_scalar_from_fr, blst_sha256, BLST_ERROR,
 };
 
+/// Hash-to-curve DST for the v3 G1 random-oracle suite (NUT-00). ASCII bytes,
+/// no trailing null.
 const DST: &[u8] = b"CASHU_BLS12_381_G1_XMD:SHA-256_SSWU_RO_";
 
+/// Domain-separation tag for the batch-verification Fiat-Shamir transcript
+/// (NUT-00 §Batch Verification).
+const BLS_BATCH_DST: &[u8] = b"Cashu_BLS_Batch_v1";
+
+/// BLS12-381 Fr (G1 scalar-field) order, big-endian:
+///   52435875175126190479447740508185965837690552500527637822603658699938581184513
+/// A 32-byte hash is a valid batch weight iff it is non-zero and strictly less
+/// than this (NUT-00 rejection sampling). We compare against it directly:
+/// `blst_scalar_from_be_bytes` *reduces* an out-of-range input modulo the order
+/// (and reports success), so its boolean cannot serve as the `< order` test.
+const BLS_FR_ORDER_BE: [u8; 32] = [
+    0x73, 0xed, 0xa7, 0x53, 0x29, 0x9d, 0x7d, 0x48, //
+    0x33, 0x39, 0xd8, 0x08, 0x09, 0xa1, 0xd8, 0x05, //
+    0x53, 0xbd, 0xa4, 0x02, 0xff, 0xfe, 0x5b, 0xfe, //
+    0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01,
+];
+
+/// Big-endian 256-bit comparison: `true` iff `a < b` as unsigned integers (not constant-time).
+fn be_lt(a: &[u8; 32], b: &[u8; 32]) -> bool {
+    for i in 0..32 {
+        if a[i] != b[i] {
+            return a[i] < b[i]; // most-significant differing byte decides the order
+        }
+    }
+    false // all 32 bytes equal, so `a == b` and `a < b` is false
+}
+
 // ---------------------------------------------------------------------------
 // Heap (same shape as the previous bench).
 // ---------------------------------------------------------------------------
@@ -150,6 +192,25 @@ fn det_scalar(seed: u64) -> blst_scalar {
     s
 }
 
+/// Build the canonical `blst_scalar` for a small integer `n` (test vectors).
+fn small_scalar(n: u8) -> blst_scalar {
+    let mut be = [0u8; 32];
+    be[31] = n; // big-endian: the least-significant byte is the last one
+    let mut s = blst_scalar::default(); // out-parameter for the FFI call below
+    unsafe { blst_scalar_from_be_bytes(&mut s, be.as_ptr(), be.len()) };
+    s
+}
+
+/// Lowercase hex of a byte slice (for comparing against spec test vectors).
+fn to_hex(bytes: &[u8]) -> alloc::string::String {
+    use core::fmt::Write; // needed for `write!` into a `String`
+    let mut out = alloc::string::String::with_capacity(bytes.len() * 2); // 2 digits per byte
+    for b in bytes {
+        let _ = write!(out, "{:02x}", b); // zero-padded, two lowercase digits; result ignored
+    }
+    out
+}
+
 fn fr_from_scalar(s: &blst_scalar) -> blst_fr {
     let mut fr = blst_fr::default();
     unsafe { blst_fr_from_scalar(&mut fr, s) };
@@ -208,13 +269,6 @@ fn p1_mul(p: &blst_p1, scalar: &blst_scalar) -> blst_p1 {
     out
 }
 
-fn p2_mul(p: &blst_p2, scalar: &blst_scalar) -> blst_p2 {
-    let bytes = scalar_to_le(scalar);
-    let mut out = blst_p2::default();
-    unsafe { blst_p2_mult(&mut out, p, bytes.as_ptr(), 256) };
-    out
-}
-
 fn p1_add(a: &blst_p1, b: &blst_p1) -> blst_p1 {
     let mut out = blst_p1::default();
     unsafe { blst_p1_add(&mut out, a, b) };
@@ -234,130 +288,248 @@ fn p2_to_affine(p: &blst_p2) -> blst_p2_affine {
 }
 
 // ---------------------------------------------------------------------------
-// Cashu-on-BLS primitives, matching nutshell's b_dhke.py.
+// Serialisation + point validation (NUT-00 §Point Validation, CRITICAL).
+//
+// Every *received* point is deserialised from canonical compressed bytes and
+// rejected unless it is on-curve, not the identity, and in the prime-order
+// subgroup. `blst_p{1,2}_uncompress` already enforces canonical encoding +
+// on-curve; we add the infinity and subgroup (`in_g1`/`in_g2`) checks.
+// ---------------------------------------------------------------------------
+
+fn compress_g1(p: &blst_p1) -> [u8; 48] {
+    let mut out = [0u8; 48]; // compressed G1 encoding: 48 bytes
+    unsafe { blst_p1_compress(out.as_mut_ptr(), p) }; // SAFETY: `out` holds the 48 output bytes
+    out
+}
+
+fn compress_g2(p: &blst_p2) -> [u8; 96] {
+    let mut out = [0u8; 96]; // compressed G2 encoding: 96 bytes
+    unsafe { blst_p2_compress(out.as_mut_ptr(), p) }; // SAFETY: `out` holds the 96 output bytes
+    out
+}
+
+/// Validate a received G1 point (`B_`, `C_`, `C`) from its 48 compressed bytes:
+/// canonical decode → on-curve → not identity → in the prime-order subgroup.
+/// Returns the affine point, or `None` on any rejection.
+fn validate_g1(comp: &[u8; 48]) -> Option<blst_p1_affine> {
+    let mut aff = blst_p1_affine::default();
+    let err = unsafe { blst_p1_uncompress(&mut aff, comp.as_ptr()) };
+    if err != BLST_ERROR::BLST_SUCCESS {
+        return None; // non-canonical encoding or off-curve
+    }
+    if unsafe { blst_p1_affine_is_inf(&aff) } {
+        return None; // identity (point at infinity)
+    }
+    if !unsafe { blst_p1_affine_in_g1(&aff) } {
+        return None; // not in the prime-order subgroup
+    }
+    Some(aff)
+}
+
+/// Validate a received G2 point (the mint key `K`, 96 compressed bytes); `None` if rejected.
+fn validate_g2(comp: &[u8; 96]) -> Option<blst_p2_affine> {
+    let mut aff = blst_p2_affine::default();
+    let err = unsafe { blst_p2_uncompress(&mut aff, comp.as_ptr()) };
+    if err != BLST_ERROR::BLST_SUCCESS {
+        return None; // non-canonical encoding or off-curve
+    }
+    if unsafe { blst_p2_affine_is_inf(&aff) } {
+        return None; // identity (point at infinity)
+    }
+    if !unsafe { blst_p2_affine_in_g2(&aff) } {
+        return None; // not in the prime-order subgroup
+    }
+    Some(aff)
+}
+
+/// Validate a received compressed G1 point and lift it to projective form; `None` if rejected.
+fn g1_from_comp(comp: &[u8; 48]) -> Option<blst_p1> {
+    let aff = validate_g1(comp)?; // any validation failure propagates as `None`
+    let mut p = blst_p1::default();
+    unsafe { blst_p1_from_affine(&mut p, &aff) };
+    Some(p)
+}
+
+// ---------------------------------------------------------------------------
+// Cashu-on-BLS v3 primitives (NUT-00 §Pairing-based BDHKE). Each timed op
+// takes the wire (compressed) bytes the corresponding actor would receive,
+// validates them, and returns the bytes it would send on — so the spec's
+// mandatory point validation is inside the measured cost.
 // ---------------------------------------------------------------------------
 
-/// `step1_alice`: `B' = r · Y` where `Y = hash_to_g1(secret)`.
-fn step1_alice(secret: &[u8], r: &blst_scalar) -> blst_p1 {
+/// `step1_alice` (wallet): `B_ = r · Y`, `Y = hash_to_g1(secret)`. Returns the
+/// compressed `B_` the wallet sends to the mint. (No received point to check.)
+fn step1_alice(secret: &[u8], r: &blst_scalar) -> [u8; 48] {
     let y = hash_to_g1(secret);
-    p1_mul(&y, r)
+    compress_g1(&p1_mul(&y, r))
 }
 
-/// `step2_bob`: `C' = a · B'` (mint side, just one G1 mul).
-fn step2_bob(b_blinded: &blst_p1, sk: &blst_scalar) -> blst_p1 {
-    p1_mul(b_blinded, sk)
+/// `step2_bob` (mint): validate the received `B_`, then `C_ = a · B_`. Returns
+/// compressed `C_`. Panics (via `expect`) if `B_` fails point validation.
+fn step2_bob(b_comp: &[u8; 48], a: &blst_scalar) -> [u8; 48] {
+    let b_blinded = g1_from_comp(b_comp).expect("invalid B_ (NUT-00 point validation)");
+    compress_g1(&p1_mul(&b_blinded, a))
 }
 
-/// `step3_alice`: `C = r⁻¹ · C'`.
-fn step3_alice(c_blinded: &blst_p1, r: &blst_scalar) -> blst_p1 {
+/// `step3_alice` (wallet): validate the received `C_`, then `C = r⁻¹ · C_`.
+/// Returns compressed `C`. Panics (via `expect`) if `C_` fails point validation.
+fn step3_alice(c_blinded_comp: &[u8; 48], r: &blst_scalar) -> [u8; 48] {
+    let c_blinded = g1_from_comp(c_blinded_comp).expect("invalid C_ (NUT-00 point validation)");
     let r_inv = invert_scalar(r);
-    p1_mul(c_blinded, &r_inv)
+    compress_g1(&p1_mul(&c_blinded, &r_inv))
 }
 
-/// Mint-side `keyed_verification`: `C == a · Y`.
-fn keyed_verification(sk: &blst_scalar, c: &blst_p1, secret: &[u8]) -> bool {
-    let y = hash_to_g1(secret);
-    let expected = p1_mul(&y, sk);
-    // Equality: compress and compare bytes (constant-time enough for a bench).
-    let mut a = [0u8; 48];
-    let mut b = [0u8; 48];
-    unsafe {
-        blst_p1_compress(a.as_mut_ptr(), &expected);
-        blst_p1_compress(b.as_mut_ptr(), c);
+/// Mint-side `keyed_verification`: validate the received `C`, then check
+/// `C == a · Y` on compressed bytes. Returns `false` if `C` is rejected.
+fn keyed_verification(a: &blst_scalar, c_comp: &[u8; 48], secret: &[u8]) -> bool {
+    if validate_g1(c_comp).is_none() {
+        return false;
     }
-    a == b
+    let y = hash_to_g1(secret);
+    let expected = compress_g1(&p1_mul(&y, a));
+    &expected == c_comp
 }
 
-/// `pairing_verification`: `e(C, G2) == e(Y, K2)`. Two miller loops, one
-/// final-verify. This is the wallet's per-proof verification path.
-fn pairing_verification(k2: &blst_p2, c: &blst_p1, secret: &[u8]) -> bool {
+/// `pairing_verification` (wallet): validate the received `K` and `C` (`false` if either is
+/// rejected), then check `e(C, G2) == e(Y, K)`. Two miller loops, one final-verify.
+fn pairing_verification(k_comp: &[u8; 96], c_comp: &[u8; 48], secret: &[u8]) -> bool {
+    let c_aff = match validate_g1(c_comp) {
+        Some(a) => a,
+        None => return false,
+    };
+    let k_aff = match validate_g2(k_comp) {
+        Some(a) => a,
+        None => return false,
+    };
     let y = hash_to_g1(secret);
-    let g2 = unsafe { *blst_p2_generator() };
-
-    let c_aff = p1_to_affine(c);
     let y_aff = p1_to_affine(&y);
-    let g2_aff = p2_to_affine(&g2);
-    let k2_aff = p2_to_affine(k2);
+    let g2_aff = p2_to_affine(unsafe { &*blst_p2_generator() });
 
     let mut ml1 = blst_fp12::default();
     let mut ml2 = blst_fp12::default();
     unsafe {
         blst_miller_loop(&mut ml1, &g2_aff, &c_aff); // e(C, G2)
-        blst_miller_loop(&mut ml2, &k2_aff, &y_aff); // e(Y, K2)
+        blst_miller_loop(&mut ml2, &k_aff, &y_aff); // e(Y, K)
         blst_fp12_finalverify(&ml1, &ml2)
     }
 }
 
-/// `batch_pairing_verification`: random-linear-combination batch verify.
-/// Mirrors the fork's algorithm — random scalar rs[i], grouped by unique K2.
-fn batch_pairing_verification(
-    k2s: &[blst_p2],
-    cs: &[blst_p1],
-    secrets: &[&[u8]],
-    rs: &[blst_scalar],
-) -> bool {
+/// Derive the per-proof batch weights `r_i ∈ Fr*` deterministically, per
+/// NUT-00 §Batch Verification:
+///
+///   transcript = BLS_BATCH_DST || for each i: C_i(48) || K_i(96)
+///                                 || u32_BE(len secret_i) || secret_i
+///   challenge  = SHA256(transcript)
+///   r_i        = first SHA256(challenge || u32_BE(i) || u32_BE(ctr)), ctr=0,1,…
+///                whose big-endian value is in (0, BLS_FR_ORDER)  [rejection sampling]
+///
+/// The compressed `C_i`/`K_i` passed in are exactly the transcript bytes.
+fn derive_batch_weights(cs: &[[u8; 48]], ks: &[[u8; 96]], secrets: &[&[u8]]) -> Vec<blst_scalar> {
+    let n = cs.len();
+    let secrets_len: usize = secrets.iter().map(|s| s.len()).sum(); // Σ len(secret_i)
+    let mut transcript = Vec::with_capacity(BLS_BATCH_DST.len() + n * (48 + 96 + 4) + secrets_len);
+    transcript.extend_from_slice(BLS_BATCH_DST);
+    for i in 0..n { // `ks` and `secrets` must be at least as long as `cs` (indexing panics)
+        transcript.extend_from_slice(&cs[i]);
+        transcript.extend_from_slice(&ks[i]);
+        transcript.extend_from_slice(&(secrets[i].len() as u32).to_be_bytes());
+        transcript.extend_from_slice(secrets[i]);
+    }
+    let mut challenge = [0u8; 32];
+    unsafe { blst_sha256(challenge.as_mut_ptr(), transcript.as_ptr(), transcript.len()) };
+
+    let mut weights = Vec::with_capacity(n); // exactly one weight per proof
+    for i in 0..n {
+        let mut ctr: u32 = 0;
+        loop {
+            let mut buf = [0u8; 40]; // 32 (challenge) + 4 (i) + 4 (ctr)
+            buf[0..32].copy_from_slice(&challenge);
+            buf[32..36].copy_from_slice(&(i as u32).to_be_bytes());
+            buf[36..40].copy_from_slice(&ctr.to_be_bytes());
+
+            let mut h = [0u8; 32];
+            unsafe { blst_sha256(h.as_mut_ptr(), buf.as_ptr(), buf.len()) };
+
+            // NUT-00 rejection sampling: accept iff 0 < OS2IP(h) < BLS_FR_ORDER.
+            let is_zero = h.iter().all(|&b| b == 0);
+            if !is_zero && be_lt(&h, &BLS_FR_ORDER_BE) {
+                // h is already canonical (< order), so the scalar value is h.
+                let mut s = blst_scalar::default();
+                unsafe { blst_scalar_from_be_bytes(&mut s, h.as_ptr(), 32) };
+                weights.push(s);
+                break;
+            }
+            ctr += 1; // sample rejected: retry with the next counter
+        }
+    }
+    weights
+}
+
+/// `batch_pairing_verification` (NUT-00 §Batch Verification): validate every
+/// received `C_i`/`K_i`, derive the Fiat-Shamir weights, then check the single
+/// multi-pairing
+///
+///   e( Σ r_i·C_i , G2 ) == Π_k e( Σ_{i:K_i=K_k} r_i·Y_i , K_k )
+///
+/// grouped by distinct mint key; one final-verify. Empty batch → `true`; rejected point → `false`.
+fn batch_pairing_verification(ks: &[[u8; 96]], cs: &[[u8; 48]], secrets: &[&[u8]]) -> bool {
     let n = cs.len();
     if n == 0 {
         return true;
     }
-    debug_assert_eq!(k2s.len(), n);
+    debug_assert_eq!(ks.len(), n);
     debug_assert_eq!(secrets.len(), n);
-    debug_assert_eq!(rs.len(), n);
 
-    // sum_C = Σ rs[i] · Cs[i]
-    let mut sum_c = p1_mul(&cs[0], &rs[0]);
+    let weights = derive_batch_weights(cs, ks, secrets);
+
+    // sum_C = Σ r_i · C_i  (each C_i validated as it is decompressed).
+    let mut sum_c = match g1_from_comp(&cs[0]) {
+        Some(c) => p1_mul(&c, &weights[0]),
+        None => return false,
+    };
     for i in 1..n {
-        let term = p1_mul(&cs[i], &rs[i]);
-        sum_c = p1_add(&sum_c, &term);
+        match g1_from_comp(&cs[i]) {
+            Some(c) => sum_c = p1_add(&sum_c, &p1_mul(&c, &weights[i])),
+            None => return false,
+        }
     }
 
-    // Group rs[i]·Y(secrets[i]) by unique K2 key. We compare K2s by
-    // compressed bytes (96 bytes for G2). We expect at most a handful of
-    // unique keys in practice (one per binary denomination).
-    let mut groups: Vec<([u8; 192], blst_p2_affine, blst_p1)> = Vec::with_capacity(n);
+    // Group Σ r_i·Y_i by distinct mint key K (keyed on its 96-byte compressed
+    // form — at most one per binary denomination in practice). Each distinct K
+    // is validated once, when first seen: keyset keys are validated once at
+    // load and cached, so re-checking the same key per proof isn't realistic.
+    let mut groups: Vec<([u8; 96], blst_p2_affine, blst_p1)> = Vec::with_capacity(n);
     for i in 0..n {
-        let y = hash_to_g1(secrets[i]);
-        let r_y = p1_mul(&y, &rs[i]);
-
-        // Serialize K2 to a key. (Affine encoding is 96 bytes; we use 192
-        // here just to keep the type aligned to a fixed size.)
-        let k2_aff_i = p2_to_affine(&k2s[i]);
-        let mut key_bytes = [0u8; 192];
-        let aff_ptr = &k2_aff_i as *const blst_p2_affine as *const u8;
-        unsafe {
-            core::ptr::copy_nonoverlapping(
-                aff_ptr,
-                key_bytes.as_mut_ptr(),
-                size_of::<blst_p2_affine>(),
-            );
-        }
-
-        if let Some(existing) = groups.iter_mut().find(|(k, _, _)| *k == key_bytes) {
+        let r_y = p1_mul(&hash_to_g1(secrets[i]), &weights[i]);
+        if let Some(existing) = groups.iter_mut().find(|(k, _, _)| *k == ks[i]) {
             existing.2 = p1_add(&existing.2, &r_y);
         } else {
-            groups.push((key_bytes, k2_aff_i, r_y));
+            let k_aff = match validate_g2(&ks[i]) {
+                Some(a) => a,
+                None => return false,
+            };
+            groups.push((ks[i], k_aff, r_y));
         }
     }
 
-    // Left side: e(sum_C, G2) — miller_loop only
+    // Left: e(Σ r_i·C_i, G2) — miller loop only.
     let g2_aff = p2_to_affine(unsafe { &*blst_p2_generator() });
     let sum_c_aff = p1_to_affine(&sum_c);
     let mut left_ml = blst_fp12::default();
     unsafe { blst_miller_loop(&mut left_ml, &g2_aff, &sum_c_aff) };
 
-    // Right side: ∏ over unique keys: miller_loop(K2_j, sum_rY_j)
+    // Right: ∏ over distinct keys e(Σ r_i·Y_i, K_k) — miller loops, multiplied.
     let mut right_ml = unsafe { *blst_fp12_one() };
-    for (_k, k2_aff, sum_ry) in &groups {
+    for (_k, k_aff, sum_ry) in &groups {
         let sum_ry_aff = p1_to_affine(sum_ry);
         let mut ml = blst_fp12::default();
-        unsafe { blst_miller_loop(&mut ml, k2_aff, &sum_ry_aff) };
+        unsafe { blst_miller_loop(&mut ml, k_aff, &sum_ry_aff) };
         let mut combined = blst_fp12::default();
         unsafe { blst_fp12_mul(&mut combined, &right_ml, &ml) };
         right_ml = combined;
     }
 
-    // final_exp(left) == final_exp(right) — fp12_finalverify is the optimised
-    // form (effectively one final_exp instead of two).
+    // One final_exp for the whole equation (finalverify is the optimised form).
     unsafe { blst_fp12_finalverify(&left_ml, &right_ml) }
 }
 
@@ -365,62 +537,137 @@ fn batch_pairing_verification(
 // Setup helpers for the batch bench.
 // ---------------------------------------------------------------------------
 
-/// Make N proofs all under the same mint key.
+/// Make N proofs all under the same mint key `k_comp` (compressed `K`).
 fn build_proofs_same_key(
     n: usize,
-    k2: &blst_p2,
-    sk: &blst_scalar,
-) -> (Vec<blst_p1>, Vec<alloc::string::String>, Vec<blst_p2>, Vec<blst_scalar>) {
+    k_comp: &[u8; 96],
+    a: &blst_scalar,
+) -> (Vec<[u8; 48]>, Vec<alloc::string::String>, Vec<[u8; 96]>) {
     use alloc::format;
     let mut cs = Vec::with_capacity(n);
     let mut secrets = Vec::with_capacity(n);
-    let mut k2s = Vec::with_capacity(n);
-    let mut rs = Vec::with_capacity(n);
+    let mut ks = Vec::with_capacity(n);
     for i in 0..n {
         let secret = format!("proof-{i}");
         let r = det_scalar(0x1000 + i as u64);
         let b_ = step1_alice(secret.as_bytes(), &r);
-        let c_ = step2_bob(&b_, sk);
+        let c_ = step2_bob(&b_, a);
         let c = step3_alice(&c_, &r);
 
         cs.push(c);
         secrets.push(secret);
-        k2s.push(*k2);
-        // random scalar for batch verification (the rs the verifier picks,
-        // not the wallet's blinding factor)
-        rs.push(det_scalar(0x2000 + i as u64));
+        ks.push(*k_comp);
     }
-    (cs, secrets, k2s, rs)
+    (cs, secrets, ks)
 }
 
-/// Make N proofs each under a *different* mint key (so the RHS has N unique
+/// Make N proofs each under a *different* mint key (so the RHS has N distinct
 /// miller loops). Mirrors the worst case for batch verification.
 fn build_proofs_distinct_keys(
     n: usize,
-) -> (Vec<blst_p1>, Vec<alloc::string::String>, Vec<blst_p2>, Vec<blst_scalar>) {
+) -> (Vec<[u8; 48]>, Vec<alloc::string::String>, Vec<[u8; 96]>) {
     use alloc::format;
     let mut cs = Vec::with_capacity(n);
     let mut secrets = Vec::with_capacity(n);
-    let mut k2s = Vec::with_capacity(n);
-    let mut rs = Vec::with_capacity(n);
+    let mut ks = Vec::with_capacity(n);
     for i in 0..n {
-        let sk = det_scalar(0x3000 + i as u64);
-        let mut k2 = blst_p2::default();
-        let sk_be = scalar_to_le(&sk);
-        unsafe { blst_p2_mult(&mut k2, blst_p2_generator(), sk_be.as_ptr(), 256) };
+        let a = det_scalar(0x3000 + i as u64);
+        let mut k = blst_p2::default();
+        let a_le = scalar_to_le(&a);
+        unsafe { blst_p2_mult(&mut k, blst_p2_generator(), a_le.as_ptr(), 256) };
+        let k_comp = compress_g2(&k);
 
         let secret = format!("proof-d-{i}");
         let r = det_scalar(0x4000 + i as u64);
         let b_ = step1_alice(secret.as_bytes(), &r);
-        let c_ = step2_bob(&b_, &sk);
+        let c_ = step2_bob(&b_, &a);
         let c = step3_alice(&c_, &r);
 
         cs.push(c);
         secrets.push(secret);
-        k2s.push(k2);
-        rs.push(det_scalar(0x5000 + i as u64));
+        ks.push(k_comp);
     }
-    (cs, secrets, k2s, rs)
+    (cs, secrets, ks)
+}
+
+// ---------------------------------------------------------------------------
+// Spec-conformance gate — NUT-00 (nuts#371) test vectors (tests/00-tests.md).
+// Checks this implementation against the spec byte-for-byte. Prints `match=…`
+// per field without aborting; the run is only meaningful if every line is true.
+// ---------------------------------------------------------------------------
+fn spec_conformance() {
+    println!();
+    println!("--- spec-conformance: NUT-00 test vectors (nuts#371) ---");
+
+    // ---- single-proof round-trip (Test 1): secret "test_message", r=3, a=2.
+    let secret: &[u8] = b"test_message";
+    let r = small_scalar(3); // wallet blinding factor `r`
+    let a = small_scalar(2); // mint secret scalar `a`
+
+    let y = compress_g1(&hash_to_g1(secret)); // Y = hash_to_g1(secret)
+    let mut k = blst_p2::default();
+    let a_le = scalar_to_le(&a);
+    unsafe { blst_p2_mult(&mut k, blst_p2_generator(), a_le.as_ptr(), 256) }; // K = a·G2
+    let k_comp = compress_g2(&k);
+
+    let b_ = step1_alice(secret, &r);
+    let c_ = step2_bob(&b_, &a);
+    let c = step3_alice(&c_, &r);
+
+    const Y_EXP: &str = "860d58e5aeda1376185436ed96412313424cc079e056d1dab595e6db4c2c9685fec7da052c8db68d88985b75a42388ad";
+    const K_EXP: &str = "aa4edef9c1ed7f729f520e47730a124fd70662a904ba1074728114d1031e1572c6c886f6b57ec72a6178288c47c335771638533957d540a9d2370f17cc7ed5863bc0b995b8825e0ee1ea1e1e4d00dbae81f14b0bf3611b78c952aacab827a053";
+    const B_EXP: &str = "8e88c5f6a93f653784a66b033a00e52128499e18b095c2a56f080d1c2a937ffc9ef4600804a48d087bbd1f662f6b068f";
+    const C_BLINDED_EXP: &str = "8d52d7a6cbe5e99858d5c15c092d11a0c387c78917471211082a6e5afc2a79680dfa188fafe5d4a51c5398ce160e7a16";
+    const C_EXP: &str = "b7a4881059133fd91a8753600d9a5e524c65d6224f6fe2d5aef9e59f1507fdad90b3b4d48ee46da5c8dfaa0b88e28b69";
+
+    println!("  Y  match={}", to_hex(&y) == Y_EXP);
+    println!("  K  match={}", to_hex(&k_comp) == K_EXP);
+    println!("  B_ match={}", to_hex(&b_) == B_EXP);
+    println!("  C_ match={}", to_hex(&c_) == C_BLINDED_EXP);
+    println!("  C  match={}", to_hex(&c) == C_EXP);
+    println!("  pairing_verification={}", pairing_verification(&k_comp, &c, secret));
+
+    // ---- batch verification: two proofs under K = 2·G2.
+    let secret1: &[u8] = b"batch_proof_1";
+    let secret2: &[u8] = b"batch_proof_2";
+    let r1 = small_scalar(5);
+    let r2 = small_scalar(7);
+    let c1 = step3_alice(&step2_bob(&step1_alice(secret1, &r1), &a), &r1);
+    let c2 = step3_alice(&step2_bob(&step1_alice(secret2, &r2), &a), &r2);
+
+    const C1_EXP: &str = "acebf797506a7031cef3189904715cb22792528f1ea0e6ab25341401d245539438ed97122f00e38ee6185cc20b09ba11";
+    const C2_EXP: &str = "9776497ad47a00f8a56233fb88f939b0572cf174a4c6d2446c0b1060434e305fae6845fd1f68b70376ba53ffe67f0414";
+    println!("  C_1 match={}", to_hex(&c1) == C1_EXP);
+    println!("  C_2 match={}", to_hex(&c2) == C2_EXP);
+
+    let cs = [c1, c2];
+    let ks = [k_comp, k_comp]; // both proofs are under the same mint key
+    let secrets: [&[u8]; 2] = [secret1, secret2];
+
+    // Recompute the challenge the same way as derive_batch_weights, to check
+    // the transcript layout directly.
+    let mut transcript: Vec<u8> = Vec::new();
+    transcript.extend_from_slice(BLS_BATCH_DST);
+    for i in 0..2 {
+        transcript.extend_from_slice(&cs[i]);
+        transcript.extend_from_slice(&ks[i]);
+        transcript.extend_from_slice(&(secrets[i].len() as u32).to_be_bytes());
+        transcript.extend_from_slice(secrets[i]);
+    }
+    let mut challenge = [0u8; 32];
+    unsafe { blst_sha256(challenge.as_mut_ptr(), transcript.as_ptr(), transcript.len()) };
+
+    let weights = derive_batch_weights(&cs, &ks, &secrets);
+    let w1 = scalar_to_be(&weights[0]);
+    let w2 = scalar_to_be(&weights[1]);
+
+    const CHALLENGE_EXP: &str = "539b5df396e82adab0760459590d38122d2552bc74f6bd860e915ff3b95e550a";
+    const W1_EXP: &str = "0e7ff8be2ccb756d4ef390991bdd77eb65e8db624a2729fa1657c3cf8d7d4b55";
+    const W2_EXP: &str = "6d026a181a6215b233e73b121d01908a1a1eb6911955bea5130bbf2f2966554d";
+    println!("  challenge match={}", to_hex(&challenge) == CHALLENGE_EXP);
+    println!("  weight_1  match={}", to_hex(&w1) == W1_EXP);
+    println!("  weight_2  match={}", to_hex(&w2) == W2_EXP);
+    println!("  batch verify={}", batch_pairing_verification(&ks, &cs, &secrets));
+}
 
 // ---------------------------------------------------------------------------
@@ -612,30 +859,34 @@ fn main() -> ! {
 
     println!();
     println!("===============================================================");
-    println!("nutshell-fork-shaped Cashu-on-BLS, ESP32-C3 @ 160 MHz, blst");
+    println!("NUT-00 BLS12-381 (v3) Cashu-on-BLS, ESP32-C3 @ 160 MHz, blst");
+    println!("(cashubtc/nuts#371)");
     println!("===============================================================");
 
     // --- shared setup ---
-    println!("setup: mint key");
-    let sk = det_scalar(0xCA5);
-    let mut k2 = blst_p2::default();
-    let sk_be = scalar_to_le(&sk);
-    unsafe { blst_p2_mult(&mut k2, blst_p2_generator(), sk_be.as_ptr(), 256) };
+    println!("setup: mint key K = a·G2");
+    let a = det_scalar(0xCA5); // mint secret scalar (spec: `a`)
+    let mut k2 = blst_p2::default(); // K (named k2 to signal the G2 group)
+    let a_le = scalar_to_le(&a);
+    unsafe { blst_p2_mult(&mut k2, blst_p2_generator(), a_le.as_ptr(), 256) };
+    let k_comp = compress_g2(&k2);
 
     println!("setup: one issued proof");
     let secret_one: &[u8] = b"proof-single-test";
     let r_one = det_scalar(0xBEEF);
-    let b_one = step1_alice(secret_one, &r_one);
-    let c_blinded_one = step2_bob(&b_one, &sk);
+    let b_comp_one = step1_alice(secret_one, &r_one);
+    let c_blinded_one = step2_bob(&b_comp_one, &a);
     let c_one = step3_alice(&c_blinded_one, &r_one);
+    let arb_g1 = g1_from_comp(&b_comp_one).expect("setup B_ must validate");
 
     println!("setup: done");
 
-    // Correctness gate: with MPI-backed mul_mont these must still hold. If the
-    // peripheral path were wrong, blst's arithmetic would be garbage and these
-    // would be false (or the issued proof wouldn't round-trip).
-    let kv_ok = keyed_verification(&sk, &c_one, secret_one);
-    let pv_ok = pairing_verification(&k2, &c_one, secret_one);
+    // Spec-conformance gate (NUT-00 test vectors) + a quick round-trip on the
+    // issued proof. With MPI-backed mul_mont these must hold; a wrong
+    // peripheral path would make blst's arithmetic garbage and fail them.
+    spec_conformance();
+    let kv_ok = keyed_verification(&a, &c_one, secret_one);
+    let pv_ok = pairing_verification(&k_comp, &c_one, secret_one);
     println!("correctness: keyed_verification={kv_ok}  pairing_verification={pv_ok}");
 
     println!();
@@ -647,34 +898,43 @@ fn main() -> ! {
     time_it(format_args!("hash_to_g1 (RFC 9380 SSWU)"), 16, || {
         let _ = black_box(hash_to_g1(black_box(secret_one)));
     });
-    // One G1 scalar mul of an arbitrary point (= step2_bob, isolated here so
-    // step1_alice ≈ hash_to_g1 + g1_scalar_mul can be checked directly).
+    // One G1 scalar mul of an arbitrary point (the bare multiply underneath the
+    // BDHKE steps, isolated so step1_alice ≈ hash_to_g1 + g1_scalar_mul holds).
     time_it(format_args!("g1_scalar_mul (arbitrary point)"), 16, || {
-        let _ = black_box(p1_mul(black_box(&b_one), black_box(&sk)));
+        let _ = black_box(p1_mul(black_box(&arb_g1), black_box(&a)));
+    });
+    // Mandatory NUT-00 point validation: uncompress + is_inf + in_g{1,2}. The
+    // per-op surcharge every received C/B_/C_ (G1) and K (G2) now pays.
+    time_it(format_args!("point_validate_g1 (uncompress+in_g1)"), 16, || {
+        let _ = black_box(validate_g1(black_box(&c_one)));
+    });
+    time_it(format_args!("point_validate_g2 (uncompress+in_g2)"), 16, || {
+        let _ = black_box(validate_g2(black_box(&k_comp)));
     });
 
     println!();
-    println!("--- per-proof BDHKE ops (multiplicative blinding) ---");
+    println!("--- per-proof BDHKE ops (multiplicative blinding, v3) ---");
+    println!("    (mint/wallet ops include the NUT-00 point validation they owe)");
 
     time_it(format_args!("step1_alice (blind)"), 8, || {
         let _ = black_box(step1_alice(black_box(secret_one), black_box(&r_one)));
     });
-    time_it(format_args!("step2_bob   (mint sign)"), 8, || {
-        let _ = black_box(step2_bob(black_box(&b_one), black_box(&sk)));
+    time_it(format_args!("step2_bob   (validate B_ + sign)"), 8, || {
+        let _ = black_box(step2_bob(black_box(&b_comp_one), black_box(&a)));
     });
-    time_it(format_args!("step3_alice (unblind = inv + mul)"), 8, || {
+    time_it(format_args!("step3_alice (validate C_ + unblind)"), 8, || {
         let _ = black_box(step3_alice(black_box(&c_blinded_one), black_box(&r_one)));
     });
     time_it(format_args!("keyed_verification (mint side)"), 8, || {
         let _ = black_box(keyed_verification(
-            black_box(&sk),
+            black_box(&a),
             black_box(&c_one),
             black_box(secret_one),
         ));
     });
     time_it(format_args!("bdhke_full_round"), 5, || {
         let b_ = step1_alice(black_box(secret_one), black_box(&r_one));
-        let c_ = step2_bob(&b_, black_box(&sk));
+        let c_ = step2_bob(&b_, black_box(&a));
         let _ = black_box(step3_alice(&c_, black_box(&r_one)));
     });
 
@@ -683,40 +943,38 @@ fn main() -> ! {
 
     time_it(format_args!("pairing_verification (1 proof)"), 5, || {
         let _ = black_box(pairing_verification(
-            black_box(&k2),
+            black_box(&k_comp),
             black_box(&c_one),
             black_box(secret_one),
         ));
     });
 
     println!();
-    println!("--- batch_pairing_verification, all proofs same K2 ---");
+    println!("--- batch_pairing_verification, all proofs same K ---");
 
     for &n in &[1usize, 2, 4, 8, 16, 32] {
-        let (cs, secrets, k2s, rs) = build_proofs_same_key(n, &k2, &sk);
+        let (cs, secrets, ks) = build_proofs_same_key(n, &k_comp, &a);
         let secret_refs: Vec<&[u8]> = secrets.iter().map(|s| s.as_bytes()).collect();
         time_it(format_args!("batch_verify_same_key  N={n:<2}"), 3, || {
             let _ = black_box(batch_pairing_verification(
-                black_box(&k2s),
+                black_box(&ks),
                 black_box(&cs),
                 black_box(&secret_refs),
-                black_box(&rs),
             ));
         });
     }
 
     println!();
-    println!("--- batch_pairing_verification, every proof under a distinct K2 ---");
+    println!("--- batch_pairing_verification, every proof under a distinct K ---");
 
     for &n in &[1usize, 2, 4, 8, 16] {
-        let (cs, secrets, k2s, rs) = build_proofs_distinct_keys(n);
+        let (cs, secrets, ks) = build_proofs_distinct_keys(n);
         let secret_refs: Vec<&[u8]> = secrets.iter().map(|s| s.as_bytes()).collect();
         time_it(format_args!("batch_verify_distinct  N={n:<2}"), 3, || {
             let _ = black_box(batch_pairing_verification(
-                black_box(&k2s),
+                black_box(&ks),
                 black_box(&cs),
                 black_box(&secret_refs),
-                black_box(&rs),
             ));
         });
     }
diff --git a/legacy/README.md b/legacy/README.md
index 84b2fcf..d30c661 100644
--- a/legacy/README.md
+++ b/legacy/README.md
@@ -1,6 +1,6 @@
 # legacy/ — superseded
 
-These crates predate [cashubtc/nutshell PR #999][pr] as a reference. They mock
+These crates predate the [NUT-00 BLS12-381 (v3) spec][pr]. They mock
 an **additive**-blinding BDHKE on BLS12-381:
 
 ```
@@ -13,9 +13,9 @@ mint:    verify a·Y == C
 
 …plus a separate BLS-signature-style verify path with a G2 key (`pk = a·G2`).
 
-That's **not** what PR #999 ended up doing. PR #999 uses **multiplicative**
-blinding (`B' = r·Y`, `C = r⁻¹·C' = a·Y`, verify `e(C, G2) == e(Y, K2)` with
-`K2 = a·G2` on G2 only — 96-byte keyset keys). The additive scheme here would
+That's **not** what NUT-00 (v3) does. The spec uses **multiplicative**
+blinding (`B' = r·Y`, `C = r⁻¹·C' = a·Y`, verify `e(C, G2) == e(Y, K)` with
+`K = a·G2` on G2 only — 96-byte keyset keys). The additive scheme here would
 need the mint key published on **both** G1 (for the `− r·K` unblind) and G2
 (for the pairing check) = **144-byte** keyset keys. It also hashes with a
 placeholder DST and uses the pure-Rust `bls12_381` (zkcrypto) backend rather
@@ -24,9 +24,9 @@ than `blst`.
 **Kept only for the historical pure-Rust-vs-`blst` per-primitive comparison**
 (the ~9-40×-per-op gap; see the appendix in `../RESULTS.md`). For
 protocol-accurate numbers use `../esp32c3-bench-blst/` — that's the bench that
-ports PR #999 faithfully.
+implements NUT-00 (v3) faithfully.
 
-[pr]: https://github.com/cashubtc/nutshell/pull/999
+[pr]: https://github.com/cashubtc/nuts/pull/371
 
 - `crypto/` — the additive-blinding BDHKE mock (`no_std`, zkcrypto backend),
   shared by the two benches below.
diff --git a/legacy/crypto/src/lib.rs b/legacy/crypto/src/lib.rs
index bf28003..8b83306 100644
--- a/legacy/crypto/src/lib.rs
+++ b/legacy/crypto/src/lib.rs
@@ -1,16 +1,16 @@
 //! ╔══════════════════════════════════════════════════════════════════════╗
 //! ║ SUPERSEDED — do not use for protocol-accurate numbers.               ║
 //! ╠══════════════════════════════════════════════════════════════════════╣
-//! ║ This crate predates cashubtc/nutshell PR #999 as a reference. It     ║
-//! ║ mocks an *additive*-blinding BDHKE (`B' = Y + r·G`, `C = C' − r·K`), ║
-//! ║ which is NOT what PR #999 does — the PR uses *multiplicative*        ║
-//! ║ blinding (`B' = r·Y`, `C = r⁻¹·C' = a·Y`). Additive blinding would  ║
-//! ║ force the mint key onto both G1 (for the `− r·K` unblind) and G2     ║
-//! ║ (for the pairing check) = 144-byte keyset keys; multiplicative needs ║
-//! ║ only `K2 = a·G2` on G2 (96 bytes).                                  ║
+//! ║ This crate predates the NUT-00 BLS12-381 (v3) spec. It mocks an      ║
+//! ║ *additive*-blinding BDHKE (`B' = Y + r·G`, `C = C' − r·K`), which is ║
+//! ║ NOT what NUT-00 (v3) does — the spec uses *multiplicative* blinding  ║
+//! ║ (`B' = r·Y`, `C = r⁻¹·C' = a·Y`). Additive blinding would force the  ║
+//! ║ mint key onto both G1 (for the `− r·K` unblind) and G2 (for the      ║
+//! ║ pairing check) = 144-byte keyset keys; multiplicative needs only     ║
+//! ║ `K = a·G2` on G2 (96 bytes).                                         ║
 //! ║                                                                      ║
 //! ║ Kept only for the historical pure-Rust (zkcrypto `bls12_381`) vs     ║
-//! ║ `blst` comparison. The current, PR-#999-accurate bench is the        ║
+//! ║ `blst` comparison. The current, NUT-00-v3-accurate bench is the      ║
 //! ║ `esp32c3-bench-blst` crate; see RESULTS.md.                          ║
 //! ╚══════════════════════════════════════════════════════════════════════╝
 //!
diff --git a/legacy/esp32c3-bench/src/main.rs b/legacy/esp32c3-bench/src/main.rs
index b171854..28041b4 100644
--- a/legacy/esp32c3-bench/src/main.rs
+++ b/legacy/esp32c3-bench/src/main.rs
@@ -1,7 +1,7 @@
 //! SUPERSEDED — runs the additive-blinding mock from the `crypto` crate, which
-//! does NOT match cashubtc/nutshell PR #999 (the PR uses multiplicative
+//! does NOT match the NUT-00 BLS12-381 (v3) spec (the spec uses multiplicative
 //! blinding; see `crypto/src/lib.rs` for the full note). Kept only for the
-//! historical pure-Rust-vs-`blst` comparison. The current, PR-#999-accurate
+//! historical pure-Rust-vs-`blst` comparison. The current, NUT-00-v3-accurate
 //! bench is the `esp32c3-bench-blst` crate.
 //!
 //! BLS-on-Cashu primitive benchmark for ESP32-C3 (pure-Rust zkcrypto backend).
diff --git a/legacy/host-bench/benches/cashu_bls.rs b/legacy/host-bench/benches/cashu_bls.rs
index 2c99942..3c8fa80 100644
--- a/legacy/host-bench/benches/cashu_bls.rs
+++ b/legacy/host-bench/benches/cashu_bls.rs
@@ -1,7 +1,7 @@
 //! SUPERSEDED — host baseline for the *additive-blinding* mock (the `crypto`
-//! crate), which does NOT match cashubtc/nutshell PR #999. Kept only for the
-//! historical pure-Rust-vs-`blst` comparison. See `crypto/src/lib.rs` and
-//! RESULTS.md; the current, PR-#999-accurate bench is `esp32c3-bench-blst`.
+//! crate), which does NOT match the NUT-00 BLS12-381 (v3) spec. Kept only for
+//! the historical pure-Rust-vs-`blst` comparison. See `crypto/src/lib.rs` and
+//! RESULTS.md; the current, NUT-00-v3-accurate bench is `esp32c3-bench-blst`.
 //!
 //! Host baseline for the BLS-on-Cashu primitive bench.
 //!